diff --git a/Cargo.lock b/Cargo.lock
index a3f38cf7b0..2fb7d9dd44 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4380,6 +4380,7 @@ dependencies = [
  "dropshot",
  "expectorate",
  "futures",
+ "gateway-client",
  "headers",
  "hex",
  "http",
@@ -4393,6 +4394,7 @@ dependencies = [
  "newtype_derive",
  "nexus-db-model",
  "nexus-defaults",
+ "nexus-inventory",
  "nexus-test-utils",
  "nexus-types",
  "omicron-common 0.1.0",
@@ -4452,6 +4454,26 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "nexus-inventory"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "chrono",
+ "expectorate",
+ "futures",
+ "gateway-client",
+ "gateway-messages",
+ "gateway-test-utils",
+ "nexus-types",
+ "omicron-workspace-hack",
+ "regex",
+ "slog",
+ "strum",
+ "tokio",
+ "uuid",
+]
+
 [[package]]
 name = "nexus-test-interface"
 version = "0.1.0"
@@ -4479,6 +4501,8 @@ dependencies = [
  "dns-server",
  "dns-service-client 0.1.0",
  "dropshot",
+ "gateway-messages",
+ "gateway-test-utils",
  "headers",
  "http",
  "hyper",
@@ -4526,6 +4550,7 @@ dependencies = [
  "chrono",
  "dns-service-client 0.1.0",
  "futures",
+ "gateway-client",
  "newtype_derive",
  "omicron-common 0.1.0",
  "omicron-passwords 0.1.0",
@@ -5070,6 +5095,7 @@ dependencies = [
  "nexus-db-model",
  "nexus-db-queries",
  "nexus-defaults",
+ "nexus-inventory",
  "nexus-test-interface",
  "nexus-test-utils",
  "nexus-test-utils-macros",
diff --git a/Cargo.toml b/Cargo.toml
index edf10917d2..c436e3572d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ members = [
     "nexus/db-model",
     "nexus/db-queries",
     "nexus/defaults",
+    "nexus/inventory",
     "nexus/test-interface",
     "nexus/test-utils-macros",
     "nexus/test-utils",
@@ -108,6 +109,7 @@ default-members = [
     "nexus/db-model",
     "nexus/db-queries",
     "nexus/defaults",
+    "nexus/inventory",
     "nexus/types",
     "oximeter/collector",
     "oximeter/db",
@@ -234,6 +236,7 @@ nexus-client = { path = "clients/nexus-client" }
 nexus-db-model = { path = "nexus/db-model" }
 nexus-db-queries = { path = "nexus/db-queries" }
 nexus-defaults = { path = "nexus/defaults" }
+nexus-inventory = { path = "nexus/inventory" }
 omicron-certificates = { path = "certificates" }
 omicron-passwords = { path = "passwords" }
 omicron-workspace-hack = "0.1.0"
@@ -372,8 +375,8 @@ tufaceous = { path = "tufaceous" }
 tufaceous-lib = { path = "tufaceous-lib" }
 unicode-width = "0.1.11"
 update-engine = { path = "update-engine" }
-uuid = { version = "1.4.1", features = ["serde", "v4"] }
 usdt = "0.3"
+uuid = { version = "1.4.1", features = ["serde", "v4"] }
 walkdir = "2.4"
 wicket = { path = "wicket" }
 wicket-common = { path = "wicket-common" }
diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs
index da50356d2e..4e821e2676 100644
--- a/common/src/nexus_config.rs
+++ b/common/src/nexus_config.rs
@@ -335,6 +335,8 @@ pub struct BackgroundTaskConfig {
     pub dns_external: DnsTasksConfig,
     /// configuration for external endpoint list watcher
     pub external_endpoints: ExternalEndpointsConfig,
+    /// configuration for inventory tasks
+    pub inventory: InventoryConfig,
 }
 
 #[serde_as]
@@ -369,6 +371,30 @@ pub struct ExternalEndpointsConfig {
     // allow/disallow wildcard certs, don't serve expired certs, etc.)
 }
 
+#[serde_as]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
+pub struct InventoryConfig {
+    /// period (in seconds) for periodic activations of this background task
+    ///
+    /// Each activation fetches information about all hardware and software in
+    /// the system and inserts it into the database. This generates a moderate
+    /// amount of data.
+    #[serde_as(as = "DurationSeconds<u64>")]
+    pub period_secs: Duration,
+
+    /// maximum number of past collections to keep in the database
+    ///
+    /// This is a very coarse mechanism to keep the system from overwhelming
+    /// itself with inventory data.
+    pub nkeep: u32,
+
+    /// disable inventory collection altogether
+    ///
+    /// This is an emergency lever for support / operations. It should never be
+    /// necessary.
+    pub disable: bool,
+}
+
 /// Configuration for a nexus server
 #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
 pub struct PackageConfig {
@@ -467,19 +493,16 @@ impl std::fmt::Display for SchemeName {
 
 #[cfg(test)]
 mod test {
-    use super::Tunables;
     use super::{
-        default_techport_external_server_port, AuthnConfig, Config,
-        ConsoleConfig, LoadError, PackageConfig, SchemeName,
-        TimeseriesDbConfig, UpdatesConfig,
+        default_techport_external_server_port, AuthnConfig,
+        BackgroundTaskConfig, Config, ConfigDropshotWithTls, ConsoleConfig,
+        Database, DeploymentConfig, DnsTasksConfig, DpdConfig,
+        ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError,
+        LoadErrorKind, MgdConfig, PackageConfig, SchemeName,
+        TimeseriesDbConfig, Tunables, UpdatesConfig,
     };
     use crate::address::{Ipv6Subnet, RACK_PREFIX};
     use crate::api::internal::shared::SwitchLocation;
-    use crate::nexus_config::{
-        BackgroundTaskConfig, ConfigDropshotWithTls, Database,
-        DeploymentConfig, DnsTasksConfig, DpdConfig, ExternalEndpointsConfig,
-        InternalDns, LoadErrorKind, MgdConfig,
-    };
     use dropshot::ConfigDropshot;
     use dropshot::ConfigLogging;
     use dropshot::ConfigLoggingIfExists;
@@ -626,6 +649,9 @@ mod test {
             dns_external.period_secs_propagation = 7
             dns_external.max_concurrent_server_updates = 8
             external_endpoints.period_secs = 9
+            inventory.period_secs = 10
+            inventory.nkeep = 11
+            inventory.disable = false
             [default_region_allocation_strategy]
             type = "random"
             seed = 0
@@ -719,6 +745,11 @@ mod test {
                     },
                     external_endpoints: ExternalEndpointsConfig {
                         period_secs: Duration::from_secs(9),
+                    },
+                    inventory: InventoryConfig {
+                        period_secs: Duration::from_secs(10),
+                        nkeep: 11,
+                        disable: false,
                     }
                 },
                 default_region_allocation_strategy:
@@ -773,6 +804,9 @@ mod test {
             dns_external.period_secs_propagation = 7
             dns_external.max_concurrent_server_updates = 8
             external_endpoints.period_secs = 9
+            inventory.period_secs = 10
+            inventory.nkeep = 3
+            inventory.disable = false
             [default_region_allocation_strategy]
             type = "random"
             "##,
diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs
index e3a0debbbb..efcefdea43 100644
--- a/dev-tools/omdb/src/bin/omdb/db.rs
+++ b/dev-tools/omdb/src/bin/omdb/db.rs
@@ -12,7 +12,7 @@
 //! would be the only consumer -- and in that case it's okay to query the
 //! database directly.
 
-// NOTE: eminates from Tabled macros +// NOTE: emanates from Tabled macros #![allow(clippy::useless_vec)] use crate::Omdb; @@ -30,6 +30,7 @@ use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; use diesel::JoinOnDsl; use diesel::NullableExpressionMethods; +use gateway_client::types::SpType; use nexus_db_model::Dataset; use nexus_db_model::Disk; use nexus_db_model::DnsGroup; @@ -37,17 +38,22 @@ use nexus_db_model::DnsName; use nexus_db_model::DnsVersion; use nexus_db_model::DnsZone; use nexus_db_model::ExternalIp; +use nexus_db_model::HwBaseboardId; use nexus_db_model::Instance; +use nexus_db_model::InvCollection; use nexus_db_model::Project; use nexus_db_model::Region; use nexus_db_model::RegionSnapshot; use nexus_db_model::Sled; use nexus_db_model::Snapshot; use nexus_db_model::SnapshotState; +use nexus_db_model::SwCaboose; use nexus_db_model::Vmm; use nexus_db_model::Zpool; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::DataStoreConnection; +use nexus_db_queries::db::datastore::DataStoreInventoryTest; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::lookup::LookupPath; @@ -56,11 +62,14 @@ use nexus_db_queries::db::DataStore; use nexus_types::identity::Resource; use nexus_types::internal_api::params::DnsRecord; use nexus_types::internal_api::params::Srv; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Generation; use omicron_common::postgres_config::PostgresConfigWithUrl; use std::cmp::Ordering; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Display; @@ -132,6 +141,8 @@ enum DbCommands { Disks(DiskArgs), /// Print information about internal and external DNS Dns(DnsArgs), + /// Print information about collected hardware/software inventory + Inventory(InventoryArgs), /// Print information about control plane services Services(ServicesArgs), /// Print information about sleds @@ -212,6 +223,42 @@ impl CliDnsGroup { } } +#[derive(Debug, Args)] +struct InventoryArgs { + #[command(subcommand)] + command: InventoryCommands, +} + +#[derive(Debug, Subcommand)] +enum InventoryCommands { + /// list all baseboards ever found + BaseboardIds, + /// list all cabooses ever found + Cabooses, + /// list and show details from particular collections + Collections(CollectionsArgs), +} + +#[derive(Debug, Args)] +struct CollectionsArgs { + #[command(subcommand)] + command: CollectionsCommands, +} + +#[derive(Debug, Subcommand)] +enum CollectionsCommands { + /// list collections + List, + /// show what was found in a particular collection + Show(CollectionsShowArgs), +} + +#[derive(Debug, Args)] +struct CollectionsShowArgs { + /// id of the collection + id: Uuid, +} + #[derive(Debug, Args)] struct ServicesArgs { #[command(subcommand)] @@ -335,6 +382,10 @@ impl DbArgs { cmd_db_dns_names(&opctx, &datastore, self.fetch_limit, args) .await } + DbCommands::Inventory(inventory_args) => { + cmd_db_inventory(&datastore, self.fetch_limit, inventory_args) + .await + } DbCommands::Services(ServicesArgs { command: ServicesCommands::ListInstances, }) => { @@ -429,15 +480,23 @@ where D: Display, { if items.len() == usize::try_from(limit.get()).unwrap() { - eprintln!( - "WARN: {}: found {} items (the limit). There may be more items \ - that were ignored. 
Consider overriding with --fetch-limit.",
-            context(),
-            items.len(),
-        );
+        limit_error(limit, context);
     }
 }
 
+fn limit_error<F, D>(limit: NonZeroU32, context: F)
+where
+    F: FnOnce() -> D,
+    D: Display,
+{
+    eprintln!(
+        "WARN: {}: found {} items (the limit). There may be more items \
+         that were ignored. Consider overriding with --fetch-limit.",
+        context(),
+        limit,
+    );
+}
+
 /// Returns pagination parameters to fetch the first page of results for a
 /// paginated endpoint
 fn first_page<'a, T>(limit: NonZeroU32) -> DataPageParams<'a, T> {
@@ -1688,3 +1747,404 @@ fn format_record(record: &DnsRecord) -> impl Display {
         }
     }
 }
+
+// Inventory
+
+async fn cmd_db_inventory(
+    datastore: &DataStore,
+    limit: NonZeroU32,
+    inventory_args: &InventoryArgs,
+) -> Result<(), anyhow::Error> {
+    let conn = datastore.pool_connection_for_tests().await?;
+    match inventory_args.command {
+        InventoryCommands::BaseboardIds => {
+            cmd_db_inventory_baseboard_ids(&conn, limit).await
+        }
+        InventoryCommands::Cabooses => {
+            cmd_db_inventory_cabooses(&conn, limit).await
+        }
+        InventoryCommands::Collections(CollectionsArgs {
+            command: CollectionsCommands::List,
+        }) => cmd_db_inventory_collections_list(&conn, limit).await,
+        InventoryCommands::Collections(CollectionsArgs {
+            command: CollectionsCommands::Show(CollectionsShowArgs { id }),
+        }) => cmd_db_inventory_collections_show(datastore, id, limit).await,
+    }
+}
+
+async fn cmd_db_inventory_baseboard_ids(
+    conn: &DataStoreConnection<'_>,
+    limit: NonZeroU32,
+) -> Result<(), anyhow::Error> {
+    #[derive(Tabled)]
+    #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+    struct BaseboardRow {
+        id: Uuid,
+        part_number: String,
+        serial_number: String,
+    }
+
+    use db::schema::hw_baseboard_id::dsl;
+    let baseboard_ids = dsl::hw_baseboard_id
+        .order_by((dsl::part_number, dsl::serial_number))
+        .limit(i64::from(u32::from(limit)))
+        .select(HwBaseboardId::as_select())
+        .load_async(&**conn)
+        .await
+        .context("loading baseboard ids")?;
+    check_limit(&baseboard_ids, limit, || "loading baseboard ids");
+
+    let rows = baseboard_ids.into_iter().map(|baseboard_id| BaseboardRow {
+        id: baseboard_id.id,
+        part_number: baseboard_id.part_number,
+        serial_number: baseboard_id.serial_number,
+    });
+    let table = tabled::Table::new(rows)
+        .with(tabled::settings::Style::empty())
+        .with(tabled::settings::Padding::new(0, 1, 0, 0))
+        .to_string();
+
+    println!("{}", table);
+
+    Ok(())
+}
+
+async fn cmd_db_inventory_cabooses(
+    conn: &DataStoreConnection<'_>,
+    limit: NonZeroU32,
+) -> Result<(), anyhow::Error> {
+    #[derive(Tabled)]
+    #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+    struct CabooseRow {
+        id: Uuid,
+        board: String,
+        git_commit: String,
+        name: String,
+        version: String,
+    }
+
+    use db::schema::sw_caboose::dsl;
+    let mut cabooses = dsl::sw_caboose
+        .limit(i64::from(u32::from(limit)))
+        .select(SwCaboose::as_select())
+        .load_async(&**conn)
+        .await
+        .context("loading cabooses")?;
+    check_limit(&cabooses, limit, || "loading cabooses");
+    cabooses.sort();
+
+    let rows = cabooses.into_iter().map(|caboose| CabooseRow {
+        id: caboose.id,
+        board: caboose.board,
+        name: caboose.name,
+        version: caboose.version,
+        git_commit: caboose.git_commit,
+    });
+    let table = tabled::Table::new(rows)
+        .with(tabled::settings::Style::empty())
+        .with(tabled::settings::Padding::new(0, 1, 0, 0))
+        .to_string();
+
+    println!("{}", table);
+
+    Ok(())
+}
+
+async fn cmd_db_inventory_collections_list(
+    conn: &DataStoreConnection<'_>,
+    limit: NonZeroU32,
+) -> Result<(), anyhow::Error> {
+    #[derive(Tabled)]
+    #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+    struct CollectionRow {
+        id: Uuid,
+        started: String,
+        took: String,
+        nsps: i64,
+        nerrors: i64,
+    }
+
+    let collections = {
+        use db::schema::inv_collection::dsl;
+        dsl::inv_collection
+            .order_by(dsl::time_started)
+            .limit(i64::from(u32::from(limit)))
+            .select(InvCollection::as_select())
+            .load_async(&**conn)
+            .await
+            .context("loading collections")?
+    };
+    check_limit(&collections, limit, || "loading collections");
+
+    let mut rows = Vec::new();
+    for collection in collections {
+        let nerrors = {
+            use db::schema::inv_collection_error::dsl;
+            dsl::inv_collection_error
+                .filter(dsl::inv_collection_id.eq(collection.id))
+                .select(diesel::dsl::count_star())
+                .first_async(&**conn)
+                .await
+                .context("counting errors")?
+        };
+
+        let nsps = {
+            use db::schema::inv_service_processor::dsl;
+            dsl::inv_service_processor
+                .filter(dsl::inv_collection_id.eq(collection.id))
+                .select(diesel::dsl::count_star())
+                .first_async(&**conn)
+                .await
+                .context("counting SPs")?
+        };
+
+        let took = format!(
+            "{} ms",
+            collection
+                .time_done
+                .signed_duration_since(&collection.time_started)
+                .num_milliseconds()
+        );
+        rows.push(CollectionRow {
+            id: collection.id,
+            started: humantime::format_rfc3339_seconds(
+                collection.time_started.into(),
+            )
+            .to_string(),
+            took,
+            nsps,
+            nerrors,
+        });
+    }
+
+    let table = tabled::Table::new(rows)
+        .with(tabled::settings::Style::empty())
+        .with(tabled::settings::Padding::new(0, 1, 0, 0))
+        .to_string();
+
+    println!("{}", table);
+
+    Ok(())
+}
+
+async fn cmd_db_inventory_collections_show(
+    datastore: &DataStore,
+    id: Uuid,
+    limit: NonZeroU32,
+) -> Result<(), anyhow::Error> {
+    let (collection, incomplete) = datastore
+        .inventory_collection_read_best_effort(id, limit)
+        .await
+        .context("reading collection")?;
+    if incomplete {
+        limit_error(limit, || "loading collection");
+    }
+
+    inv_collection_print(&collection).await?;
+    let nerrors = inv_collection_print_errors(&collection).await?;
+    inv_collection_print_devices(&collection).await?;
+
+    if nerrors > 0 {
+        eprintln!(
+            "warning: {} collection error{} {} reported above",
+            nerrors,
+            if nerrors == 1 { "" } else { "s" },
+            if nerrors == 1 { "was" } else { "were" }
+        );
+    }
+
+    Ok(())
+}
+
+async fn inv_collection_print(
+    collection: &Collection,
+) -> Result<(), anyhow::Error> {
+    println!("collection: {}", collection.id);
+    println!(
+        "collector:  {}{}",
+        collection.collector,
+        if collection.collector.parse::<Uuid>().is_ok() {
+            " (likely a Nexus instance)"
+        } else {
+            ""
+        }
+    );
+    println!(
+        "started:    {}",
+        humantime::format_rfc3339_millis(collection.time_started.into())
+    );
+    println!(
+        "done:       {}",
+        humantime::format_rfc3339_millis(collection.time_done.into())
+    );
+
+    Ok(())
+}
+
+async fn inv_collection_print_errors(
+    collection: &Collection,
+) -> Result<u32, anyhow::Error> {
+    println!("errors:     {}", collection.errors.len());
+    for (index, message) in collection.errors.iter().enumerate() {
+        println!("  error {}: {}", index, message);
+    }
+
+    Ok(collection
+        .errors
+        .len()
+        .try_into()
+        .expect("could not convert error count into u32 (yikes)"))
+}
+
+async fn inv_collection_print_devices(
+    collection: &Collection,
+) -> Result<(), anyhow::Error> {
+    // Assemble a list of baseboard ids, sorted first by device type (sled,
+    // switch, power), then by slot number. This is the order in which we will
+    // print everything out.
+    let mut sorted_baseboard_ids: Vec<_> =
+        collection.sps.keys().cloned().collect();
+    sorted_baseboard_ids.sort_by(|s1, s2| {
+        let sp1 = collection.sps.get(s1).unwrap();
+        let sp2 = collection.sps.get(s2).unwrap();
+        sp1.sp_type.cmp(&sp2.sp_type).then(sp1.sp_slot.cmp(&sp2.sp_slot))
+    });
+
+    // Now print them.
+    for baseboard_id in &sorted_baseboard_ids {
+        // This unwrap should not fail because the collection we're iterating
+        // over came from the one we're looking into now.
+        let sp = collection.sps.get(baseboard_id).unwrap();
+        let baseboard = collection.baseboards.get(baseboard_id);
+        let rot = collection.rots.get(baseboard_id);
+
+        println!("");
+        match baseboard {
+            None => {
+                // It should be impossible to find an SP whose baseboard
+                // information we didn't previously fetch. That's either a bug
+                // in this tool (for failing to fetch or find the right
+                // baseboard information) or the inventory system (for failing
+                // to insert a record into the hw_baseboard_id table).
+                println!(
+                    "{:?} (serial number unknown -- this is a bug)",
+                    sp.sp_type
+                );
+                println!("    part number: unknown");
+            }
+            Some(baseboard) => {
+                println!("{:?} {}", sp.sp_type, baseboard.serial_number);
+                println!("    part number: {}", baseboard.part_number);
+            }
+        };
+
+        println!("    power:    {:?}", sp.power_state);
+        println!("    revision: {}", sp.baseboard_revision);
+        print!("    MGS slot: {:?} {}", sp.sp_type, sp.sp_slot);
+        if let SpType::Sled = sp.sp_type {
+            print!(" (cubby {})", sp.sp_slot);
+        }
+        println!("");
+        println!("    found at: {} from {}", sp.time_collected, sp.source);
+
+        #[derive(Tabled)]
+        #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+        struct CabooseRow<'a> {
+            slot: String,
+            board: &'a str,
+            name: &'a str,
+            version: &'a str,
+            git_commit: &'a str,
+        }
+
+        println!("    cabooses:");
+        let caboose_rows: Vec<_> = CabooseWhich::iter()
+            .filter_map(|c| {
+                collection.caboose_for(c, baseboard_id).map(|d| (c, d))
+            })
+            .map(|(c, found_caboose)| CabooseRow {
+                slot: format!("{:?}", c),
+                board: &found_caboose.caboose.board,
+                name: &found_caboose.caboose.name,
+                version: &found_caboose.caboose.version,
+                git_commit: &found_caboose.caboose.git_commit,
+            })
+            .collect();
+        let table = tabled::Table::new(caboose_rows)
+            .with(tabled::settings::Style::empty())
+            .with(tabled::settings::Padding::new(0, 1, 0, 0))
+            .to_string();
+        println!("{}", textwrap::indent(&table.to_string(), "        "));
+
+        if let Some(rot) = rot {
+            println!("    RoT: active slot: slot {:?}", rot.active_slot);
+            println!(
+                "    RoT: persistent boot preference: slot {:?}",
+                rot.persistent_boot_preference,
+            );
+            println!(
+                "    RoT: pending persistent boot preference: {}",
+                rot.pending_persistent_boot_preference
+                    .map(|s| format!("slot {:?}", s))
+                    .unwrap_or_else(|| String::from("-"))
+            );
+            println!(
+                "    RoT: transient boot preference: {}",
+                rot.transient_boot_preference
+                    .map(|s| format!("slot {:?}", s))
+                    .unwrap_or_else(|| String::from("-"))
+            );
+
+            println!(
+                "    RoT: slot A SHA3-256: {}",
+                rot.slot_a_sha3_256_digest
+                    .clone()
+                    .unwrap_or_else(|| String::from("-"))
+            );
+
+            println!(
+                "    RoT: slot B SHA3-256: {}",
+                rot.slot_b_sha3_256_digest
+                    .clone()
+                    .unwrap_or_else(|| String::from("-"))
+            );
+        } else {
+            println!("    RoT: no information found");
+        }
+    }
+
+    println!("");
+    for sp_missing_rot in collection
+        .sps
+        .keys()
+        .collect::<BTreeSet<_>>()
+        .difference(&collection.rots.keys().collect::<BTreeSet<_>>())
+    {
+        // It's not a bug in either omdb or the inventory system to find an SP
+        // with no RoT. It just means that when we collected inventory from the
+        // SP, it couldn't communicate with its RoT.
+        let sp = collection.sps.get(*sp_missing_rot).unwrap();
+        println!(
+            "warning: found SP with no RoT: {:?} slot {}",
+            sp.sp_type, sp.sp_slot
+        );
+    }
+
+    for rot_missing_sp in collection
+        .rots
+        .keys()
+        .collect::<BTreeSet<_>>()
+        .difference(&collection.sps.keys().collect::<BTreeSet<_>>())
+    {
+        // It *is* a bug in the inventory system (or omdb) to find an RoT with
+        // no SP, since we get the RoT information from the SP in the first
+        // place.
+        println!(
+            "error: found RoT with no SP: \
+            hw_baseboard_id {:?} -- this is a bug",
+            rot_missing_sp
+        );
+    }
+
+    Ok(())
+}
diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/bin/omdb/mgs.rs
index d2938418e1..770cba9f62 100644
--- a/dev-tools/omdb/src/bin/omdb/mgs.rs
+++ b/dev-tools/omdb/src/bin/omdb/mgs.rs
@@ -433,7 +433,7 @@ async fn show_sp_details(
                 board: caboose.board,
                 git_commit: caboose.git_commit,
                 name: caboose.name,
-                version: caboose.version.unwrap_or_else(|| "-".to_string()),
+                version: caboose.version,
             }
         }
     }
diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index 7599fc209d..128d4315f2 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -6,6 +6,7 @@
 
 use crate::Omdb;
 use anyhow::Context;
+use chrono::DateTime;
 use chrono::SecondsFormat;
 use chrono::Utc;
 use clap::Args;
@@ -144,7 +145,10 @@ async fn cmd_nexus_background_tasks_show(
 ) -> Result<(), anyhow::Error> {
     let response =
         client.bgtask_list().await.context("listing background tasks")?;
-    let mut tasks = response.into_inner();
+    // Convert the HashMap to a BTreeMap because we want the keys in sorted
+    // order.
+    let mut tasks =
+        response.into_inner().into_iter().collect::<BTreeMap<_, _>>();
 
     // We want to pick the order that we print some tasks intentionally. Then
     // we want to print anything else that we find.
@@ -478,6 +482,38 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
                 }
             }
         }
+    } else if name == "inventory_collection" {
+        #[derive(Deserialize)]
+        struct InventorySuccess {
+            collection_id: Uuid,
+            time_started: DateTime<Utc>,
+            time_done: DateTime<Utc>,
+        }
+
+        match serde_json::from_value::<InventorySuccess>(details.clone()) {
+            Err(error) => eprintln!(
+                "warning: failed to interpret task details: {:?}: {:?}",
+                error, details
+            ),
+            Ok(found_inventory) => {
+                println!(
+                    "    last collection id:      {}",
+                    found_inventory.collection_id
+                );
+                println!(
+                    "    last collection started: {}",
+                    found_inventory
+                        .time_started
+                        .to_rfc3339_opts(SecondsFormat::Secs, true),
+                );
+                println!(
+                    "    last collection done:    {}",
+                    found_inventory
+                        .time_done
+                        .to_rfc3339_opts(SecondsFormat::Secs, true),
+                );
+            }
+        };
     } else {
         println!(
             "warning: unknown background task: {:?} \
diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out
index 0fbef95f27..7949c1eb61 100644
--- a/dev-tools/omdb/tests/env.out
+++ b/dev-tools/omdb/tests/env.out
@@ -57,6 +57,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: using Nexus URL http://127.0.0.1:REDACTED_PORT
@@ -113,6 +117,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: Nexus URL not specified. Will pick one from DNS.
@@ -156,6 +164,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: Nexus URL not specified. Will pick one from DNS.
diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out
index a830cf671a..8162b6d9de 100644
--- a/dev-tools/omdb/tests/successes.out
+++ b/dev-tools/omdb/tests/successes.out
@@ -251,6 +251,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
@@ -332,6 +336,15 @@ task: "external_endpoints"
     TLS certificates: 0
 
 
+task: "inventory_collection"
+  configured period: every 10m
+  currently executing: no
+  last completed activation: iter 3, triggered by an explicit signal
+    started at <REDACTED     TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
+    last collection id:      REDACTED_UUID_REDACTED_UUID_REDACTED
+    last collection started: <REDACTED     TIMESTAMP>
+    last collection done:    <REDACTED     TIMESTAMP>
+
 ---------------------------------------------
 stderr:
 note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out
index 6ab6cb33fc..e859c325a5 100644
--- a/dev-tools/omdb/tests/usage_errors.out
+++ b/dev-tools/omdb/tests/usage_errors.out
@@ -92,6 +92,7 @@ Usage: omdb db [OPTIONS] <COMMAND>
 Commands:
   disks      Print information about disks
   dns        Print information about internal and external DNS
+  inventory  Print information about collected hardware/software inventory
   services   Print information about control plane services
   sleds      Print information about sleds
   instances  Print information about customer instances
@@ -115,6 +116,7 @@ Usage: omdb db [OPTIONS] <COMMAND>
 Commands:
   disks      Print information about disks
   dns        Print information about internal and external DNS
+  inventory  Print information about collected hardware/software inventory
   services   Print information about control plane services
   sleds      Print information about sleds
   instances  Print information about customer instances
diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs
index e79184f7e5..66778d96e7 100644
--- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs
+++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs
@@ -403,6 +403,10 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> {
         cptestctx.silo_name,
         cptestctx.external_dns_zone_name,
     );
+    println!(
+        "omicron-dev: management gateway: http://{}",
+        cptestctx.gateway.client.bind_address,
+    );
     println!("omicron-dev: silo name: {}", cptestctx.silo_name,);
     println!(
         "omicron-dev: privileged user name: {}",
diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs
index 12bc7b465a..2db6121f1d 100644
--- a/gateway/src/http_entrypoints.rs
+++ b/gateway/src/http_entrypoints.rs
@@ -29,8 +29,6 @@ use dropshot::WebsocketEndpointResult;
 use dropshot::WebsocketUpgrade;
 use futures::TryFutureExt;
 use gateway_messages::SpComponent;
-use gateway_messages::SpError;
-use gateway_sp_comms::error::CommunicationError;
 use gateway_sp_comms::HostPhase2Provider;
 use omicron_common::update::ArtifactHash;
 use schemars::JsonSchema;
@@ -488,7 +486,7 @@ pub struct SpComponentCaboose {
     pub git_commit: String,
     pub board: String,
     pub name: String,
-    pub version: Option<String>,
+    pub version: String,
 }
 
 /// Identity of a host phase2 recovery image.
@@ -725,18 +723,15 @@ async fn sp_component_caboose_get(
         .read_component_caboose(component, firmware_slot, CABOOSE_KEY_NAME)
         .await
        .map_err(SpCommsError::from)?;
-    let version = match sp
+    let version = sp
         .read_component_caboose(component, firmware_slot, CABOOSE_KEY_VERSION)
         .await
-    {
-        Ok(value) => Some(from_utf8(&CABOOSE_KEY_VERSION, value)?),
-        Err(CommunicationError::SpError(SpError::NoSuchCabooseKey(_))) => None,
-        Err(err) => return Err(SpCommsError::from(err).into()),
-    };
+        .map_err(SpCommsError::from)?;
 
     let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?;
     let board = from_utf8(&CABOOSE_KEY_BOARD, board)?;
     let name = from_utf8(&CABOOSE_KEY_NAME, name)?;
+    let version = from_utf8(&CABOOSE_KEY_VERSION, version)?;
 
     let caboose = SpComponentCaboose { git_commit, board, name, version };
 
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 323386ba25..feb25eb1f1 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -84,6 +84,7 @@ usdt.workspace = true
 nexus-defaults.workspace = true
 nexus-db-model.workspace = true
 nexus-db-queries.workspace = true
+nexus-inventory.workspace = true
 nexus-types.workspace = true
 omicron-common.workspace = true
 omicron-passwords.workspace = true
diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs
new file mode 100644
index 0000000000..5b09f289bb
--- /dev/null
+++ b/nexus/db-model/src/inventory.rs
@@ -0,0 +1,443 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Types for representing the hardware/software inventory in the database
+
+use crate::schema::{
+    hw_baseboard_id, inv_caboose, inv_collection, inv_collection_error,
+    inv_root_of_trust, inv_service_processor, sw_caboose,
+};
+use crate::{impl_enum_type, SqlU16, SqlU32};
+use chrono::DateTime;
+use chrono::Utc;
+use diesel::backend::Backend;
+use diesel::deserialize::{self, FromSql};
+use diesel::expression::AsExpression;
+use diesel::pg::Pg;
+use diesel::serialize::ToSql;
+use diesel::{serialize, sql_types};
+use nexus_types::inventory::{
+    BaseboardId, Caboose, Collection, PowerState, RotSlot,
+};
+use uuid::Uuid;
+
+// See [`nexus_types::inventory::PowerState`].
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "hw_power_state"))]
+    pub struct HwPowerStateEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
+    #[diesel(sql_type = HwPowerStateEnum)]
+    pub enum HwPowerState;
+
+    // Enum values
+    A0 => b"A0"
+    A1 => b"A1"
+    A2 => b"A2"
+);
+
+impl From<PowerState> for HwPowerState {
+    fn from(p: PowerState) -> Self {
+        match p {
+            PowerState::A0 => HwPowerState::A0,
+            PowerState::A1 => HwPowerState::A1,
+            PowerState::A2 => HwPowerState::A2,
+        }
+    }
+}
+
+impl From<HwPowerState> for PowerState {
+    fn from(value: HwPowerState) -> Self {
+        match value {
+            HwPowerState::A0 => PowerState::A0,
+            HwPowerState::A1 => PowerState::A1,
+            HwPowerState::A2 => PowerState::A2,
+        }
+    }
+}
+
+// See [`nexus_types::inventory::RotSlot`].
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "hw_rot_slot"))]
+    pub struct HwRotSlotEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
+    #[diesel(sql_type = HwRotSlotEnum)]
+    pub enum HwRotSlot;
+
+    // Enum values
+    A => b"A"
+    B => b"B"
+);
+
+impl From<RotSlot> for HwRotSlot {
+    fn from(value: RotSlot) -> Self {
+        match value {
+            RotSlot::A => HwRotSlot::A,
+            RotSlot::B => HwRotSlot::B,
+        }
+    }
+}
+
+impl From<HwRotSlot> for RotSlot {
+    fn from(value: HwRotSlot) -> RotSlot {
+        match value {
+            HwRotSlot::A => RotSlot::A,
+            HwRotSlot::B => RotSlot::B,
+        }
+    }
+}
+
+// See [`nexus_types::inventory::CabooseWhich`].
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "caboose_which"))]
+    pub struct CabooseWhichEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
+    #[diesel(sql_type = CabooseWhichEnum)]
+    pub enum CabooseWhich;
+
+    // Enum values
+    SpSlot0 => b"sp_slot_0"
+    SpSlot1 => b"sp_slot_1"
+    RotSlotA => b"rot_slot_A"
+    RotSlotB => b"rot_slot_B"
+);
+
+impl From<nexus_types::inventory::CabooseWhich> for CabooseWhich {
+    fn from(c: nexus_types::inventory::CabooseWhich) -> Self {
+        use nexus_types::inventory as nexus_inventory;
+        match c {
+            nexus_inventory::CabooseWhich::SpSlot0 => CabooseWhich::SpSlot0,
+            nexus_inventory::CabooseWhich::SpSlot1 => CabooseWhich::SpSlot1,
+            nexus_inventory::CabooseWhich::RotSlotA => CabooseWhich::RotSlotA,
+            nexus_inventory::CabooseWhich::RotSlotB => CabooseWhich::RotSlotB,
+        }
+    }
+}
+
+impl From<CabooseWhich> for nexus_types::inventory::CabooseWhich {
+    fn from(row: CabooseWhich) -> Self {
+        use nexus_types::inventory as nexus_inventory;
+        match row {
+            CabooseWhich::SpSlot0 => nexus_inventory::CabooseWhich::SpSlot0,
+            CabooseWhich::SpSlot1 => nexus_inventory::CabooseWhich::SpSlot1,
+            CabooseWhich::RotSlotA => nexus_inventory::CabooseWhich::RotSlotA,
+            CabooseWhich::RotSlotB => nexus_inventory::CabooseWhich::RotSlotB,
+        }
+    }
+}
+
+// See [`nexus_types::inventory::SpType`].
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "sp_type"))]
+    pub struct SpTypeEnum;
+
+    #[derive(
+        Copy,
+        Clone,
+        Debug,
+        AsExpression,
+        FromSqlRow,
+        PartialOrd,
+        Ord,
+        PartialEq,
+        Eq
+    )]
+    #[diesel(sql_type = SpTypeEnum)]
+    pub enum SpType;
+
+    // Enum values
+    Sled => b"sled"
+    Switch => b"switch"
+    Power => b"power"
+);
+
+impl From<nexus_types::inventory::SpType> for SpType {
+    fn from(value: nexus_types::inventory::SpType) -> Self {
+        match value {
+            nexus_types::inventory::SpType::Sled => SpType::Sled,
+            nexus_types::inventory::SpType::Power => SpType::Power,
+            nexus_types::inventory::SpType::Switch => SpType::Switch,
+        }
+    }
+}
+
+impl From<SpType> for nexus_types::inventory::SpType {
+    fn from(value: SpType) -> Self {
+        match value {
+            SpType::Sled => nexus_types::inventory::SpType::Sled,
+            SpType::Switch => nexus_types::inventory::SpType::Switch,
+            SpType::Power => nexus_types::inventory::SpType::Power,
+        }
+    }
+}
+
+/// See [`nexus_types::inventory::Collection`].
+#[derive(Queryable, Insertable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_collection)]
+pub struct InvCollection {
+    pub id: Uuid,
+    pub time_started: DateTime<Utc>,
+    pub time_done: DateTime<Utc>,
+    pub collector: String,
+}
+
+impl<'a> From<&'a Collection> for InvCollection {
+    fn from(c: &'a Collection) -> Self {
+        InvCollection {
+            id: c.id,
+            time_started: c.time_started,
+            time_done: c.time_done,
+            collector: c.collector.clone(),
+        }
+    }
+}
+
+/// See [`nexus_types::inventory::BaseboardId`].
+#[derive(Queryable, Insertable, Clone, Debug, Selectable)]
+#[diesel(table_name = hw_baseboard_id)]
+pub struct HwBaseboardId {
+    pub id: Uuid,
+    pub part_number: String,
+    pub serial_number: String,
+}
+
+impl From<BaseboardId> for HwBaseboardId {
+    fn from(c: BaseboardId) -> Self {
+        HwBaseboardId {
+            id: Uuid::new_v4(),
+            part_number: c.part_number,
+            serial_number: c.serial_number,
+        }
+    }
+}
+
+impl From<HwBaseboardId> for BaseboardId {
+    fn from(row: HwBaseboardId) -> Self {
+        BaseboardId {
+            part_number: row.part_number,
+            serial_number: row.serial_number,
+        }
+    }
+}
+
+/// See [`nexus_types::inventory::Caboose`].
+#[derive(
+    Queryable,
+    Insertable,
+    Clone,
+    Debug,
+    Selectable,
+    Eq,
+    PartialEq,
+    Ord,
+    PartialOrd,
+)]
+#[diesel(table_name = sw_caboose)]
+pub struct SwCaboose {
+    pub id: Uuid,
+    pub board: String,
+    pub git_commit: String,
+    pub name: String,
+    pub version: String,
+}
+
+impl From<Caboose> for SwCaboose {
+    fn from(c: Caboose) -> Self {
+        SwCaboose {
+            id: Uuid::new_v4(),
+            board: c.board,
+            git_commit: c.git_commit,
+            name: c.name,
+            version: c.version,
+        }
+    }
+}
+
+impl From<SwCaboose> for Caboose {
+    fn from(row: SwCaboose) -> Self {
+        Self {
+            board: row.board,
+            git_commit: row.git_commit,
+            name: row.name,
+            version: row.version,
+        }
+    }
+}
+
+/// See [`nexus_types::inventory::Collection`].
+#[derive(Queryable, Insertable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_collection_error)]
+pub struct InvCollectionError {
+    pub inv_collection_id: Uuid,
+    pub idx: SqlU16,
+    pub message: String,
+}
+
+impl InvCollectionError {
+    pub fn new(inv_collection_id: Uuid, idx: u16, message: String) -> Self {
+        InvCollectionError {
+            inv_collection_id,
+            idx: SqlU16::from(idx),
+            message,
+        }
+    }
+}
+
+/// See [`nexus_types::inventory::ServiceProcessor`].
+#[derive(Queryable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_service_processor)]
+pub struct InvServiceProcessor {
+    pub inv_collection_id: Uuid,
+    pub hw_baseboard_id: Uuid,
+    pub time_collected: DateTime<Utc>,
+    pub source: String,
+
+    pub sp_type: SpType,
+    pub sp_slot: SpMgsSlot,
+
+    pub baseboard_revision: BaseboardRevision,
+    pub hubris_archive_id: String,
+    pub power_state: HwPowerState,
+}
+
+impl From<InvServiceProcessor> for nexus_types::inventory::ServiceProcessor {
+    fn from(row: InvServiceProcessor) -> Self {
+        Self {
+            time_collected: row.time_collected,
+            source: row.source,
+            sp_type: nexus_types::inventory::SpType::from(row.sp_type),
+            sp_slot: **row.sp_slot,
+            baseboard_revision: **row.baseboard_revision,
+            hubris_archive: row.hubris_archive_id,
+            power_state: PowerState::from(row.power_state),
+        }
+    }
+}
+
+/// Newtype wrapping the MGS-reported slot number for an SP
+///
+/// Current racks only have 32 slots for any given SP type. MGS represents the
+/// slot number with a u32. We truncate it to a u16 (which still requires
+/// storing it as an i32 in the database, since the database doesn't natively
+/// support unsigned integers).
+#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow)]
+#[diesel(sql_type = sql_types::Int4)]
+pub struct SpMgsSlot(SqlU16);
+
+NewtypeFrom! { () pub struct SpMgsSlot(SqlU16); }
+NewtypeDeref! { () pub struct SpMgsSlot(SqlU16); }
+NewtypeDisplay! { () pub struct SpMgsSlot(SqlU16); }
+
+impl ToSql<sql_types::Int4, Pg> for SpMgsSlot {
+    fn to_sql<'a>(
+        &'a self,
+        out: &mut serialize::Output<'a, '_, Pg>,
+    ) -> serialize::Result {
+        <SqlU16 as ToSql<sql_types::Int4, Pg>>::to_sql(
+            &self.0,
+            &mut out.reborrow(),
+        )
+    }
+}
+
+impl<DB> FromSql<sql_types::Int4, DB> for SpMgsSlot
+where
+    DB: Backend,
+    SqlU16: FromSql<sql_types::Int4, DB>,
+{
+    fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result<Self> {
+        Ok(SpMgsSlot(SqlU16::from_sql(bytes)?))
+    }
+}
+
+/// Newtype wrapping the revision number for a particular baseboard
+///
+/// MGS reports this as a u32 and we represent it the same way, though that
+/// would be quite a lot of hardware revisions to go through!
+#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow)]
+#[diesel(sql_type = sql_types::Int8)]
+pub struct BaseboardRevision(SqlU32);
+
+NewtypeFrom! { () pub struct BaseboardRevision(SqlU32); }
+NewtypeDeref! { () pub struct BaseboardRevision(SqlU32); }
+NewtypeDisplay! { () pub struct BaseboardRevision(SqlU32); }
+
+impl ToSql<sql_types::Int8, Pg> for BaseboardRevision {
+    fn to_sql<'a>(
+        &'a self,
+        out: &mut serialize::Output<'a, '_, Pg>,
+    ) -> serialize::Result {
+        <SqlU32 as ToSql<sql_types::Int8, Pg>>::to_sql(
+            &self.0,
+            &mut out.reborrow(),
+        )
+    }
+}
+
+impl<DB> FromSql<sql_types::Int8, DB> for BaseboardRevision
+where
+    DB: Backend,
+    SqlU32: FromSql<sql_types::Int8, DB>,
+{
+    fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result<Self> {
+        Ok(BaseboardRevision(SqlU32::from_sql(bytes)?))
+    }
+}
+
+/// See [`nexus_types::inventory::RotState`].
+#[derive(Queryable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_root_of_trust)]
+pub struct InvRootOfTrust {
+    pub inv_collection_id: Uuid,
+    pub hw_baseboard_id: Uuid,
+    pub time_collected: DateTime<Utc>,
+    pub source: String,
+
+    pub slot_active: HwRotSlot,
+    pub slot_boot_pref_transient: Option<HwRotSlot>,
+    pub slot_boot_pref_persistent: HwRotSlot,
+    pub slot_boot_pref_persistent_pending: Option<HwRotSlot>,
+    pub slot_a_sha3_256: Option<String>,
+    pub slot_b_sha3_256: Option<String>,
+}
+
+impl From<InvRootOfTrust> for nexus_types::inventory::RotState {
+    fn from(row: InvRootOfTrust) -> Self {
+        Self {
+            time_collected: row.time_collected,
+            source: row.source,
+            active_slot: RotSlot::from(row.slot_active),
+            persistent_boot_preference: RotSlot::from(
+                row.slot_boot_pref_persistent,
+            ),
+            pending_persistent_boot_preference: row
+                .slot_boot_pref_persistent_pending
+                .map(RotSlot::from),
+            transient_boot_preference: row
+                .slot_boot_pref_transient
+                .map(RotSlot::from),
+            slot_a_sha3_256_digest: row.slot_a_sha3_256,
+            slot_b_sha3_256_digest: row.slot_b_sha3_256,
+        }
+    }
+}
+
+/// See [`nexus_types::inventory::CabooseFound`].
+#[derive(Queryable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_caboose)]
+pub struct InvCaboose {
+    pub inv_collection_id: Uuid,
+    pub hw_baseboard_id: Uuid,
+    pub time_collected: DateTime<Utc>,
+    pub source: String,
+
+    pub which: CabooseWhich,
+    pub sw_caboose_id: Uuid,
+}
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index f399605f55..7aa8a6b076 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -32,6 +32,7 @@ mod image;
 mod instance;
 mod instance_cpu_count;
 mod instance_state;
+mod inventory;
 mod ip_pool;
 mod ipv4net;
 mod ipv6;
@@ -121,6 +122,7 @@ pub use image::*;
 pub use instance::*;
 pub use instance_cpu_count::*;
 pub use instance_state::*;
+pub use inventory::*;
 pub use ip_pool::*;
 pub use ipv4net::*;
 pub use ipv6::*;
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index e079432e5a..cff261e01a 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -1140,6 +1140,87 @@ table! {
     }
 }
 
+/* hardware inventory */
+
+table! {
+    hw_baseboard_id (id) {
+        id -> Uuid,
+        part_number -> Text,
+        serial_number -> Text,
+    }
+}
+
+table! {
+    sw_caboose (id) {
+        id -> Uuid,
+        board -> Text,
+        git_commit -> Text,
+        name -> Text,
+        version -> Text,
+    }
+}
+
+table! {
+    inv_collection (id) {
+        id -> Uuid,
+        time_started -> Timestamptz,
+        time_done -> Timestamptz,
+        collector -> Text,
+    }
+}
+
+table! {
+    inv_collection_error (inv_collection_id, idx) {
+        inv_collection_id -> Uuid,
+        idx -> Int4,
+        message -> Text,
+    }
+}
+
+table! {
+    inv_service_processor (inv_collection_id, hw_baseboard_id) {
+        inv_collection_id -> Uuid,
+        hw_baseboard_id -> Uuid,
+        time_collected -> Timestamptz,
+        source -> Text,
+
+        sp_type -> crate::SpTypeEnum,
+        sp_slot -> Int4,
+
+        baseboard_revision -> Int8,
+        hubris_archive_id -> Text,
+        power_state -> crate::HwPowerStateEnum,
+    }
+}
+
+table! {
+    inv_root_of_trust (inv_collection_id, hw_baseboard_id) {
+        inv_collection_id -> Uuid,
+        hw_baseboard_id -> Uuid,
+        time_collected -> Timestamptz,
+        source -> Text,
+
+        slot_active -> crate::HwRotSlotEnum,
+        slot_boot_pref_transient -> Nullable<crate::HwRotSlotEnum>,
+        slot_boot_pref_persistent -> crate::HwRotSlotEnum,
+        slot_boot_pref_persistent_pending -> Nullable<crate::HwRotSlotEnum>,
+        slot_a_sha3_256 -> Nullable<Text>,
+        slot_b_sha3_256 -> Nullable<Text>,
+    }
+}
+
+table! {
+    inv_caboose (inv_collection_id, hw_baseboard_id, which) {
+        inv_collection_id -> Uuid,
+        hw_baseboard_id -> Uuid,
+        time_collected -> Timestamptz,
+        source -> Text,
+
+        which -> crate::CabooseWhichEnum,
+        sw_caboose_id -> Uuid,
+    }
+}
+
 table! {
     bootstore_keys (key, generation) {
         key -> Text,
@@ -1162,7 +1243,7 @@ table! {
 ///
 /// This should be updated whenever the schema is changed. For more details,
 /// refer to: schema/crdb/README.adoc
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(8, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(9, 0, 0);
 
 allow_tables_to_appear_in_same_query!(
     system_update,
@@ -1174,6 +1255,10 @@ joinable!(system_update_component_update -> component_update (component_update_id));
 allow_tables_to_appear_in_same_query!(ip_pool_range, ip_pool);
 joinable!(ip_pool_range -> ip_pool (ip_pool_id));
 
+allow_tables_to_appear_in_same_query!(inv_collection, inv_collection_error);
+joinable!(inv_collection_error -> inv_collection (inv_collection_id));
+allow_tables_to_appear_in_same_query!(hw_baseboard_id, sw_caboose, inv_caboose);
+
 allow_tables_to_appear_in_same_query!(
     dataset,
     disk,
diff --git a/nexus/db-model/src/unsigned.rs b/nexus/db-model/src/unsigned.rs
index 7059c6bcad..b4e9db2308 100644
--- a/nexus/db-model/src/unsigned.rs
+++ b/nexus/db-model/src/unsigned.rs
@@ -83,6 +83,7 @@ pub struct SqlU16(pub u16);
 
 NewtypeFrom! { () pub struct SqlU16(u16); }
 NewtypeDeref! { () pub struct SqlU16(u16); }
+NewtypeDisplay! { () pub struct SqlU16(u16); }
 
 impl SqlU16 {
     pub fn new(value: u16) -> Self {
@@ -134,6 +135,7 @@ pub struct SqlU32(pub u32);
 
 NewtypeFrom! { () pub struct SqlU32(u32); }
 NewtypeDeref! { () pub struct SqlU32(u32); }
+NewtypeDisplay! { () pub struct SqlU32(u32); }
 
 impl SqlU32 {
     pub fn new(value: u32) -> Self {
diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml
index c16c0f5319..5edf4f1e89 100644
--- a/nexus/db-queries/Cargo.toml
+++ b/nexus/db-queries/Cargo.toml
@@ -70,8 +70,10 @@ omicron-workspace-hack.workspace = true
 assert_matches.workspace = true
 expectorate.workspace = true
 hyper-rustls.workspace = true
+gateway-client.workspace = true
 internal-dns.workspace = true
 itertools.workspace = true
+nexus-inventory.workspace = true
 nexus-test-utils.workspace = true
 omicron-sled-agent.workspace = true
 omicron-test-utils.workspace = true
diff --git a/nexus/db-queries/src/authz/api_resources.rs b/nexus/db-queries/src/authz/api_resources.rs
index ec959e2907..b22fe1ac25 100644
--- a/nexus/db-queries/src/authz/api_resources.rs
+++ b/nexus/db-queries/src/authz/api_resources.rs
@@ -473,6 +473,61 @@ impl AuthorizedResource for DeviceAuthRequestList {
     }
 }
 
+/// Synthetic resource used for modeling access to low-level hardware inventory
+/// data
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub struct Inventory;
+pub const INVENTORY: Inventory = Inventory {};
+
+impl oso::PolarClass for Inventory {
+    fn get_polar_class_builder() -> oso::ClassBuilder<Inventory> {
+        // Roles are not directly attached to Inventory
+        oso::Class::builder()
+            .with_equality_check()
+            .add_method(
+                "has_role",
+                |_: &Inventory, _actor: AuthenticatedActor, _role: String| {
+                    false
+                },
+            )
+            .add_attribute_getter("fleet", |_| FLEET)
+    }
+}
+
+impl AuthorizedResource for Inventory {
+    fn load_roles<'a, 'b, 'c, 'd, 'e, 'f>(
+        &'a self,
+        opctx: &'b OpContext,
+        datastore: &'c DataStore,
+        authn: &'d authn::Context,
+        roleset: &'e mut RoleSet,
+    ) -> futures::future::BoxFuture<'f, Result<(), Error>>
+    where
+        'a: 'f,
+        'b: 'f,
+        'c: 'f,
+        'd: 'f,
+        'e: 'f,
+    {
+        load_roles_for_resource_tree(&FLEET, opctx, datastore, authn, roleset)
+            .boxed()
+    }
+
+    fn on_unauthorized(
+        &self,
+        _: &Authz,
+        error: Error,
+        _: AnyActor,
+        _: Action,
+    ) -> Error {
+        error
+    }
+
+    fn polar_class(&self) -> oso::Class {
+        Self::get_polar_class()
+    }
+}
+
 /// Synthetic resource describing the list of Certificates associated with a
 /// Silo
 #[derive(Clone, Debug, Eq, PartialEq)]
diff --git a/nexus/db-queries/src/authz/omicron.polar b/nexus/db-queries/src/authz/omicron.polar
index 119eccc8e9..87fdf72f6a 100644
--- a/nexus/db-queries/src/authz/omicron.polar
+++ b/nexus/db-queries/src/authz/omicron.polar
@@ -365,6 +365,16 @@ resource DnsConfig {
 has_relation(fleet: Fleet, "parent_fleet", dns_config: DnsConfig)
     if dns_config.fleet = fleet;
 
+# Describes the policy for reading and modifying low-level inventory
+resource Inventory {
+    permissions = [ "read", "modify" ];
+    relations = { parent_fleet: Fleet };
+    "read" if "viewer" on "parent_fleet";
+    "modify" if "admin" on "parent_fleet";
+}
+has_relation(fleet: Fleet, "parent_fleet", inventory: Inventory)
+    if inventory.fleet = fleet;
+
 # Describes the policy for accessing "/v1/system/ip-pools" in the API
 resource IpPoolList {
     permissions = [
diff --git a/nexus/db-queries/src/authz/oso_generic.rs b/nexus/db-queries/src/authz/oso_generic.rs
index bcd7a42945..e642062ead 100644
--- a/nexus/db-queries/src/authz/oso_generic.rs
+++ b/nexus/db-queries/src/authz/oso_generic.rs
@@ -106,6 +106,7 @@ pub fn make_omicron_oso(log: &slog::Logger) -> Result<OsoInit, anyhow::Error> {
         Database::get_polar_class(),
         DnsConfig::get_polar_class(),
         Fleet::get_polar_class(),
+        Inventory::get_polar_class(),
         IpPoolList::get_polar_class(),
ConsoleSessionList::get_polar_class(), DeviceAuthRequestList::get_polar_class(), diff --git a/nexus/db-queries/src/authz/policy_test/resource_builder.rs b/nexus/db-queries/src/authz/policy_test/resource_builder.rs index a4c68ea000..f10c969038 100644 --- a/nexus/db-queries/src/authz/policy_test/resource_builder.rs +++ b/nexus/db-queries/src/authz/policy_test/resource_builder.rs @@ -244,9 +244,10 @@ macro_rules! impl_dyn_authorized_resource_for_global { impl_dyn_authorized_resource_for_global!(authz::oso_generic::Database); impl_dyn_authorized_resource_for_global!(authz::ConsoleSessionList); +impl_dyn_authorized_resource_for_global!(authz::DeviceAuthRequestList); impl_dyn_authorized_resource_for_global!(authz::DnsConfig); impl_dyn_authorized_resource_for_global!(authz::IpPoolList); -impl_dyn_authorized_resource_for_global!(authz::DeviceAuthRequestList); +impl_dyn_authorized_resource_for_global!(authz::Inventory); impl DynAuthorizedResource for authz::SiloCertificateList { fn do_authorize<'a, 'b>( diff --git a/nexus/db-queries/src/authz/policy_test/resources.rs b/nexus/db-queries/src/authz/policy_test/resources.rs index 054fe6430b..3049f3b9bf 100644 --- a/nexus/db-queries/src/authz/policy_test/resources.rs +++ b/nexus/db-queries/src/authz/policy_test/resources.rs @@ -67,6 +67,7 @@ pub async fn make_resources( builder.new_resource(authz::CONSOLE_SESSION_LIST); builder.new_resource(authz::DNS_CONFIG); builder.new_resource(authz::DEVICE_AUTH_REQUEST_LIST); + builder.new_resource(authz::INVENTORY); builder.new_resource(authz::IP_POOL_LIST); // Silo/organization/project hierarchy diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs new file mode 100644 index 0000000000..6b7d97754a --- /dev/null +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -0,0 +1,1518 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+
+use super::DataStore;
+use crate::authz;
+use crate::context::OpContext;
+use crate::db;
+use crate::db::error::public_error_from_diesel;
+use crate::db::error::ErrorHandler;
+use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL;
+use crate::db::TransactionError;
+use anyhow::anyhow;
+use anyhow::bail;
+use anyhow::Context;
+use async_bb8_diesel::AsyncConnection;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use async_bb8_diesel::AsyncSimpleConnection;
+use diesel::expression::SelectableHelper;
+use diesel::sql_types::Nullable;
+use diesel::BoolExpressionMethods;
+use diesel::ExpressionMethods;
+use diesel::IntoSql;
+use diesel::JoinOnDsl;
+use diesel::NullableExpressionMethods;
+use diesel::QueryDsl;
+use diesel::Table;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use nexus_db_model::CabooseWhichEnum;
+use nexus_db_model::HwBaseboardId;
+use nexus_db_model::HwPowerState;
+use nexus_db_model::HwPowerStateEnum;
+use nexus_db_model::HwRotSlot;
+use nexus_db_model::HwRotSlotEnum;
+use nexus_db_model::InvCaboose;
+use nexus_db_model::InvCollection;
+use nexus_db_model::InvCollectionError;
+use nexus_db_model::InvRootOfTrust;
+use nexus_db_model::InvServiceProcessor;
+use nexus_db_model::SpType;
+use nexus_db_model::SpTypeEnum;
+use nexus_db_model::SwCaboose;
+use nexus_types::inventory::Collection;
+use omicron_common::api::external::Error;
+use omicron_common::api::external::InternalContext;
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
+use std::num::NonZeroU32;
+use std::sync::Arc;
+use uuid::Uuid;
+
+impl DataStore {
+    /// Store a complete inventory collection into the database
+    pub async fn inventory_insert_collection(
+        &self,
+        opctx: &OpContext,
+        collection: &Collection,
+    ) -> Result<(), Error> {
+        opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?;
+
+        // In the database, the collection is represented essentially as a tree
+        // rooted at an `inv_collection` row. Other nodes in the tree point
+        // back at the `inv_collection` via `inv_collection_id`.
+        //
+        // It's helpful to assemble some values before entering the transaction
+        // so that we can produce the `Error` type that we want here.
+        let row_collection = InvCollection::from(collection);
+        let collection_id = row_collection.id;
+        let baseboards = collection
+            .baseboards
+            .iter()
+            .map(|b| HwBaseboardId::from((**b).clone()))
+            .collect::<Vec<_>>();
+        let cabooses = collection
+            .cabooses
+            .iter()
+            .map(|s| SwCaboose::from((**s).clone()))
+            .collect::<Vec<_>>();
+        let error_values = collection
+            .errors
+            .iter()
+            .enumerate()
+            .map(|(i, message)| {
+                let index = u16::try_from(i).map_err(|e| {
+                    Error::internal_error(&format!(
+                        "failed to convert error index to u16 (too \
+                        many errors in inventory collection?): {}",
+                        e
+                    ))
+                })?;
+                Ok(InvCollectionError::new(
+                    collection_id,
+                    index,
+                    message.clone(),
+                ))
+            })
+            .collect::<Result<Vec<_>, Error>>()?;
+
+        // This implementation inserts all records associated with the
+        // collection in one transaction. This is primarily for simplicity. It
+        // means we don't have to worry about other readers seeing a
+        // half-inserted collection, nor leaving detritus around if we start
+        // inserting records and then crash. However, it does mean this is
+        // likely to be a big transaction and if that becomes a problem we
+        // could break this up as long as we address those problems.
+        //
+        // The SQL here is written so that it doesn't have to be an
+        // *interactive* transaction. That is, it should in principle be
+        // possible to generate all this SQL up front and send it as one big
+        // batch rather than making a bunch of round-trips to the database.
+        // We'd do that if we had an interface for doing that with bound
+        // parameters, etc. See oxidecomputer/omicron#973.
+        let pool = self.pool_connection_authorized(opctx).await?;
+        pool.transaction_async(|conn| async move {
+            // Insert records (and generate ids) for any baseboards that do not
+            // already exist in the database. These rows are not scoped to a
+            // particular collection. They contain only immutable data --
+            // they're just a mapping between hardware-provided baseboard
+            // identifiers (part number and model number) and an
+            // Omicron-specific primary key (a UUID).
+            {
+                use db::schema::hw_baseboard_id::dsl;
+                let _ = diesel::insert_into(dsl::hw_baseboard_id)
+                    .values(baseboards)
+                    .on_conflict_do_nothing()
+                    .execute_async(&conn)
+                    .await?;
+            }
+
+            // Insert records (and generate ids) for each distinct caboose that
+            // we've found. Like baseboards, these might already be present and
+            // rows in this table are not scoped to a particular collection
+            // because they only map (immutable) identifiers to UUIDs.
+            {
+                use db::schema::sw_caboose::dsl;
+                let _ = diesel::insert_into(dsl::sw_caboose)
+                    .values(cabooses)
+                    .on_conflict_do_nothing()
+                    .execute_async(&conn)
+                    .await?;
+            }
+
+            // Insert a record describing the collection itself.
+            {
+                use db::schema::inv_collection::dsl;
+                let _ = diesel::insert_into(dsl::inv_collection)
+                    .values(row_collection)
+                    .execute_async(&conn)
+                    .await?;
+            }
+
+            // Insert rows for the service processors we found. These have a
+            // foreign key into the hw_baseboard_id table. We don't have those
+            // id values, though. We may have just inserted them, or maybe not
+            // (if they previously existed). To avoid dozens of unnecessary
+            // round-trips, we use INSERT INTO ... SELECT, which looks like
+            // this:
+            //
+            //     INSERT INTO inv_service_processor
+            //         SELECT
+            //             id
+            //             [other service_processor column values as literals]
+            //           FROM hw_baseboard_id
+            //          WHERE part_number = ... AND serial_number = ...;
+            //
+            // This way, we don't need to know the id. The database looks it up
+            // for us as it does the INSERT.
+            {
+                use db::schema::hw_baseboard_id::dsl as baseboard_dsl;
+                use db::schema::inv_service_processor::dsl as sp_dsl;
+
+                for (baseboard_id, sp) in &collection.sps {
+                    let selection = db::schema::hw_baseboard_id::table
+                        .select((
+                            collection_id.into_sql::<diesel::sql_types::Uuid>(),
+                            baseboard_dsl::id,
+                            sp.time_collected
+                                .into_sql::<diesel::sql_types::Timestamptz>(),
+                            sp.source
+                                .clone()
+                                .into_sql::<diesel::sql_types::Text>(),
+                            SpType::from(sp.sp_type).into_sql::<SpTypeEnum>(),
+                            i32::from(sp.sp_slot)
+                                .into_sql::<diesel::sql_types::Int4>(),
+                            i64::from(sp.baseboard_revision)
+                                .into_sql::<diesel::sql_types::Int8>(),
+                            sp.hubris_archive
+                                .clone()
+                                .into_sql::<diesel::sql_types::Text>(),
+                            HwPowerState::from(sp.power_state)
+                                .into_sql::<HwPowerStateEnum>(),
+                        ))
+                        .filter(
+                            baseboard_dsl::part_number
+                                .eq(baseboard_id.part_number.clone()),
+                        )
+                        .filter(
+                            baseboard_dsl::serial_number
+                                .eq(baseboard_id.serial_number.clone()),
+                        );
+
+                    let _ = diesel::insert_into(
+                        db::schema::inv_service_processor::table,
+                    )
+                    .values(selection)
+                    .into_columns((
+                        sp_dsl::inv_collection_id,
+                        sp_dsl::hw_baseboard_id,
+                        sp_dsl::time_collected,
+                        sp_dsl::source,
+                        sp_dsl::sp_type,
+                        sp_dsl::sp_slot,
+                        sp_dsl::baseboard_revision,
+                        sp_dsl::hubris_archive_id,
+                        sp_dsl::power_state,
+                    ))
+                    .execute_async(&conn)
+                    .await?;
+
+                    // This statement is just here to force a compilation error
+                    // if the set of columns in `inv_service_processor` changes.
+                    // The code above attempts to insert a row into
+                    // `inv_service_processor` using an explicit list of columns
+                    // and values. Without the following statement, if a new
+                    // required column were added, this would only fail at
+                    // runtime.
+                    //
+                    // If you're here because of a compile error, you might be
+                    // changing the `inv_service_processor` table. Update the
+                    // statement below and be sure to update the code above,
+                    // too!
+                    //
+                    // See also similar comments in blocks below, near other
+                    // uses of `all_columns()`.
+                    let (
+                        _inv_collection_id,
+                        _hw_baseboard_id,
+                        _time_collected,
+                        _source,
+                        _sp_type,
+                        _sp_slot,
+                        _baseboard_revision,
+                        _hubris_archive_id,
+                        _power_state,
+                    ) = sp_dsl::inv_service_processor::all_columns();
+                }
+            }
+
+            // Insert rows for the roots of trust that we found. Like service
+            // processors, we do this using INSERT INTO ... SELECT.
+            {
+                use db::schema::hw_baseboard_id::dsl as baseboard_dsl;
+                use db::schema::inv_root_of_trust::dsl as rot_dsl;
+
+                for (baseboard_id, rot) in &collection.rots {
+                    let selection = db::schema::hw_baseboard_id::table
+                        .select((
+                            collection_id.into_sql::<diesel::sql_types::Uuid>(),
+                            baseboard_dsl::id,
+                            rot.time_collected
+                                .into_sql::<diesel::sql_types::Timestamptz>(),
+                            rot.source
+                                .clone()
+                                .into_sql::<diesel::sql_types::Text>(),
+                            HwRotSlot::from(rot.active_slot)
+                                .into_sql::<HwRotSlotEnum>(),
+                            HwRotSlot::from(rot.persistent_boot_preference)
+                                .into_sql::<HwRotSlotEnum>(),
+                            rot.pending_persistent_boot_preference
+                                .map(HwRotSlot::from)
+                                .into_sql::<Nullable<HwRotSlotEnum>>(),
+                            rot.transient_boot_preference
+                                .map(HwRotSlot::from)
+                                .into_sql::<Nullable<HwRotSlotEnum>>(),
+                            rot.slot_a_sha3_256_digest
+                                .clone()
+                                .into_sql::<Nullable<diesel::sql_types::Text>>(
+                                ),
+                            rot.slot_b_sha3_256_digest
+                                .clone()
+                                .into_sql::<Nullable<diesel::sql_types::Text>>(
+                                ),
+                        ))
+                        .filter(
+                            baseboard_dsl::part_number
+                                .eq(baseboard_id.part_number.clone()),
+                        )
+                        .filter(
+                            baseboard_dsl::serial_number
+                                .eq(baseboard_id.serial_number.clone()),
+                        );
+
+                    let _ = diesel::insert_into(
+                        db::schema::inv_root_of_trust::table,
+                    )
+                    .values(selection)
+                    .into_columns((
+                        rot_dsl::inv_collection_id,
+                        rot_dsl::hw_baseboard_id,
+                        rot_dsl::time_collected,
+                        rot_dsl::source,
+                        rot_dsl::slot_active,
+                        rot_dsl::slot_boot_pref_persistent,
+                        rot_dsl::slot_boot_pref_persistent_pending,
+                        rot_dsl::slot_boot_pref_transient,
+                        rot_dsl::slot_a_sha3_256,
+                        rot_dsl::slot_b_sha3_256,
+                    ))
+                    .execute_async(&conn)
+                    .await?;
+
+                    // See the comment in the previous block (where we use
+                    // `inv_service_processor::all_columns()`). The same
+                    // applies here.
+                    let (
+                        _inv_collection_id,
+                        _hw_baseboard_id,
+                        _time_collected,
+                        _source,
+                        _slot_active,
+                        _slot_boot_pref_persistent,
+                        _slot_boot_pref_persistent_pending,
+                        _slot_boot_pref_transient,
+                        _slot_a_sha3_256,
+                        _slot_b_sha3_256,
+                    ) = rot_dsl::inv_root_of_trust::all_columns();
+                }
+            }
+
+            // Insert rows for the cabooses that we found. Like service
+            // processors and roots of trust, we do this using INSERT INTO ...
+            // SELECT. This one's a little more complicated because there are
+            // two foreign keys. Concretely, we have these three tables:
+            //
+            // - `hw_baseboard` with an "id" primary key and lookup columns
+            //   "part_number" and "serial_number"
+            // - `sw_caboose` with an "id" primary key and lookup columns
+            //   "board", "git_commit", "name", and "version"
+            // - `inv_caboose` with foreign keys "hw_baseboard_id",
+            //   "sw_caboose_id", and various other columns
+            //
+            // We want to INSERT INTO `inv_caboose` a row with:
+            //
+            // - hw_baseboard_id (foreign key) the result of looking up an
+            //   hw_baseboard row by a specific part number and serial number
+            //
+            // - sw_caboose_id (foreign key) the result of looking up a
+            //   specific sw_caboose row by board, git_commit, name, and version
+            //
+            // - the other columns being literals
+            //
+            // To achieve this, we're going to generate something like:
+            //
+            //     INSERT INTO
+            //         inv_caboose (
+            //             hw_baseboard_id,
+            //             sw_caboose_id,
+            //             inv_collection_id,
+            //             time_collected,
+            //             source,
+            //             which,
+            //         )
+            //     SELECT (
+            //         hw_baseboard_id.id,
+            //         sw_caboose.id,
+            //         ... /* literal collection id */
+            //         ... /* literal time collected */
+            //         ... /* literal source */
+            //         ... /* literal 'which' */
+            //     )
+            //     FROM
+            //         hw_baseboard
+            //     INNER JOIN
+            //         sw_caboose
+            //     ON  hw_baseboard.part_number = ...
+            //     AND hw_baseboard.serial_number = ...
+            //     AND sw_caboose.board = ...
+            //     AND sw_caboose.git_commit = ...
+            //     AND sw_caboose.name = ...
+            //     AND sw_caboose.version = ...;
+            //
+            // Again, the whole point is to avoid back-and-forth between the
+            // client and the database. Those back-and-forth interactions can
+            // significantly increase latency and the probability of
+            // transaction conflicts. See RFD 192 for details. (Unfortunately,
+            // we still _are_ going back and forth here to issue each of these
+            // queries. But that's an artifact of the interface we currently
+            // have for sending queries. It should be possible to send all of
+            // these in one batch.)
+            for (which, tree) in &collection.cabooses_found {
+                let db_which = nexus_db_model::CabooseWhich::from(*which);
+                for (baseboard_id, found_caboose) in tree {
+                    use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id;
+                    use db::schema::inv_caboose::dsl as dsl_inv_caboose;
+                    use db::schema::sw_caboose::dsl as dsl_sw_caboose;
+
+                    let selection = db::schema::hw_baseboard_id::table
+                        .inner_join(
+                            db::schema::sw_caboose::table.on(
+                                dsl_baseboard_id::part_number
+                                    .eq(baseboard_id.part_number.clone())
+                                    .and(
+                                        dsl_baseboard_id::serial_number.eq(
+                                            baseboard_id.serial_number.clone(),
+                                        ),
+                                    )
+                                    .and(dsl_sw_caboose::board.eq(
+                                        found_caboose.caboose.board.clone(),
+                                    ))
+                                    .and(
+                                        dsl_sw_caboose::git_commit.eq(
+                                            found_caboose
+                                                .caboose
+                                                .git_commit
+                                                .clone(),
+                                        ),
+                                    )
+                                    .and(
+                                        dsl_sw_caboose::name.eq(found_caboose
+                                            .caboose
+                                            .name
+                                            .clone()),
+                                    )
+                                    .and(dsl_sw_caboose::version.eq(
+                                        found_caboose.caboose.version.clone(),
+                                    )),
+                            ),
+                        )
+                        .select((
+                            dsl_baseboard_id::id,
+                            dsl_sw_caboose::id,
+                            collection_id.into_sql::<diesel::sql_types::Uuid>(),
+                            found_caboose
+                                .time_collected
+                                .into_sql::<diesel::sql_types::Timestamptz>(),
+                            found_caboose
+                                .source
+                                .clone()
+                                .into_sql::<diesel::sql_types::Text>(),
+                            db_which.into_sql::<CabooseWhichEnum>(),
+                        ));
+
+                    let _ = diesel::insert_into(db::schema::inv_caboose::table)
+                        .values(selection)
+                        .into_columns((
+                            dsl_inv_caboose::hw_baseboard_id,
+                            dsl_inv_caboose::sw_caboose_id,
+                            dsl_inv_caboose::inv_collection_id,
+                            dsl_inv_caboose::time_collected,
+                            dsl_inv_caboose::source,
+                            dsl_inv_caboose::which,
+                        ))
+                        .execute_async(&conn)
+                        .await?;
+
+                    // See the comments above. The same applies here. 
If you + // update the statement below because the schema for + // `inv_caboose` has changed, be sure to update the code + // above, too! + let ( + _hw_baseboard_id, + _sw_caboose_id, + _inv_collection_id, + _time_collected, + _source, + _which, + ) = dsl_inv_caboose::inv_caboose::all_columns(); + } + } + + // Finally, insert the list of errors. + { + use db::schema::inv_collection_error::dsl as errors_dsl; + let _ = diesel::insert_into(errors_dsl::inv_collection_error) + .values(error_values) + .execute_async(&conn) + .await?; + } + + Ok(()) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + info!( + &opctx.log, + "inserted inventory collection"; + "collection_id" => collection.id.to_string(), + ); + + Ok(()) + } + + /// Prune inventory collections stored in the database, keeping at least + /// `nkeep`. + /// + /// This function removes as many collections as possible while preserving + /// the last `nkeep`. This will also preserve at least one "complete" + /// collection (i.e., one having zero errors). + // It might seem surprising that such a high-level application policy is + // embedded in the DataStore. The reason is that we want to push a bunch of + // the logic into the SQL to avoid interactive queries. + pub async fn inventory_prune_collections( + &self, + opctx: &OpContext, + nkeep: u32, + ) -> Result<(), Error> { + // Assumptions: + // + // - Most of the time, there will be about `nkeep + 1` collections in + // the database. That's because the normal expected case is: we had + // `nkeep`, we created another one, and now we're pruning the oldest + // one. + // + // - There could be fewer collections in the database, early in the + // system's lifetime (before we've accumulated `nkeep` of them). + // + // - There could be many more collections in the database, if something + // has gone wrong and we've fallen behind in our cleanup. + // + // - Due to transient errors during the collection process, it's + // possible that a collection is known to be potentially incomplete. + // We can tell this because it has rows in `inv_collection_errors`. + // (It's possible that a collection can be incomplete with zero + // errors, but we can't know that here and so we can't do anything + // about it.) + // + // Goals: + // + // - When this function returns without error, there were at most + // `nkeep` collections in the database. + // + // - If we have to remove any collections, we want to start from the + // oldest ones. (We want to maintain a window of the last `nkeep`, + // not the first `nkeep - 1` from the beginning of time plus the most + // recent one.) + // + // - We want to avoid removing the last collection that had zero errors. + // (If we weren't careful, we might do this if there were `nkeep` + // collections with errors that were newer than the last complete + // collection.) + // + // Here's the plan: + // + // - Select from the database the `nkeep + 1` oldest collections and the + // number of errors associated with each one. + // + // - If we got fewer than `nkeep + 1` back, we're done. We shouldn't + // prune anything. + // + // - Otherwise, if the oldest collection is the only complete one, + // remove the next-oldest collection and go back to the top (repeat). + // + // - Otherwise, remove the oldest collection and go back to the top + // (repeat). + // + // This seems surprisingly complicated. It's designed to meet the above + // goals. + // + // Is this going to work if multiple Nexuses are doing it concurrently? 
+ // This cannot remove the last complete collection because a given Nexus + // will only remove a complete collection if it has seen a newer + // complete one. This cannot result in keeping fewer than "nkeep" + // collections because any Nexus will only remove a collection if there + // are "nkeep" newer ones. In both of these cases, another Nexus might + // remove one of the ones that the first Nexus was counting on keeping, + // but only if there was a newer one to replace it. + + opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?; + + loop { + match self.inventory_find_pruneable(opctx, nkeep).await? { + None => break, + Some(collection_id) => { + self.inventory_delete_collection(opctx, collection_id) + .await? + } + } + } + + Ok(()) + } + + /// Return the oldest inventory collection that's eligible for pruning, + /// if any + /// + /// The caller of this (non-pub) function is responsible for authz. + async fn inventory_find_pruneable( + &self, + opctx: &OpContext, + nkeep: u32, + ) -> Result, Error> { + let conn = self.pool_connection_authorized(opctx).await?; + // Diesel requires us to use aliases in order to refer to the + // `inv_collection` table twice in the same query. + let (inv_collection1, inv_collection2) = diesel::alias!( + db::schema::inv_collection as inv_collection1, + db::schema::inv_collection as inv_collection2 + ); + + // This subquery essentially generates: + // + // SELECT id FROM inv_collection ORDER BY time_started" ASC LIMIT $1 + // + // where $1 becomes `nkeep + 1`. This just lists the `nkeep + 1` oldest + // collections. + let subquery = inv_collection1 + .select(inv_collection1.field(db::schema::inv_collection::id)) + .order_by( + inv_collection1 + .field(db::schema::inv_collection::time_started) + .asc(), + ) + .limit(i64::from(nkeep) + 1); + + // This essentially generates: + // + // SELECT + // inv_collection.id, + // count(inv_collection_error.inv_collection_id) + // FROM ( + // inv_collection + // LEFT OUTER JOIN + // inv_collection_error + // ON ( + // inv_collection_error.inv_collection_id = inv_collection.id + // ) + // ) WHERE ( + // inv_collection.id = ANY( <> ) + // ) + // GROUP BY inv_collection.id + // ORDER BY inv_collection.time_started ASC + // + // This looks a lot scarier than it is. The goal is to produce a + // two-column table that looks like this: + // + // collection_id1 count of errors from collection_id1 + // collection_id2 count of errors from collection_id2 + // collection_id3 count of errors from collection_id3 + // ... + // + let candidates: Vec<(Uuid, i64)> = inv_collection2 + .left_outer_join(db::schema::inv_collection_error::table) + .filter( + inv_collection2 + .field(db::schema::inv_collection::id) + .eq_any(subquery), + ) + .group_by(inv_collection2.field(db::schema::inv_collection::id)) + .select(( + inv_collection2.field(db::schema::inv_collection::id), + diesel::dsl::count( + db::schema::inv_collection_error::inv_collection_id + .nullable(), + ), + )) + .order_by( + inv_collection2 + .field(db::schema::inv_collection::time_started) + .asc(), + ) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .internal_context("listing oldest collections")?; + + if u32::try_from(candidates.len()).unwrap() <= nkeep { + debug!( + &opctx.log, + "inventory_prune_one: nothing eligible for removal (too few)"; + "candidates" => ?candidates, + ); + return Ok(None); + } + + // We've now got up to "nkeep + 1" oldest collections, starting with the + // very oldest. 
We can get rid of the oldest one unless it's the only
+        // complete one.  Another way to think about it: find the _last_
+        // complete one.  Remove it from the list of candidates.  Now mark the
+        // first item in the remaining list for deletion.
+        let last_completed_idx = candidates
+            .iter()
+            .enumerate()
+            .rev()
+            .find(|(_i, (_collection_id, nerrors))| *nerrors == 0);
+        let candidate = match last_completed_idx {
+            Some((i, _)) if i == 0 => candidates.iter().nth(1),
+            _ => candidates.iter().next(),
+        }
+        .map(|(collection_id, _nerrors)| *collection_id);
+        if let Some(c) = candidate {
+            debug!(
+                &opctx.log,
+                "inventory_prune_one: eligible for removal";
+                "collection_id" => c.to_string(),
+                "candidates" => ?candidates,
+            );
+        } else {
+            debug!(
+                &opctx.log,
+                "inventory_prune_one: nothing eligible for removal";
+                "candidates" => ?candidates,
+            );
+        }
+        Ok(candidate)
+    }
+
+    /// Removes an inventory collection from the database
+    ///
+    /// The caller of this (non-pub) function is responsible for authz.
+    async fn inventory_delete_collection(
+        &self,
+        opctx: &OpContext,
+        collection_id: Uuid,
+    ) -> Result<(), Error> {
+        // As with inserting a whole collection, we remove it in one big
+        // transaction for simplicity.  Similar considerations apply.  We
+        // could break it up if these transactions become too big.  But we'd
+        // need a way to stop other clients from discovering a collection
+        // after we start removing it and we'd also need to make sure we
+        // didn't leak a collection if we crash while deleting it.
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let (ncollections, nsps, nrots, ncabooses, nerrors) = conn
+            .transaction_async(|conn| async move {
+                // Remove the record describing the collection itself.
+                let ncollections = {
+                    use db::schema::inv_collection::dsl;
+                    diesel::delete(
+                        dsl::inv_collection.filter(dsl::id.eq(collection_id)),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
+
+                // Remove rows for service processors.
+                let nsps = {
+                    use db::schema::inv_service_processor::dsl;
+                    diesel::delete(
+                        dsl::inv_service_processor
+                            .filter(dsl::inv_collection_id.eq(collection_id)),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
+
+                // Remove rows for roots of trust.
+                let nrots = {
+                    use db::schema::inv_root_of_trust::dsl;
+                    diesel::delete(
+                        dsl::inv_root_of_trust
+                            .filter(dsl::inv_collection_id.eq(collection_id)),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
+
+                // Remove rows for cabooses found.
+                let ncabooses = {
+                    use db::schema::inv_caboose::dsl;
+                    diesel::delete(
+                        dsl::inv_caboose
+                            .filter(dsl::inv_collection_id.eq(collection_id)),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
+
+                // Remove rows for errors encountered.
+                let nerrors = {
+                    use db::schema::inv_collection_error::dsl;
+                    diesel::delete(
+                        dsl::inv_collection_error
+                            .filter(dsl::inv_collection_id.eq(collection_id)),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
+
+                Ok((ncollections, nsps, nrots, ncabooses, nerrors))
+            })
+            .await
+            .map_err(|error| match error {
+                TransactionError::CustomError(e) => e,
+                TransactionError::Database(e) => {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                }
+            })?;
+
+        info!(&opctx.log, "removed inventory collection";
+            "collection_id" => collection_id.to_string(),
+            "ncollections" => ncollections,
+            "nsps" => nsps,
+            "nrots" => nrots,
+            "ncabooses" => ncabooses,
+            "nerrors" => nerrors,
+        );
+
+        Ok(())
+    }
+}
+
+/// Extra interfaces that are not intended (and potentially unsafe) for use in
+/// Nexus, but useful for testing and `omdb`
+pub trait DataStoreInventoryTest: Send + Sync {
+    /// List all collections
+    ///
+    /// This does not paginate.
+    fn inventory_collections(
+        &self,
+    ) -> BoxFuture<'_, anyhow::Result<Vec<Uuid>>>;
+
+    /// Make a best effort to read the given collection while limiting queries
+    /// to `limit` results.  Returns as much as it was able to get.  The
+    /// returned bool indicates whether the returned collection might be
+    /// incomplete because the limit was reached.
+    fn inventory_collection_read_best_effort(
+        &self,
+        id: Uuid,
+        limit: NonZeroU32,
+    ) -> BoxFuture<'_, anyhow::Result<(Collection, bool)>>;
+
+    /// Attempt to read the given collection while limiting queries to `limit`
+    /// records
+    fn inventory_collection_read_all_or_nothing(
+        &self,
+        id: Uuid,
+        limit: NonZeroU32,
+    ) -> BoxFuture<'_, anyhow::Result<Collection>> {
+        async move {
+            let (collection, limit_reached) =
+                self.inventory_collection_read_best_effort(id, limit).await?;
+            anyhow::ensure!(
+                !limit_reached,
+                "hit limit of {} records while loading collection",
+                limit
+            );
+            Ok(collection)
+        }
+        .boxed()
+    }
+}
+
+impl DataStoreInventoryTest for DataStore {
+    fn inventory_collections(
+        &self,
+    ) -> BoxFuture<'_, anyhow::Result<Vec<Uuid>>> {
+        async {
+            let conn = self
+                .pool_connection_for_tests()
+                .await
+                .context("getting connection")?;
+            conn.transaction_async(|conn| async move {
+                conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL)
+                    .await
+                    .context("failed to allow table scan")?;
+
+                use db::schema::inv_collection::dsl;
+                dsl::inv_collection
+                    .select(dsl::id)
+                    .order_by(dsl::time_started)
+                    .load_async(&conn)
+                    .await
+                    .context("failed to list collections")
+            })
+            .await
+        }
+        .boxed()
+    }
+
+    // This function could move into the datastore if it proves helpful.  We'd
+    // need to work out how to report the usual type of Error.  For now we
+    // don't need it so we limit its scope to the test suite.
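+    // (Aside on the signatures in this trait: `async fn` is not available in
+    // traits on this toolchain, so each method hands back a `BoxFuture`
+    // instead.  The pattern, sketched:
+    //
+    //     fn example(&self) -> BoxFuture<'_, anyhow::Result<Uuid>> {
+    //         async move { /* ... */ }.boxed()
+    //     }
+    //
+    // `inventory_collections()` above is a real instance of it.)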
+ fn inventory_collection_read_best_effort( + &self, + id: Uuid, + limit: NonZeroU32, + ) -> BoxFuture> { + async move { + let conn = &self + .pool_connection_for_tests() + .await + .context("getting connection")?; + let sql_limit = i64::from(u32::from(limit)); + let usize_limit = usize::try_from(u32::from(limit)).unwrap(); + let mut limit_reached = false; + let (time_started, time_done, collector) = { + use db::schema::inv_collection::dsl; + + let collections = dsl::inv_collection + .filter(dsl::id.eq(id)) + .limit(2) + .select(InvCollection::as_select()) + .load_async(&**conn) + .await + .context("loading collection")?; + anyhow::ensure!(collections.len() == 1); + let collection = collections.into_iter().next().unwrap(); + ( + collection.time_started, + collection.time_done, + collection.collector, + ) + }; + + let errors: Vec = { + use db::schema::inv_collection_error::dsl; + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(id)) + .order_by(dsl::idx) + .limit(sql_limit) + .select(InvCollectionError::as_select()) + .load_async(&**conn) + .await + .context("loading collection errors")? + .into_iter() + .map(|e| e.message) + .collect() + }; + limit_reached = limit_reached || errors.len() == usize_limit; + + let sps: BTreeMap<_, _> = { + use db::schema::inv_service_processor::dsl; + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvServiceProcessor::as_select()) + .load_async(&**conn) + .await + .context("loading service processors")? + .into_iter() + .map(|sp_row| { + let baseboard_id = sp_row.hw_baseboard_id; + ( + baseboard_id, + nexus_types::inventory::ServiceProcessor::from( + sp_row, + ), + ) + }) + .collect() + }; + limit_reached = limit_reached || sps.len() == usize_limit; + + let rots: BTreeMap<_, _> = { + use db::schema::inv_root_of_trust::dsl; + dsl::inv_root_of_trust + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvRootOfTrust::as_select()) + .load_async(&**conn) + .await + .context("loading roots of trust")? + .into_iter() + .map(|rot_row| { + let baseboard_id = rot_row.hw_baseboard_id; + ( + baseboard_id, + nexus_types::inventory::RotState::from(rot_row), + ) + }) + .collect() + }; + limit_reached = limit_reached || rots.len() == usize_limit; + + // Collect the unique baseboard ids referenced by SPs and RoTs. + let baseboard_id_ids: BTreeSet<_> = + sps.keys().chain(rots.keys()).cloned().collect(); + // Fetch the corresponding baseboard records. + let baseboards_by_id: BTreeMap<_, _> = { + use db::schema::hw_baseboard_id::dsl; + dsl::hw_baseboard_id + .filter(dsl::id.eq_any(baseboard_id_ids)) + .limit(sql_limit) + .select(HwBaseboardId::as_select()) + .load_async(&**conn) + .await + .context("loading baseboards")? + .into_iter() + .map(|bb| { + ( + bb.id, + Arc::new( + nexus_types::inventory::BaseboardId::from(bb), + ), + ) + }) + .collect() + }; + limit_reached = + limit_reached || baseboards_by_id.len() == usize_limit; + + // Having those, we can replace the keys in the maps above with + // references to the actual baseboard rather than the uuid. 
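+            // (Concretely: `sps` goes from a map keyed by `Uuid` to one keyed
+            // by `Arc<BaseboardId>`, and likewise for `rots`.  A lookup miss
+            // here would mean the baseboard query above dropped a row that we
+            // depend on, so it's treated as an error.)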
+ let sps = sps + .into_iter() + .map(|(id, sp)| { + baseboards_by_id + .get(&id) + .map(|bb| (bb.clone(), sp)) + .ok_or_else(|| { + anyhow!( + "missing baseboard that we should have fetched" + ) + }) + }) + .collect::, _>>()?; + let rots = + rots.into_iter() + .map(|(id, rot)| { + baseboards_by_id + .get(&id) + .map(|bb| (bb.clone(), rot)) + .ok_or_else(|| { + anyhow!("missing baseboard that we should have fetched") + }) + }) + .collect::, _>>()?; + + // Fetch records of cabooses found. + let inv_caboose_rows = { + use db::schema::inv_caboose::dsl; + dsl::inv_caboose + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading inv_cabooses")? + }; + limit_reached = + limit_reached || inv_caboose_rows.len() == usize_limit; + + // Collect the unique sw_caboose_ids for those cabooses. + let sw_caboose_ids: BTreeSet<_> = inv_caboose_rows + .iter() + .map(|inv_caboose| inv_caboose.sw_caboose_id) + .collect(); + // Fetch the corresponing records. + let cabooses_by_id: BTreeMap<_, _> = { + use db::schema::sw_caboose::dsl; + dsl::sw_caboose + .filter(dsl::id.eq_any(sw_caboose_ids)) + .limit(sql_limit) + .select(SwCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading sw_cabooses")? + .into_iter() + .map(|sw_caboose_row| { + ( + sw_caboose_row.id, + Arc::new(nexus_types::inventory::Caboose::from( + sw_caboose_row, + )), + ) + }) + .collect() + }; + limit_reached = + limit_reached || cabooses_by_id.len() == usize_limit; + + // Assemble the lists of cabooses found. + let mut cabooses_found = BTreeMap::new(); + for c in inv_caboose_rows { + let by_baseboard = cabooses_found + .entry(nexus_types::inventory::CabooseWhich::from(c.which)) + .or_insert_with(BTreeMap::new); + let Some(bb) = baseboards_by_id.get(&c.hw_baseboard_id) else { + bail!( + "unknown baseboard found in inv_caboose: {}", + c.hw_baseboard_id + ); + }; + let Some(sw_caboose) = cabooses_by_id.get(&c.sw_caboose_id) + else { + bail!( + "unknown caboose found in inv_caboose: {}", + c.sw_caboose_id + ); + }; + + let previous = by_baseboard.insert( + bb.clone(), + nexus_types::inventory::CabooseFound { + time_collected: c.time_collected, + source: c.source, + caboose: sw_caboose.clone(), + }, + ); + anyhow::ensure!( + previous.is_none(), + "duplicate caboose found: {:?} baseboard {:?}", + c.which, + c.hw_baseboard_id + ); + } + + Ok(( + Collection { + id, + errors, + time_started, + time_done, + collector, + baseboards: baseboards_by_id.values().cloned().collect(), + cabooses: cabooses_by_id.values().cloned().collect(), + sps, + rots, + cabooses_found, + }, + limit_reached, + )) + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use crate::db::datastore::datastore_test; + use crate::db::datastore::inventory::DataStoreInventoryTest; + use crate::db::datastore::DataStore; + use crate::db::datastore::DataStoreConnection; + use crate::db::schema; + use anyhow::Context; + use async_bb8_diesel::AsyncConnection; + use async_bb8_diesel::AsyncRunQueryDsl; + use async_bb8_diesel::AsyncSimpleConnection; + use diesel::QueryDsl; + use gateway_client::types::SpType; + use nexus_inventory::examples::representative; + use nexus_inventory::examples::Representative; + use nexus_test_utils::db::test_setup_database; + use nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL; + use nexus_types::inventory::CabooseWhich; + use nexus_types::inventory::Collection; + use omicron_test_utils::dev; + use std::num::NonZeroU32; + use uuid::Uuid; + + async fn 
read_collection( + datastore: &DataStore, + id: Uuid, + ) -> anyhow::Result { + let limit = NonZeroU32::new(1000).unwrap(); + datastore.inventory_collection_read_all_or_nothing(id, limit).await + } + + async fn count_baseboards_cabooses( + conn: &DataStoreConnection<'_>, + ) -> anyhow::Result<(usize, usize)> { + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + let bb_count = schema::hw_baseboard_id::dsl::hw_baseboard_id + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .context("failed to count baseboards")?; + let caboose_count = schema::sw_caboose::dsl::sw_caboose + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .context("failed to count cabooses")?; + let bb_count_usize = usize::try_from(bb_count) + .context("failed to convert baseboard count to usize")?; + let caboose_count_usize = usize::try_from(caboose_count) + .context("failed to convert caboose count to usize")?; + Ok((bb_count_usize, caboose_count_usize)) + }) + .await + } + + /// Tests inserting several collections, reading them back, and making sure + /// they look the same. + #[tokio::test] + async fn test_inventory_insert() { + // Setup + let logctx = dev::test_setup_log("inventory_insert"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create an empty collection and write it to the database. + let builder = nexus_inventory::CollectionBuilder::new("test"); + let collection1 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection1) + .await + .expect("failed to insert collection"); + + // Read it back. + let conn = datastore.pool_connection_for_tests().await.unwrap(); + let collection_read = read_collection(&datastore, collection1.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection1, collection_read); + + // There ought to be no baseboards or cabooses in the databases from + // that collection. + assert_eq!(collection1.baseboards.len(), 0); + assert_eq!(collection1.cabooses.len(), 0); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection1.baseboards.len(), nbaseboards); + assert_eq!(collection1.cabooses.len(), ncabooses); + + // Now insert a more complex collection, write it to the database, and + // read it back. + let Representative { builder, .. } = representative(); + let collection2 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection2) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection2.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection2, collection_read); + // Verify that we have exactly the set of cabooses and baseboards in the + // databases that came from this first non-empty collection. + assert_ne!(collection2.baseboards.len(), collection1.baseboards.len()); + assert_ne!(collection2.cabooses.len(), collection1.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection2.baseboards.len(), nbaseboards); + assert_eq!(collection2.cabooses.len(), ncabooses); + + // Now insert an equivalent collection again. Verify the distinct + // baseboards and cabooses again. This is important: the insertion + // process should re-use the baseboards and cabooses from the previous + // collection. + let Representative { builder, .. 
} = representative(); + let collection3 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection3) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection3.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection3, collection_read); + // Verify that we have the same number of cabooses and baseboards, since + // those didn't change. + assert_eq!(collection3.baseboards.len(), collection2.baseboards.len()); + assert_eq!(collection3.cabooses.len(), collection2.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection3.baseboards.len(), nbaseboards); + assert_eq!(collection3.cabooses.len(), ncabooses); + + // Now insert a collection that's almost equivalent, but has an extra + // couple of baseboards and caboose. Verify that we re-use the existing + // ones, but still insert the new ones. + let Representative { mut builder, .. } = representative(); + builder.found_sp_state( + "test suite", + SpType::Switch, + 1, + nexus_inventory::examples::sp_state("2"), + ); + let bb = builder + .found_sp_state( + "test suite", + SpType::Power, + 1, + nexus_inventory::examples::sp_state("3"), + ) + .unwrap(); + builder + .found_caboose( + &bb, + CabooseWhich::SpSlot0, + "dummy", + nexus_inventory::examples::caboose("dummy"), + ) + .unwrap(); + let collection4 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection4) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection4.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection4, collection_read); + // Verify the number of baseboards and collections again. + assert_eq!( + collection4.baseboards.len(), + collection3.baseboards.len() + 2 + ); + assert_eq!( + collection4.cabooses.len(), + collection3.baseboards.len() + 1 + ); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection4.baseboards.len(), nbaseboards); + assert_eq!(collection4.cabooses.len(), ncabooses); + + // This time, go back to our earlier collection. This logically removes + // some baseboards. They should still be present in the database, but + // not in the collection. + let Representative { builder, .. } = representative(); + let collection5 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection5) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection5.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection5, collection_read); + assert_eq!(collection5.baseboards.len(), collection3.baseboards.len()); + assert_eq!(collection5.cabooses.len(), collection3.cabooses.len()); + assert_ne!(collection5.baseboards.len(), collection4.baseboards.len()); + assert_ne!(collection5.cabooses.len(), collection4.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection4.baseboards.len(), nbaseboards); + assert_eq!(collection4.cabooses.len(), ncabooses); + + // Try to insert the same collection again and make sure it fails. 
+ let error = datastore + .inventory_insert_collection(&opctx, &collection5) + .await + .expect_err("unexpectedly succeeded in inserting collection"); + assert!(format!("{:#}", error) + .contains("duplicate key value violates unique constraint")); + + // Now that we've inserted a bunch of collections, we can test pruning. + // + // The datastore should start by pruning the oldest collection, unless + // it's the only collection with no errors. The oldest one is + // `collection1`, which _is_ the only one with no errors. So we should + // get back `collection2`. + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[ + collection1.id, + collection2.id, + collection3.id, + collection4.id, + collection5.id, + ] + ); + println!( + "all collections: {:?}\n", + &[ + collection1.id, + collection2.id, + collection3.id, + collection4.id, + collection5.id, + ] + ); + datastore + .inventory_prune_collections(&opctx, 4) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection3.id, collection4.id, collection5.id,] + ); + // Again, we should skip over collection1 and delete the next oldest: + // collection3. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id,] + ); + // At this point, if we're keeping 3, we don't need to prune anything. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id,] + ); + + // If we then insert an empty collection (which has no errors), + // collection1 becomes pruneable. + let builder = nexus_inventory::CollectionBuilder::new("test"); + let collection6 = builder.build(); + println!( + "collection 6: {} ({:?})", + collection6.id, collection6.time_started + ); + datastore + .inventory_insert_collection(&opctx, &collection6) + .await + .expect("failed to insert collection"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id, collection6.id,] + ); + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection4.id, collection5.id, collection6.id,] + ); + // Again, at this point, we should not prune anything. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection4.id, collection5.id, collection6.id,] + ); + + // If we insert another collection with errors, then prune, we should + // end up pruning collection 4. + let Representative { builder, .. } = representative(); + let collection7 = builder.build(); + println!( + "collection 7: {} ({:?})", + collection7.id, collection7.time_started + ); + datastore + .inventory_insert_collection(&opctx, &collection7) + .await + .expect("failed to insert collection"); + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection5.id, collection6.id, collection7.id,] + ); + + // If we try to fetch a pruned collection, we should get nothing. 
+ let _ = read_collection(&datastore, collection4.id) + .await + .expect_err("unexpectedly read pruned collection"); + + // But we should still be able to fetch the collections that do exist. + let collection_read = + read_collection(&datastore, collection5.id).await.unwrap(); + assert_eq!(collection5, collection_read); + let collection_read = + read_collection(&datastore, collection6.id).await.unwrap(); + assert_eq!(collection6, collection_read); + let collection_read = + read_collection(&datastore, collection7.id).await.unwrap(); + assert_eq!(collection7, collection_read); + + // We should prune more than one collection, if needed. We'll wind up + // with just collection6 because that's the latest one with no errors. + datastore + .inventory_prune_collections(&opctx, 1) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection6.id,] + ); + + // Remove the remaining collection and make sure the inventory tables + // are empty (i.e., we got everything). + datastore + .inventory_delete_collection(&opctx, collection6.id) + .await + .expect("failed to delete collection"); + assert_eq!(datastore.inventory_collections().await.unwrap(), &[]); + + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + let count = schema::inv_collection::dsl::inv_collection + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_collection_error::dsl::inv_collection_error + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = + schema::inv_service_processor::dsl::inv_service_processor + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_root_of_trust::dsl::inv_root_of_trust + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_caboose::dsl::inv_caboose + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + Ok::<(), anyhow::Error>(()) + }) + .await + .expect("failed to check that tables were empty"); + + // We currently keep the baseboard ids and sw_cabooses around. + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_ne!(nbaseboards, 0); + assert_ne!(ncabooses, 0); + + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 2dc1e69a6f..91373f6875 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -61,6 +61,7 @@ mod external_ip; mod identity_provider; mod image; mod instance; +mod inventory; mod ip_pool; mod network_interface; mod oximeter; @@ -96,6 +97,7 @@ pub use db_metadata::{ }; pub use dns::DnsVersionUpdateBuilder; pub use instance::InstanceAndActiveVmm; +pub use inventory::DataStoreInventoryTest; pub use rack::RackInit; pub use silo::Discoverability; pub use switch_port::SwitchPortSettingsCombinedResult; @@ -138,6 +140,9 @@ impl RunnableQuery for T where { } +pub type DataStoreConnection<'a> = + bb8::PooledConnection<'a, ConnectionManager>; + pub struct DataStore { pool: Arc, virtual_provisioning_collection_producer: crate::provisioning::Producer, @@ -205,21 +210,13 @@ impl DataStore { .unwrap(); } - async fn pool_authorized( - &self, - opctx: &OpContext, - ) -> Result<&bb8::Pool>, Error> { - opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; - Ok(self.pool.pool()) - } - /// Returns a connection to a connection from the database connection pool. pub(super) async fn pool_connection_authorized( &self, opctx: &OpContext, - ) -> Result>, Error> - { - let pool = self.pool_authorized(opctx).await?; + ) -> Result { + opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; + let pool = self.pool.pool(); let connection = pool.get().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; @@ -233,8 +230,7 @@ impl DataStore { /// "pool_connection_authorized". pub(super) async fn pool_connection_unauthorized( &self, - ) -> Result>, Error> - { + ) -> Result { let connection = self.pool.pool().get().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; @@ -245,8 +241,7 @@ impl DataStore { #[doc(hidden)] pub async fn pool_connection_for_tests( &self, - ) -> Result>, Error> - { + ) -> Result { self.pool_connection_unauthorized().await } diff --git a/nexus/db-queries/src/db/pool.rs b/nexus/db-queries/src/db/pool.rs index 73c95f4e91..249852d832 100644 --- a/nexus/db-queries/src/db/pool.rs +++ b/nexus/db-queries/src/db/pool.rs @@ -45,6 +45,8 @@ pub struct Pool { impl Pool { pub fn new(log: &slog::Logger, db_config: &DbConfig) -> Self { + // Make sure diesel-dtrace's USDT probes are enabled. + usdt::register_probes().expect("Failed to register USDT DTrace probes"); Self::new_builder(log, db_config, bb8::Builder::new()) } diff --git a/nexus/db-queries/tests/output/authz-roles.out b/nexus/db-queries/tests/output/authz-roles.out index 72031c567e..963f00f7e8 100644 --- a/nexus/db-queries/tests/output/authz-roles.out +++ b/nexus/db-queries/tests/output/authz-roles.out @@ -68,6 +68,20 @@ resource: authz::DeviceAuthRequestList silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ unauthenticated ! ! ! ! ! ! ! ! +resource: authz::Inventory + + USER Q R LC RP M MP CC D + fleet-admin ✘ ✔ ✘ ✔ ✔ ✔ ✘ ✔ + fleet-collaborator ✘ ✔ ✘ ✔ ✘ ✘ ✘ ✘ + fleet-viewer ✘ ✔ ✘ ✔ ✘ ✘ ✘ ✘ + silo1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + unauthenticated ! ! ! ! ! ! ! ! 
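(For orientation: the `authz::Inventory` matrix above means fleet-level users can read inventory, while only `fleet-admin` can modify it.  In datastore code that distinction shows up as explicit authorize calls.  A minimal sketch follows; `check_inventory_access` is a hypothetical helper, but the `Modify` check is exactly what `inventory_prune_collections` performs, and the paths match how this file refers to `authz` and `OpContext`:)

```rust
// Hedged sketch (not the literal datastore code): how the matrix above
// translates into authz checks inside DataStore methods.
async fn check_inventory_access(opctx: &OpContext) -> Result<(), Error> {
    // Read access: permitted for fleet-viewer and above.
    opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?;
    // Modify access (e.g., pruning collections): fleet-admin only.
    opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?;
    Ok(())
}
```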
+ resource: authz::IpPoolList USER Q R LC RP M MP CC D diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 1a9afbc6bd..efc9aa9c27 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -92,6 +92,13 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 5 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml new file mode 100644 index 0000000000..965ff3f02a --- /dev/null +++ b/nexus/inventory/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "nexus-inventory" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-messages.workspace = true +nexus-types.workspace = true +slog.workspace = true +strum.workspace = true +uuid.workspace = true +omicron-workspace-hack.workspace = true + +[dev-dependencies] +expectorate.workspace = true +gateway-test-utils.workspace = true +regex.workspace = true +tokio.workspace = true diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs new file mode 100644 index 0000000000..ad008ee4df --- /dev/null +++ b/nexus/inventory/src/builder.rs @@ -0,0 +1,786 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interface for building inventory [`Collection`] dynamically +//! +//! This separates the concerns of _collection_ (literally just fetching data +//! from sources like MGS) from assembling a representation of what was +//! collected. + +use anyhow::anyhow; +use chrono::DateTime; +use chrono::Utc; +use gateway_client::types::SpComponentCaboose; +use gateway_client::types::SpState; +use gateway_client::types::SpType; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::Caboose; +use nexus_types::inventory::CabooseFound; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; +use nexus_types::inventory::RotState; +use nexus_types::inventory::ServiceProcessor; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use uuid::Uuid; + +/// Build an inventory [`Collection`] +/// +/// This interface is oriented around the interfaces used by an actual +/// collector. Where possible, it accepts types directly provided by the data +/// sources (e.g., `gateway_client`). +#[derive(Debug)] +pub struct CollectionBuilder { + // For field documentation, see the corresponding fields in `Collection`. 
+ errors: Vec, + time_started: DateTime, + collector: String, + baseboards: BTreeSet>, + cabooses: BTreeSet>, + sps: BTreeMap, ServiceProcessor>, + rots: BTreeMap, RotState>, + cabooses_found: + BTreeMap, CabooseFound>>, +} + +impl CollectionBuilder { + /// Start building a new `Collection` + /// + /// `collector` is an arbitrary string describing the agent that collected + /// this data. It's generally a Nexus instance uuid but it can be anything. + /// It's just for debugging. + pub fn new(collector: &str) -> Self { + CollectionBuilder { + errors: vec![], + time_started: now(), + collector: collector.to_owned(), + baseboards: BTreeSet::new(), + cabooses: BTreeSet::new(), + sps: BTreeMap::new(), + rots: BTreeMap::new(), + cabooses_found: BTreeMap::new(), + } + } + + /// Assemble a complete `Collection` representation + pub fn build(self) -> Collection { + Collection { + id: Uuid::new_v4(), + errors: self + .errors + .into_iter() + .map(|e| format!("{:#}", e)) + .collect(), + time_started: self.time_started, + time_done: now(), + collector: self.collector, + baseboards: self.baseboards, + cabooses: self.cabooses, + sps: self.sps, + rots: self.rots, + cabooses_found: self.cabooses_found, + } + } + + /// Record service processor state `sp_state` reported by MGS + /// + /// `sp_type` and `slot` identify which SP this was. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_sp_state( + &mut self, + source: &str, + sp_type: SpType, + slot: u32, + sp_state: SpState, + ) -> Option> { + // Much ado about very little: MGS reports that "slot" is a u32, though + // in practice this seems very unlikely to be bigger than a u8. (How + // many slots can there be within one rack?) The database only supports + // signed integers, so if we assumed this really could span the range of + // a u32, we'd need to store it in an i64. Instead, assume here that we + // can stick it into a u16 (which still seems generous). This will + // allow us to store it into an Int32 in the database. + let Ok(sp_slot) = u16::try_from(slot) else { + self.found_error(anyhow!( + "MGS {:?}: SP {:?} slot {}: slot number did not fit into u16", + source, + sp_type, + slot + )); + return None; + }; + + // Normalize the baseboard id: i.e., if we've seen this baseboard + // before, use the same baseboard id record. Otherwise, make a new one. + let baseboard = Self::normalize_item( + &mut self.baseboards, + BaseboardId { + serial_number: sp_state.serial_number, + part_number: sp_state.model, + }, + ); + + // Separate the SP state into the SP-specific state and the RoT state, + // if any. 
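+        // (Note that both `entry(...).or_insert_with(...)` calls below keep
+        // the *first* report for a given baseboard: if a second MGS reports
+        // the same SP again, even with different contents, the original
+        // record wins.  `test_problems` below re-reports the same SP with
+        // both identical and different contents to exercise this.)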
+ let now = now(); + let _ = self.sps.entry(baseboard.clone()).or_insert_with(|| { + ServiceProcessor { + time_collected: now, + source: source.to_owned(), + + sp_type, + sp_slot, + + baseboard_revision: sp_state.revision, + hubris_archive: sp_state.hubris_archive_id, + power_state: sp_state.power_state, + } + }); + + match sp_state.rot { + gateway_client::types::RotState::Enabled { + active, + pending_persistent_boot_preference, + persistent_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + transient_boot_preference, + } => { + let _ = + self.rots.entry(baseboard.clone()).or_insert_with(|| { + RotState { + time_collected: now, + source: source.to_owned(), + active_slot: active, + persistent_boot_preference, + pending_persistent_boot_preference, + transient_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + } + }); + } + gateway_client::types::RotState::CommunicationFailed { + message, + } => { + self.found_error(anyhow!( + "MGS {:?}: reading RoT state for {:?}: {}", + source, + baseboard, + message + )); + } + } + + Some(baseboard) + } + + /// Returns true if we already found the caboose for `which` for baseboard + /// `baseboard` + /// + /// This is used to avoid requesting it multiple times (from multiple MGS + /// instances). + pub fn found_caboose_already( + &self, + baseboard: &BaseboardId, + which: CabooseWhich, + ) -> bool { + self.cabooses_found + .get(&which) + .map(|map| map.contains_key(baseboard)) + .unwrap_or(false) + } + + /// Record the given caboose information found for the given baseboard + /// + /// The baseboard must previously have been reported using + /// `found_sp_state()`. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_caboose( + &mut self, + baseboard: &BaseboardId, + which: CabooseWhich, + source: &str, + caboose: SpComponentCaboose, + ) -> Result<(), anyhow::Error> { + // Normalize the caboose contents: i.e., if we've seen this exact + // caboose contents before, use the same record from before. Otherwise, + // make a new one. + let sw_caboose = + Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); + let (baseboard, _) = + self.sps.get_key_value(baseboard).ok_or_else(|| { + anyhow!( + "reporting caboose for unknown baseboard: {:?} ({:?})", + baseboard, + sw_caboose + ) + })?; + let by_id = + self.cabooses_found.entry(which).or_insert_with(|| BTreeMap::new()); + if let Some(previous) = by_id.insert( + baseboard.clone(), + CabooseFound { + time_collected: now(), + source: source.to_owned(), + caboose: sw_caboose.clone(), + }, + ) { + let error = if *previous.caboose == *sw_caboose { + anyhow!("reported multiple times (same value)",) + } else { + anyhow!( + "reported caboose multiple times (previously {:?}, \ + now {:?})", + previous, + sw_caboose + ) + }; + Err(error.context(format!( + "baseboard {:?} caboose {:?}", + baseboard, which + ))) + } else { + Ok(()) + } + } + + /// Helper function for normalizing items + /// + /// If `item` (or its equivalent) is not already in `items`, insert it. + /// Either way, return the item from `items`. (This will either be `item` + /// itself or whatever was already in `items`.) 
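+    ///
+    /// A hedged usage sketch (the real callers use `BaseboardId` and
+    /// `Caboose` for `T`):
+    ///
+    /// ```ignore
+    /// let mut items: BTreeSet<Arc<String>> = BTreeSet::new();
+    /// let first = Self::normalize_item(&mut items, String::from("x"));
+    /// let second = Self::normalize_item(&mut items, String::from("x"));
+    /// // The second call returns a clone of the first Arc; nothing new
+    /// // is inserted.
+    /// assert!(Arc::ptr_eq(&first, &second));
+    /// assert_eq!(items.len(), 1);
+    /// ```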
+ fn normalize_item( + items: &mut BTreeSet>, + item: T, + ) -> Arc { + match items.get(&item) { + Some(found_item) => found_item.clone(), + None => { + let new_item = Arc::new(item); + items.insert(new_item.clone()); + new_item + } + } + } + + /// Record a collection error + /// + /// This is used for operational errors encountered during the collection + /// process (e.g., a down MGS instance). It's not intended for mis-uses of + /// this API, which are conveyed instead through returned errors (and should + /// probably cause the caller to stop collection altogether). + pub fn found_error(&mut self, error: anyhow::Error) { + self.errors.push(error); + } +} + +/// Returns the current time, truncated to the previous microsecond. +/// +/// This exists because the database doesn't store nanosecond-precision, so if +/// we store nanosecond-precision timestamps, then DateTime conversion is lossy +/// when round-tripping through the database. That's rather inconvenient. +fn now() -> DateTime { + let ts = Utc::now(); + let nanosecs = ts.timestamp_subsec_nanos(); + let micros = ts.timestamp_subsec_micros(); + let only_nanos = nanosecs - micros * 1000; + ts - std::time::Duration::from_nanos(u64::from(only_nanos)) +} + +#[cfg(test)] +mod test { + use super::now; + use super::CollectionBuilder; + use crate::examples::representative; + use crate::examples::sp_state; + use crate::examples::Representative; + use gateway_client::types::PowerState; + use gateway_client::types::RotSlot; + use gateway_client::types::RotState; + use gateway_client::types::SpComponentCaboose; + use gateway_client::types::SpState; + use gateway_client::types::SpType; + use nexus_types::inventory::BaseboardId; + use nexus_types::inventory::Caboose; + use nexus_types::inventory::CabooseWhich; + + // Verify the contents of an empty collection. + #[test] + fn test_empty() { + let time_before = now(); + let builder = CollectionBuilder::new("test_empty"); + let collection = builder.build(); + let time_after = now(); + + assert!(collection.errors.is_empty()); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "test_empty"); + assert!(collection.baseboards.is_empty()); + assert!(collection.cabooses.is_empty()); + assert!(collection.sps.is_empty()); + assert!(collection.rots.is_empty()); + assert!(collection.cabooses_found.is_empty()); + } + + // Simple test of a single, fairly typical collection that contains just + // about all kinds of valid data. That includes exercising: + // + // - all three baseboard types (switch, sled, PSC) + // - various valid values for all fields (sources, slot numbers, power + // states, baseboard revisions, cabooses, etc.) + // - some empty slots + // - some missing cabooses + // - some cabooses common to multiple baseboards; others not + // - serial number reused across different model numbers + // + // This test is admittedly pretty tedious and maybe not worthwhile but it's + // a useful quick check. 
+ #[test] + fn test_basic() { + let time_before = now(); + let Representative { + builder, + sleds: [sled1_bb, sled2_bb, sled3_bb], + switch, + psc, + } = representative(); + let collection = builder.build(); + let time_after = now(); + println!("{:#?}", collection); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "example"); + + // Verify the one error that ought to have been produced for the SP with + // no RoT information. + assert_eq!( + collection.errors.iter().map(|e| e.to_string()).collect::>(), + ["MGS \"fake MGS 1\": reading RoT state for BaseboardId \ + { part_number: \"model1\", serial_number: \"s2\" }: test suite \ + injected error"] + ); + + // Verify the baseboard ids found. + let expected_baseboards = + &[&sled1_bb, &sled2_bb, &sled3_bb, &switch, &psc]; + for bb in expected_baseboards { + assert!(collection.baseboards.contains(*bb)); + } + assert_eq!(collection.baseboards.len(), expected_baseboards.len()); + + // Verify the stuff that's easy to verify for all SPs: timestamps. + assert_eq!(collection.sps.len(), collection.baseboards.len()); + for (bb, sp) in collection.sps.iter() { + assert!(collection.time_started <= sp.time_collected); + assert!(sp.time_collected <= collection.time_done); + + if let Some(rot) = collection.rots.get(bb) { + assert_eq!(rot.source, sp.source); + assert_eq!(rot.time_collected, sp.time_collected); + } + + for which in [CabooseWhich::SpSlot0, CabooseWhich::SpSlot1] { + let caboose = collection.caboose_for(which, bb); + if let Some(c) = caboose { + assert!(collection.time_started <= c.time_collected); + assert!(c.time_collected <= collection.time_done); + assert!(collection.cabooses.contains(&c.caboose)); + } + } + } + + // Verify the common caboose. + let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch]; + let common_caboose = Caboose { + board: String::from("board_1"), + git_commit: String::from("git_commit_1"), + name: String::from("name_1"), + version: String::from("version_1"), + }; + for bb in &common_caboose_baseboards { + let _ = collection.sps.get(*bb).unwrap(); + let c0 = collection.caboose_for(CabooseWhich::SpSlot0, bb).unwrap(); + let c1 = collection.caboose_for(CabooseWhich::SpSlot1, bb).unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + + let _ = collection.rots.get(*bb).unwrap(); + let c0 = + collection.caboose_for(CabooseWhich::RotSlotA, bb).unwrap(); + let c1 = + collection.caboose_for(CabooseWhich::RotSlotB, bb).unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + } + assert!(collection.cabooses.contains(&common_caboose)); + + // Verify the specific, different data for the healthy SPs and RoTs that + // we reported. 
+ // sled1 + let sp = collection.sps.get(&sled1_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 3); + assert_eq!(sp.baseboard_revision, 0); + assert_eq!(sp.hubris_archive, "hubris1"); + assert_eq!(sp.power_state, PowerState::A0); + let rot = collection.rots.get(&sled1_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::A); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest1" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest1" + ); + assert_eq!(rot.transient_boot_preference, None); + + // sled2 + let sp = collection.sps.get(&sled2_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 4); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris2"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&sled2_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, Some(RotSlot::A)); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest2" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest2" + ); + assert_eq!(rot.transient_boot_preference, Some(RotSlot::B)); + + // switch + let sp = collection.sps.get(&switch).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Switch); + assert_eq!(sp.sp_slot, 0); + assert_eq!(sp.baseboard_revision, 2); + assert_eq!(sp.hubris_archive, "hubris3"); + assert_eq!(sp.power_state, PowerState::A1); + let rot = collection.rots.get(&switch).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest3" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest3" + ); + assert_eq!(rot.transient_boot_preference, None); + + // PSC + let sp = collection.sps.get(&psc).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Power); + assert_eq!(sp.sp_slot, 1); + assert_eq!(sp.baseboard_revision, 3); + assert_eq!(sp.hubris_archive, "hubris4"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&psc).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest4" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest4" + ); + assert_eq!(rot.transient_boot_preference, None); + + // The PSC has four different cabooses! 
+ let c = &collection + .caboose_for(CabooseWhich::SpSlot0, &psc) + .unwrap() + .caboose; + assert_eq!(c.board, "board_psc_sp_0"); + assert!(collection.cabooses.contains(c)); + let c = &collection + .caboose_for(CabooseWhich::SpSlot1, &psc) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "board_psc_sp_1"); + let c = &collection + .caboose_for(CabooseWhich::RotSlotA, &psc) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "board_psc_rot_a"); + let c = &collection + .caboose_for(CabooseWhich::RotSlotB, &psc) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "board_psc_rot_b"); + + // Verify the reported SP state for sled3, which did not have a healthy + // RoT, nor any cabooses. + let sp = collection.sps.get(&sled3_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 5); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris5"); + assert_eq!(sp.power_state, PowerState::A2); + assert!(collection + .caboose_for(CabooseWhich::SpSlot0, &sled3_bb) + .is_none()); + assert!(collection + .caboose_for(CabooseWhich::SpSlot1, &sled3_bb) + .is_none()); + assert!(!collection.rots.contains_key(&sled3_bb)); + + // There shouldn't be any other RoTs. + assert_eq!(collection.sps.len(), collection.rots.len() + 1); + + // There should be five cabooses: the four used for the PSC (see above), + // plus the common one. + assert_eq!(collection.cabooses.len(), 5); + } + + // Exercises all the failure cases that shouldn't happen in real systems. + // Despite all of these failures, we should get a valid collection at the + // end. + #[test] + fn test_problems() { + let mut builder = CollectionBuilder::new("test_problems"); + + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // report the same SP again with the same contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report the same SP again with different contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: 
None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report an SP with an impossible slot number + let sled2_sp = builder.found_sp_state( + "fake MGS 1", + SpType::Sled, + u32::from(u16::MAX) + 1, + sp_state("1"), + ); + assert_eq!(sled2_sp, None); + + // report SP caboose for an unknown baseboard + let bogus_baseboard = BaseboardId { + part_number: String::from("p1"), + serial_number: String::from("bogus"), + }; + let caboose1 = SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }; + assert!(!builder + .found_caboose_already(&bogus_baseboard, CabooseWhich::SpSlot0)); + let error = builder + .found_caboose( + &bogus_baseboard, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + error.to_string(), + "reporting caboose for unknown baseboard: \ + BaseboardId { part_number: \"p1\", serial_number: \"bogus\" } \ + (Caboose { board: \"board1\", git_commit: \"git_commit1\", \ + name: \"name1\", version: \"version1\" })" + ); + assert!(!builder + .found_caboose_already(&bogus_baseboard, CabooseWhich::SpSlot0)); + + // report RoT caboose for an unknown baseboard + let error2 = builder + .found_caboose( + &bogus_baseboard, + CabooseWhich::RotSlotA, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!(error.to_string(), error2.to_string(),); + + // report the same caboose twice with the same contents + let _ = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap(); + let error = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + format!("{:#}", error), + "baseboard BaseboardId { part_number: \"model1\", \ + serial_number: \"s1\" } caboose SpSlot0: reported multiple \ + times (same value)" + ); + // report the same caboose again with different contents + let error = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + SpComponentCaboose { + board: String::from("board2"), + git_commit: String::from("git_commit2"), + name: String::from("name2"), + version: String::from("version2"), + }, + ) + .unwrap_err(); + let message = format!("{:#}", error); + println!("found error: {}", message); + assert!(message.contains( + "caboose SpSlot0: reported caboose multiple times (previously" + )); + assert!(message.contains(", now ")); + + // We should still get a valid collection. + let collection = builder.build(); + println!("{:#?}", collection); + assert_eq!(collection.collector, "test_problems"); + + // We should still have the one sled and its SP slot0 caboose. + assert!(collection.baseboards.contains(&sled1_bb)); + let _ = collection.sps.get(&sled1_bb).unwrap(); + let caboose = + collection.caboose_for(CabooseWhich::SpSlot0, &sled1_bb).unwrap(); + assert_eq!(caboose.caboose.board, "board2"); + assert!(collection.cabooses.contains(&caboose.caboose)); + assert!(collection + .caboose_for(CabooseWhich::SpSlot1, &sled1_bb) + .is_none()); + let _ = collection.rots.get(&sled1_bb).unwrap(); + assert!(collection + .caboose_for(CabooseWhich::RotSlotA, &sled1_bb) + .is_none()); + assert!(collection + .caboose_for(CabooseWhich::RotSlotB, &sled1_bb) + .is_none()); + + // We should see an error. 
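+ // It should be the one recorded for the out-of-range slot number
+ // reported above.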
+ assert_eq!(
+ collection
+ .errors
+ .iter()
+ .map(|e| format!("{:#}", e))
+ .collect::<Vec<_>>(),
+ vec![
+ "MGS \"fake MGS 1\": SP Sled slot 65536: \
+ slot number did not fit into u16"
+ ]
+ );
+ }
+}
diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs
new file mode 100644
index 0000000000..d40b09d2be
--- /dev/null
+++ b/nexus/inventory/src/collector.rs
@@ -0,0 +1,389 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Collection of inventory from Omicron components
+
+use crate::builder::CollectionBuilder;
+use anyhow::Context;
+use nexus_types::inventory::CabooseWhich;
+use nexus_types::inventory::Collection;
+use slog::{debug, error};
+use std::sync::Arc;
+use strum::IntoEnumIterator;
+
+pub struct Collector {
+ log: slog::Logger,
+ mgs_clients: Vec<Arc<gateway_client::Client>>,
+ in_progress: CollectionBuilder,
+}
+
+impl Collector {
+ pub fn new(
+ creator: &str,
+ mgs_clients: &[Arc<gateway_client::Client>],
+ log: slog::Logger,
+ ) -> Self {
+ Collector {
+ log,
+ mgs_clients: mgs_clients.to_vec(),
+ in_progress: CollectionBuilder::new(creator),
+ }
+ }
+
+ /// Begin the process of collecting a complete hardware/software inventory
+ /// of the rack
+ ///
+ /// The collection process makes a bunch of requests to a bunch of
+ /// components.  This can take a while and produce any number of errors.
+ /// Such errors generally don't cause this function to fail.  Rather, the
+ /// returned `Collection` keeps track of these errors.
+ pub async fn collect_all(mut self) -> Result<Collection, anyhow::Error> {
+ // We're about to do a bunch of asynchronous operations.  With a
+ // combination of async, futures, and some cleverness, we could do much
+ // of this in parallel.  But this code path is not remotely
+ // latency-sensitive.  And there's real risk of overloading our
+ // downstream services.  So we just do one step at a time.  This also
+ // keeps the code simpler.
+
+ debug!(&self.log, "begin collection");
+
+ // When we add stages to collect from other components (e.g., sled
+ // agents), those will go here.
+ self.collect_all_mgs().await;
+
+ debug!(&self.log, "finished collection");
+
+ Ok(self.in_progress.build())
+ }
+
+ /// Collect inventory from all MGS instances
+ async fn collect_all_mgs(&mut self) {
+ let clients = self.mgs_clients.clone();
+ for client in &clients {
+ self.collect_one_mgs(&client).await;
+ }
+ }
+
+ async fn collect_one_mgs(&mut self, client: &gateway_client::Client) {
+ debug!(&self.log, "begin collection from MGS";
+ "mgs_url" => client.baseurl()
+ );
+
+ // First, see which SPs MGS can see via Ignition.
+ let ignition_result = client.ignition_list().await.with_context(|| {
+ format!("MGS {:?}: listing ignition targets", client.baseurl())
+ });
+
+ // Select only the SPs that appear powered on.
+ //
+ // This choice is debatable.  It's conceivable that an SP could be
+ // functioning but not visible to ignition.  In that case, we'd be
+ // better off trying to ask MGS about it even though ignition reports it
+ // powered off.  But in practice, if ignition can't see it, it's much
+ // more likely that there's just nothing plugged in.  And in that case,
+ // if we try to ask MGS about it, we have to wait for MGS to time out
+ // its attempt to reach it (currently several seconds).  This choice
+ // enables inventory to complete much faster, at the expense of not
+ // being able to identify this particular condition.
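+ //
+ // To sketch the shape of the data (illustrative, not exhaustive): each
+ // entry in the ignition list pairs an `id` (an SP type and slot) with
+ // `details`, and the match below keeps the `id` only when the details
+ // are `SpIgnition::Yes { power: true, .. }`.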
+ let sps = match ignition_result { + Err(error) => { + self.in_progress.found_error(error); + return; + } + + Ok(targets) => { + targets.into_inner().into_iter().filter_map(|sp_ignition| { + match sp_ignition.details { + gateway_client::types::SpIgnition::No => None, + gateway_client::types::SpIgnition::Yes { + power: false, + .. + } => None, + gateway_client::types::SpIgnition::Yes { + power: true, + .. + } => Some(sp_ignition.id), + } + }) + } + }; + + // For each SP that ignition reports up, fetch the state and caboose + // information. + for sp in sps { + // First, fetch the state of the SP. If that fails, report the + // error but continue. + let result = + client.sp_get(sp.type_, sp.slot).await.with_context(|| { + format!( + "MGS {:?}: fetching state of SP {:?}", + client.baseurl(), + sp + ) + }); + let sp_state = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + + // Record the state that we found. + let Some(baseboard_id) = self.in_progress.found_sp_state( + client.baseurl(), + sp.type_, + sp.slot, + sp_state, + ) else { + // We failed to parse this SP for some reason. The error was + // reported already. Move on. + continue; + }; + + // For each kind of caboose that we care about, if it hasn't been + // fetched already, fetch it and record it. Generally, we'd only + // get here for the first MGS client. Assuming that one succeeds, + // the other(s) will skip this loop. + for which in CabooseWhich::iter() { + if self.in_progress.found_caboose_already(&baseboard_id, which) + { + continue; + } + + let (component, slot) = match which { + CabooseWhich::SpSlot0 => ("sp", 0), + CabooseWhich::SpSlot1 => ("sp", 1), + CabooseWhich::RotSlotA => ("rot", 0), + CabooseWhich::RotSlotB => ("rot", 1), + }; + + let result = client + .sp_component_caboose_get( + sp.type_, sp.slot, component, slot, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {:?}: caboose {:?}", + client.baseurl(), + sp, + which + ) + }); + let caboose = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + if let Err(error) = self.in_progress.found_caboose( + &baseboard_id, + which, + client.baseurl(), + caboose, + ) { + error!( + &self.log, + "error reporting caboose: {:?} {:?} {:?}: {:#}", + baseboard_id, + which, + client.baseurl(), + error + ); + } + } + } + } +} + +#[cfg(test)] +mod test { + use super::Collector; + use gateway_messages::SpPort; + use nexus_types::inventory::Collection; + use std::fmt::Write; + use std::sync::Arc; + + fn dump_collection(collection: &Collection) -> String { + // Construct a stable, human-readable summary of the Collection + // contents. We could use a `Debug` impl for this, but that's not quite + // right: when debugging, for example, we want fields like the ids, but + // these change each time and we don't want to include them here. + // `Serialize` has the same problem -- the set of fields to include + // depends on what the serialization is for. It's easy enough to just + // print what we want here. 
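+ //
+ // Note that this stable format is also what the expectorate golden
+ // files under tests/output/ (e.g., collector_basic.txt) contain.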
+ let mut s = String::new(); + write!(&mut s, "baseboards:\n").unwrap(); + for b in &collection.baseboards { + write!( + &mut s, + " part {:?} serial {:?}\n", + b.part_number, b.serial_number + ) + .unwrap(); + } + + write!(&mut s, "\ncabooses:\n").unwrap(); + for c in &collection.cabooses { + write!( + &mut s, + " board {:?} name {:?} version {:?} git_commit {:?}\n", + c.board, c.name, c.version, c.git_commit, + ) + .unwrap(); + } + + // All we really need to check here is that we're reporting the right + // SPs, RoTs, and cabooses. The actual SP data, RoT data, and caboose + // data comes straight from MGS. And proper handling of that data is + // tested in the builder. + write!(&mut s, "\nSPs:\n").unwrap(); + for (bb, _) in &collection.sps { + write!( + &mut s, + " baseboard part {:?} serial {:?}\n", + bb.part_number, bb.serial_number, + ) + .unwrap(); + } + + write!(&mut s, "\nRoTs:\n").unwrap(); + for (bb, _) in &collection.rots { + write!( + &mut s, + " baseboard part {:?} serial {:?}\n", + bb.part_number, bb.serial_number, + ) + .unwrap(); + } + + write!(&mut s, "\ncabooses found:\n").unwrap(); + for (kind, bb_to_found) in &collection.cabooses_found { + for (bb, found) in bb_to_found { + write!( + &mut s, + " {:?} baseboard part {:?} serial {:?}: board {:?}\n", + kind, bb.part_number, bb.serial_number, found.caboose.board, + ) + .unwrap(); + } + } + + write!(&mut s, "\nerrors:\n").unwrap(); + for e in &collection.errors { + // Some error strings have OS error numbers in them. We want to + // ignore those, particularly for CI, which runs these tests on + // multiple OSes. + let message = regex::Regex::new(r"os error \d+") + .unwrap() + .replace_all(&e, "os error <>"); + write!(&mut s, "error: {}\n", message).unwrap(); + } + + s + } + + #[tokio::test] + async fn test_basic() { + // Set up the stock MGS test setup which includes a couple of fake SPs. + // Then run a collection against it. + let gwtestctx = + gateway_test_utils::setup::test_setup("test_basic", SpPort::One) + .await; + let log = &gwtestctx.logctx.log; + let mgs_url = format!("http://{}/", gwtestctx.client.bind_address); + let mgs_client = + Arc::new(gateway_client::Client::new(&mgs_url, log.clone())); + let collector = + Collector::new("test-suite", &[mgs_client], log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert!(collection.errors.is_empty()); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_basic.txt", &s); + + gwtestctx.teardown().await; + } + + #[tokio::test] + async fn test_multi_mgs() { + // This is the same as the basic test, but we set up two different MGS + // instances and point the collector at both. We should get the same + // result. 
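+ // (Both fake MGS instances simulate the same set of SPs, so the merged
+ // collection is compared against the same golden file as test_basic.)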
+ let gwtestctx1 = gateway_test_utils::setup::test_setup(
+ "test_multi_mgs_1",
+ SpPort::One,
+ )
+ .await;
+ let gwtestctx2 = gateway_test_utils::setup::test_setup(
+ "test_multi_mgs_2",
+ SpPort::Two,
+ )
+ .await;
+ let log = &gwtestctx1.logctx.log;
+ let mgs_clients = [&gwtestctx1, &gwtestctx2]
+ .into_iter()
+ .map(|g| {
+ let url = format!("http://{}/", g.client.bind_address);
+ let client = gateway_client::Client::new(&url, log.clone());
+ Arc::new(client)
+ })
+ .collect::<Vec<_>>();
+ let collector = Collector::new("test-suite", &mgs_clients, log.clone());
+ let collection = collector
+ .collect_all()
+ .await
+ .expect("failed to carry out collection");
+ assert!(collection.errors.is_empty());
+ assert_eq!(collection.collector, "test-suite");
+
+ let s = dump_collection(&collection);
+ expectorate::assert_contents("tests/output/collector_basic.txt", &s);
+
+ gwtestctx1.teardown().await;
+ gwtestctx2.teardown().await;
+ }
+
+ #[tokio::test]
+ async fn test_multi_mgs_failure() {
+ // This is similar to the multi-MGS test, but we don't actually set up
+ // the second MGS.  To the collector, it should look offline or
+ // otherwise non-functional.
+ let gwtestctx = gateway_test_utils::setup::test_setup(
+ "test_multi_mgs_2",
+ SpPort::Two,
+ )
+ .await;
+ let log = &gwtestctx.logctx.log;
+ let real_client = {
+ let url = format!("http://{}/", gwtestctx.client.bind_address);
+ let client = gateway_client::Client::new(&url, log.clone());
+ Arc::new(client)
+ };
+ let bad_client = {
+ // This IP range is guaranteed by RFC 6666 to discard traffic.
+ let url = "http://[100::1]:12345";
+ let client = gateway_client::Client::new(url, log.clone());
+ Arc::new(client)
+ };
+ let mgs_clients = &[bad_client, real_client];
+ let collector = Collector::new("test-suite", mgs_clients, log.clone());
+ let collection = collector
+ .collect_all()
+ .await
+ .expect("failed to carry out collection");
+ assert_eq!(collection.collector, "test-suite");
+
+ let s = dump_collection(&collection);
+ expectorate::assert_contents("tests/output/collector_errors.txt", &s);
+
+ gwtestctx.teardown().await;
+ }
+}
diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs
new file mode 100644
index 0000000000..52aca397bb
--- /dev/null
+++ b/nexus/inventory/src/examples.rs
@@ -0,0 +1,254 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Example collections used for testing
+
+use crate::CollectionBuilder;
+use gateway_client::types::PowerState;
+use gateway_client::types::RotSlot;
+use gateway_client::types::RotState;
+use gateway_client::types::SpComponentCaboose;
+use gateway_client::types::SpState;
+use gateway_client::types::SpType;
+use nexus_types::inventory::BaseboardId;
+use nexus_types::inventory::CabooseWhich;
+use std::sync::Arc;
+use strum::IntoEnumIterator;
+
+/// Returns an example Collection used for testing
+///
+/// This collection is intended to cover a variety of possible inventory data,
+/// including:
+///
+/// - all three baseboard types (switch, sled, PSC)
+/// - various valid values for all fields (sources, slot numbers, power
+///   states, baseboard revisions, cabooses, etc.)
+/// - some empty slots +/// - some missing cabooses +/// - some cabooses common to multiple baseboards; others not +/// - serial number reused across different model numbers +pub fn representative() -> Representative { + let mut builder = CollectionBuilder::new("example"); + + // an ordinary, working sled + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest1")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest1")), + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // another ordinary sled with different values for ordinary fields + let sled2_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Sled, + 4, + SpState { + base_mac_address: [1; 6], + hubris_archive_id: String::from("hubris2"), + model: String::from("model2"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: Some(RotSlot::A), + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest2")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest2")), + transient_boot_preference: Some(RotSlot::B), + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a switch + let switch1_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Switch, + 0, + SpState { + base_mac_address: [2; 6], + hubris_archive_id: String::from("hubris3"), + model: String::from("model3"), + power_state: PowerState::A1, + revision: 2, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest3")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest3")), + transient_boot_preference: None, + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a PSC + let psc_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Power, + 1, + SpState { + base_mac_address: [3; 6], + hubris_archive_id: String::from("hubris4"), + model: String::from("model4"), + power_state: PowerState::A2, + revision: 3, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest4")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest4")), + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // a sled with no RoT state or other optional fields + let sled3_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 5, + SpState { + base_mac_address: [4; 6], + hubris_archive_id: String::from("hubris5"), + model: String::from("model1"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::CommunicationFailed { + message: String::from("test suite injected error"), + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // Report some cabooses. 
+
+ // We'll use the same cabooses for most of these components, although
+ // that's not possible in a real system.  We deliberately construct a
+ // new value each time to make sure the builder correctly normalizes it.
+ let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb];
+ for bb in &common_caboose_baseboards {
+ for which in CabooseWhich::iter() {
+ assert!(!builder.found_caboose_already(bb, which));
+ let _ = builder
+ .found_caboose(bb, which, "test suite", caboose("1"))
+ .unwrap();
+ assert!(builder.found_caboose_already(bb, which));
+ }
+ }
+
+ // For the PSC, use different cabooses for both slots of both the SP and
+ // RoT, just to exercise that we correctly keep track of different
+ // cabooses.
+ let _ = builder
+ .found_caboose(
+ &psc_bb,
+ CabooseWhich::SpSlot0,
+ "test suite",
+ caboose("psc_sp_0"),
+ )
+ .unwrap();
+ let _ = builder
+ .found_caboose(
+ &psc_bb,
+ CabooseWhich::SpSlot1,
+ "test suite",
+ caboose("psc_sp_1"),
+ )
+ .unwrap();
+ let _ = builder
+ .found_caboose(
+ &psc_bb,
+ CabooseWhich::RotSlotA,
+ "test suite",
+ caboose("psc_rot_a"),
+ )
+ .unwrap();
+ let _ = builder
+ .found_caboose(
+ &psc_bb,
+ CabooseWhich::RotSlotB,
+ "test suite",
+ caboose("psc_rot_b"),
+ )
+ .unwrap();
+
+ // We deliberately provide no cabooses for sled3.
+
+ Representative {
+ builder,
+ sleds: [sled1_bb, sled2_bb, sled3_bb],
+ switch: switch1_bb,
+ psc: psc_bb,
+ }
+}
+
+pub struct Representative {
+ pub builder: CollectionBuilder,
+ pub sleds: [Arc<BaseboardId>; 3],
+ pub switch: Arc<BaseboardId>,
+ pub psc: Arc<BaseboardId>,
+}
+
+/// Returns an SP state that can be used to populate a collection for testing
+pub fn sp_state(unique: &str) -> SpState {
+ SpState {
+ base_mac_address: [0; 6],
+ hubris_archive_id: format!("hubris{}", unique),
+ model: format!("model{}", unique),
+ power_state: PowerState::A2,
+ revision: 0,
+ rot: RotState::Enabled {
+ active: RotSlot::A,
+ pending_persistent_boot_preference: None,
+ persistent_boot_preference: RotSlot::A,
+ slot_a_sha3_256_digest: Some(String::from("slotAdigest1")),
+ slot_b_sha3_256_digest: Some(String::from("slotBdigest1")),
+ transient_boot_preference: None,
+ },
+ serial_number: format!("serial{}", unique),
+ }
+}
+
+pub fn caboose(unique: &str) -> SpComponentCaboose {
+ SpComponentCaboose {
+ board: format!("board_{}", unique),
+ git_commit: format!("git_commit_{}", unique),
+ name: format!("name_{}", unique),
+ version: format!("version_{}", unique),
+ }
+}
diff --git a/nexus/inventory/src/lib.rs b/nexus/inventory/src/lib.rs
new file mode 100644
index 0000000000..3a5f60b387
--- /dev/null
+++ b/nexus/inventory/src/lib.rs
@@ -0,0 +1,27 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Omicron component inventory
+//!
+//! This module provides [`Collector`], an interface for collecting a complete
+//! hardware/software inventory in a running Omicron deployment
+//!
+//! This is really just the collection part.  For separation of concerns, this
+//! module doesn't know anything about storing these collections into the
+//! database.  That's provided by the datastore.  The types associated with
+//! collections are in `nexus_types::inventory` so they can be shared with other
+//! parts of Nexus (like the datastore).
+//!
+//! This module lives inside Nexus but it has few dependencies on other parts of
+//! Nexus.  It could be incorporated into other components.  (The corresponding
+//!
types in `nexus_types` might have to move, too) + +mod builder; +mod collector; +pub mod examples; + +// only exposed for test code to construct collections +pub use builder::CollectionBuilder; + +pub use collector::Collector; diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt new file mode 100644 index 0000000000..4a3bf62d63 --- /dev/null +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -0,0 +1,43 @@ +baseboards: + part "FAKE_SIM_GIMLET" serial "SimGimlet00" + part "FAKE_SIM_GIMLET" serial "SimGimlet01" + part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses: + board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" + board "SimSidecarRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" + board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" + +SPs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +RoTs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses found: + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + +errors: diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt new file mode 100644 index 0000000000..f231cc7d97 --- /dev/null +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -0,0 +1,44 @@ +baseboards: + part "FAKE_SIM_GIMLET" serial "SimGimlet00" + part "FAKE_SIM_GIMLET" serial "SimGimlet01" + part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses: + board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" + board "SimSidecarRot" name "SimSidecar" 
version "0.0.1" git_commit "eeeeeeee" + board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" + +SPs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +RoTs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses found: + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + +errors: +error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error trying to connect: tcp connect error: Network is unreachable (os error <>): tcp connect error: Network is unreachable (os error <>): Network is unreachable (os error <>) diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index 3fcf0483a5..7b05eab61b 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -177,7 +177,7 @@ pub struct Driver { /// /// This is returned by [`Driver::register()`] to identify the corresponding /// background task. It's then accepted by functions like -/// [`Driver::activate()`] and [`Driver::status()`] to identify the task. +/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. #[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] pub struct TaskHandle(String); @@ -277,8 +277,8 @@ impl Driver { /// Enumerate all registered background tasks /// /// This is aimed at callers that want to get the status of all background - /// tasks. You'd call [`Driver::status()`] with each of the items produced - /// by the iterator. + /// tasks. You'd call [`Driver::task_status()`] with each of the items + /// produced by the iterator. 
 pub fn tasks(&self) -> impl Iterator<Item = &TaskHandle> {
 self.tasks.keys()
 }
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index aa949bbc9f..b000dd9bda 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -9,6 +9,7 @@ use super::dns_config;
 use super::dns_propagation;
 use super::dns_servers;
 use super::external_endpoints;
+use super::inventory_collection;
 use nexus_db_model::DnsGroup;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
@@ -16,6 +17,7 @@ use omicron_common::nexus_config::BackgroundTaskConfig;
 use omicron_common::nexus_config::DnsTasksConfig;
 use std::collections::BTreeMap;
 use std::sync::Arc;
+use uuid::Uuid;

 /// Describes ongoing background tasks and provides interfaces for working with
 /// them
@@ -42,6 +44,9 @@ pub struct BackgroundTasks {
 pub external_endpoints: tokio::sync::watch::Receiver<
 Option<external_endpoints::ExternalEndpoints>,
 >,
+
+ /// task handle for the task that collects inventory
+ pub task_inventory_collection: common::TaskHandle,
 }

 impl BackgroundTasks {
@@ -50,6 +55,8 @@ impl BackgroundTasks {
 opctx: &OpContext,
 datastore: Arc<DataStore>,
 config: &BackgroundTaskConfig,
+ nexus_id: Uuid,
+ resolver: internal_dns::resolver::Resolver,
 ) -> BackgroundTasks {
 let mut driver = common::Driver::new();
@@ -70,8 +77,9 @@ impl BackgroundTasks {
 // Background task: External endpoints list watcher
 let (task_external_endpoints, external_endpoints) = {
- let watcher =
- external_endpoints::ExternalEndpointsWatcher::new(datastore);
+ let watcher = external_endpoints::ExternalEndpointsWatcher::new(
+ datastore.clone(),
+ );
 let watcher_channel = watcher.watcher();
 let task = driver.register(
 String::from("external_endpoints"),
@@ -88,6 +96,30 @@
 (task, watcher_channel)
 };

+ // Background task: inventory collector
+ let task_inventory_collection = {
+ let collector = inventory_collection::InventoryCollector::new(
+ datastore,
+ resolver,
+ &nexus_id.to_string(),
+ config.inventory.nkeep,
+ config.inventory.disable,
+ );
+ let task = driver.register(
+ String::from("inventory_collection"),
+ String::from(
+ "collects hardware and software inventory data from the \
+ whole system",
+ ),
+ config.inventory.period_secs,
+ Box::new(collector),
+ opctx.child(BTreeMap::new()),
+ vec![],
+ );
+
+ task
+ };
+
 BackgroundTasks {
 driver,
 task_internal_dns_config,
@@ -96,6 +128,7 @@
 task_external_dns_servers,
 task_external_endpoints,
 external_endpoints,
+ task_inventory_collection,
 }
 }
diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs
new file mode 100644
index 0000000000..f095b094db
--- /dev/null
+++ b/nexus/src/app/background/inventory_collection.rs
@@ -0,0 +1,243 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Background task for reading inventory for the rack
+
+use super::common::BackgroundTask;
+use anyhow::ensure;
+use anyhow::Context;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use internal_dns::ServiceName;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::inventory::Collection;
+use serde_json::json;
+use std::sync::Arc;
+
+/// Background task that reads inventory for the rack
+pub struct InventoryCollector {
+ datastore: Arc<DataStore>,
+ resolver: internal_dns::resolver::Resolver,
+ creator: String,
+ nkeep: u32,
+ disable: bool,
+}
+
+impl InventoryCollector {
+ pub fn new(
+ datastore: Arc<DataStore>,
+ resolver: internal_dns::resolver::Resolver,
+ creator: &str,
+ nkeep: u32,
+ disable: bool,
+ ) -> InventoryCollector {
+ InventoryCollector {
+ datastore,
+ resolver,
+ creator: creator.to_owned(),
+ nkeep,
+ disable,
+ }
+ }
+}
+
+impl BackgroundTask for InventoryCollector {
+ fn activate<'a, 'b, 'c>(
+ &'a mut self,
+ opctx: &'b OpContext,
+ ) -> BoxFuture<'c, serde_json::Value>
+ where
+ 'a: 'c,
+ 'b: 'c,
+ {
+ async {
+ match inventory_activate(
+ opctx,
+ &self.datastore,
+ &self.resolver,
+ &self.creator,
+ self.nkeep,
+ self.disable,
+ )
+ .await
+ .context("failed to collect inventory")
+ {
+ Err(error) => {
+ let message = format!("{:#}", error);
+ warn!(opctx.log, "inventory collection failed";
+ "error" => message.clone());
+ json!({ "error": message })
+ }
+ Ok(collection) => {
+ debug!(opctx.log, "inventory collection complete";
+ "collection_id" => collection.id.to_string(),
+ "time_started" => collection.time_started.to_string(),
+ );
+ json!({
+ "collection_id": collection.id.to_string(),
+ "time_started": collection.time_started.to_string(),
+ "time_done": collection.time_done.to_string()
+ })
+ }
+ }
+ }
+ .boxed()
+ }
+}
+
+async fn inventory_activate(
+ opctx: &OpContext,
+ datastore: &DataStore,
+ resolver: &internal_dns::resolver::Resolver,
+ creator: &str,
+ nkeep: u32,
+ disabled: bool,
+) -> Result<Collection, anyhow::Error> {
+ // If we're disabled, don't do anything.  (This switch is only intended for
+ // unforeseen production emergencies.)
+ ensure!(!disabled, "disabled by explicit configuration");
+
+ // Prune old collections.  We do this first, here, to ensure that we never
+ // develop an unbounded backlog of collections.  (If this process were done
+ // by a separate task, it would be possible for the backlog to grow
+ // unbounded if that task were simply slower than the collection process,
+ // let alone if there were some kind of extended operational issue
+ // blocking deletion.)
+ datastore
+ .inventory_prune_collections(opctx, nkeep)
+ .await
+ .context("pruning old collections")?;
+
+ // Find MGS clients.
+ let mgs_clients = resolver
+ .lookup_all_socket_v6(ServiceName::ManagementGatewayService)
+ .await
+ .context("looking up MGS addresses")?
+ .into_iter()
+ .map(|sockaddr| {
+ let url = format!("http://{}", sockaddr);
+ let log = opctx.log.new(o!("gateway_url" => url.clone()));
+ Arc::new(gateway_client::Client::new(&url, log))
+ })
+ .collect::<Vec<_>>();
+
+ // Run a collection.
+ let inventory = nexus_inventory::Collector::new(
+ creator,
+ &mgs_clients,
+ opctx.log.clone(),
+ );
+ let collection =
+ inventory.collect_all().await.context("collecting inventory")?;
+
+ // Write it to the database.
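+ // (Unlike the per-SP problems handled above, a failure here fails the
+ // whole activation rather than being recorded inside the collection.)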
+ datastore
+ .inventory_insert_collection(opctx, &collection)
+ .await
+ .context("saving inventory to database")?;
+
+ Ok(collection)
+}
+
+#[cfg(test)]
+mod test {
+ use crate::app::background::common::BackgroundTask;
+ use crate::app::background::inventory_collection::InventoryCollector;
+ use nexus_db_queries::context::OpContext;
+ use nexus_db_queries::db::datastore::DataStoreInventoryTest;
+ use nexus_test_utils_macros::nexus_test;
+ use omicron_test_utils::dev::poll;
+
+ type ControlPlaneTestContext =
+ nexus_test_utils::ControlPlaneTestContext<crate::Server>;
+
+ // Test that each activation creates a new collection and that we prune old
+ // collections, too.
+ #[nexus_test(server = crate::Server)]
+ async fn test_basic(cptestctx: &ControlPlaneTestContext) {
+ let nexus = &cptestctx.server.apictx().nexus;
+ let datastore = nexus.datastore();
+ let opctx = OpContext::for_tests(
+ cptestctx.logctx.log.clone(),
+ datastore.clone(),
+ );
+
+ // Nexus starts the very background task that we're also testing
+ // manually here.  As a result, we should find a collection in the
+ // database before too long.  Wait for it so that after it appears, we
+ // can assume the rest of the collections came from the instance that
+ // we're testing.
+ let mut last_collections =
+ poll::wait_for_condition::<_, anyhow::Error, _, _>(
+ || async {
+ let collections = datastore
+ .inventory_collections()
+ .await
+ .map_err(poll::CondCheckError::Failed)?;
+ if collections.is_empty() {
+ Err(poll::CondCheckError::NotYet)
+ } else {
+ Ok(collections)
+ }
+ },
+ &std::time::Duration::from_millis(50),
+ &std::time::Duration::from_secs(15),
+ )
+ .await
+ .expect("background task did not populate initial collection");
+
+ let resolver = internal_dns::resolver::Resolver::new_from_addrs(
+ cptestctx.logctx.log.clone(),
+ &[cptestctx.internal_dns.dns_server.local_address()],
+ )
+ .unwrap();
+
+ // Now we'll create our own copy of the background task and activate it
+ // a bunch and make sure that it always creates a new collection and
+ // does not allow a backlog to accumulate.
+ let nkeep = 3;
+ let mut task = InventoryCollector::new(
+ datastore.clone(),
+ resolver.clone(),
+ "me",
+ nkeep,
+ false,
+ );
+ let nkeep = usize::try_from(nkeep).unwrap();
+ for i in 0..10 {
+ let _ = task.activate(&opctx).await;
+ let collections = datastore.inventory_collections().await.unwrap();
+ println!(
+ "iter {}: last = {:?}, current = {:?}",
+ i, last_collections, collections
+ );
+
+ let expected_from_last: Vec<_> = if last_collections.len() <= nkeep
+ {
+ last_collections
+ } else {
+ last_collections.into_iter().skip(1).collect()
+ };
+ let expected_from_current: Vec<_> =
+ collections.iter().rev().skip(1).rev().cloned().collect();
+ assert_eq!(expected_from_last, expected_from_current);
+ assert_eq!(collections.len(), std::cmp::min(i + 2, nkeep + 1));
+ last_collections = collections;
+ }
+
+ // Create a disabled task and make sure that does nothing.
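+ // Activation should bail out before pruning or collecting anything, so
+ // the set of stored collections must be unchanged.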
+ let mut task = InventoryCollector::new(
+ datastore.clone(),
+ resolver,
+ "disabled",
+ 3,
+ true,
+ );
+ let previous = datastore.inventory_collections().await.unwrap();
+ let _ = task.activate(&opctx).await;
+ let latest = datastore.inventory_collections().await.unwrap();
+ assert_eq!(previous, latest);
+ }
+}
diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs
index 9ba0780246..e1f474b41a 100644
--- a/nexus/src/app/background/mod.rs
+++ b/nexus/src/app/background/mod.rs
@@ -10,6 +10,7 @@ mod dns_propagation;
 mod dns_servers;
 mod external_endpoints;
 mod init;
+mod inventory_collection;
 mod status;

 pub use common::Driver;
diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs
index 7db93a158a..ef8132451a 100644
--- a/nexus/src/app/mod.rs
+++ b/nexus/src/app/mod.rs
@@ -349,6 +349,8 @@ impl Nexus {
 &background_ctx,
 Arc::clone(&db_datastore),
 &config.pkg.background_tasks,
+ config.deployment.id,
+ resolver.clone(),
 );

 let external_resolver = {
diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs
index 7697d34ecd..bed690f839 100644
--- a/nexus/src/app/rack.rs
+++ b/nexus/src/app/rack.rs
@@ -237,6 +237,7 @@ impl super::Nexus {
 &self.background_tasks.task_external_dns_config,
 &self.background_tasks.task_external_dns_servers,
 &self.background_tasks.task_external_endpoints,
+ &self.background_tasks.task_inventory_collection,
 ] {
 self.background_tasks.activate(task);
 }
diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml
index 8cd25582be..56cee27b37 100644
--- a/nexus/test-utils/Cargo.toml
+++ b/nexus/test-utils/Cargo.toml
@@ -14,6 +14,8 @@ crucible-agent-client.workspace = true
 dns-server.workspace = true
 dns-service-client.workspace = true
 dropshot.workspace = true
+gateway-messages.workspace = true
+gateway-test-utils.workspace = true
 headers.workspace = true
 http.workspace = true
 hyper.workspace = true
diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs
index 701a6e8ba9..647232031d 100644
--- a/nexus/test-utils/src/lib.rs
+++ b/nexus/test-utils/src/lib.rs
@@ -14,6 +14,7 @@ use dropshot::ConfigDropshot;
 use dropshot::ConfigLogging;
 use dropshot::ConfigLoggingLevel;
 use dropshot::HandlerTaskMode;
+use gateway_test_utils::setup::GatewayTestContext;
 use nexus_test_interface::NexusServer;
 use nexus_types::external_api::params::UserId;
 use nexus_types::internal_api::params::Certificate;
@@ -86,6 +87,7 @@ pub struct ControlPlaneTestContext {
 pub sled_agent: sim::Server,
 pub oximeter: Oximeter,
 pub producer: ProducerServer,
+ pub gateway: GatewayTestContext,
 pub dendrite: HashMap<SwitchLocation, dev::dendrite::DendriteInstance>,
 pub mgd: HashMap<SwitchLocation, dev::maghemite::MgdInstance>,
 pub external_dns_zone_name: String,
@@ -107,6 +109,7 @@ impl ControlPlaneTestContext {
 self.sled_agent.http_server.close().await.unwrap();
 self.oximeter.close().await.unwrap();
 self.producer.close().await.unwrap();
+ self.gateway.teardown().await;
 for (_, mut dendrite) in self.dendrite {
 dendrite.cleanup().await.unwrap();
 }
@@ -226,6 +229,7 @@ impl RackInitRequestBuilder {

 pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> {
 pub config: &'a mut omicron_common::nexus_config::Config,
+ test_name: &'a str,
 rack_init_builder: RackInitRequestBuilder,

 pub start_time: chrono::DateTime<chrono::Utc>,
@@ -241,6 +245,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> {
 pub sled_agent: Option<sim::Server>,
 pub oximeter: Option<Oximeter>,
 pub producer: Option<ProducerServer>,
+ pub gateway: Option<GatewayTestContext>,
 pub dendrite: HashMap<SwitchLocation, dev::dendrite::DendriteInstance>,
 pub mgd: HashMap<SwitchLocation, dev::maghemite::MgdInstance>,

@@ -259,7 +264,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> {

 impl<'a, N: NexusServer>
ControlPlaneTestContextBuilder<'a, N> { pub fn new( - test_name: &str, + test_name: &'a str, config: &'a mut omicron_common::nexus_config::Config, ) -> Self { let start_time = chrono::Utc::now(); @@ -267,6 +272,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { Self { config, + test_name, rack_init_builder: RackInitRequestBuilder::new(), start_time, logctx, @@ -279,6 +285,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { sled_agent: None, oximeter: None, producer: None, + gateway: None, dendrite: HashMap::new(), mgd: HashMap::new(), nexus_internal: None, @@ -377,6 +384,37 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .set_port(port); } + pub async fn start_gateway(&mut self) { + // For now, this MGS is not configured to match up in any way with + // either the simulated sled agent or the Dendrite instances. It's + // useful for testing stuff unrelated to that. But at some point we + // will probably want the reported data to match up better. + debug!(&self.logctx.log, "Starting Management Gateway"); + let gateway = gateway_test_utils::setup::test_setup( + self.test_name, + gateway_messages::SpPort::One, + ) + .await; + let fake_mgs_zone_id = Uuid::new_v4(); + let SocketAddr::V6(v6addr) = gateway.client.bind_address else { + panic!("MGS unexpectedly listening on IPv4?"); + }; + let zone = self + .rack_init_builder + .internal_dns_config + .host_zone(fake_mgs_zone_id, *v6addr.ip()) + .expect("Failed to add DNS for MGS zone"); + self.rack_init_builder + .internal_dns_config + .service_backend_zone( + internal_dns::ServiceName::ManagementGatewayService, + &zone, + v6addr.port(), + ) + .expect("Failed to add DNS for MGS service"); + self.gateway = Some(gateway); + } + pub async fn start_dendrite(&mut self, switch_location: SwitchLocation) { let log = &self.logctx.log; debug!(log, "Starting Dendrite for {switch_location}"); @@ -796,6 +834,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { oximeter: self.oximeter.unwrap(), producer: self.producer.unwrap(), logctx: self.logctx, + gateway: self.gateway.unwrap(), dendrite: self.dendrite, mgd: self.mgd, external_dns_zone_name: self.external_dns_zone_name.unwrap(), @@ -825,6 +864,9 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { if let Some(producer) = self.producer { producer.close().await.unwrap(); } + if let Some(gateway) = self.gateway { + gateway.teardown().await; + } for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -919,6 +961,7 @@ async fn setup_with_config_impl( ) -> ControlPlaneTestContext { builder.start_crdb_impl(populate).await; builder.start_clickhouse().await; + builder.start_gateway().await; builder.start_dendrite(SwitchLocation::Switch0).await; builder.start_dendrite(SwitchLocation::Switch1).await; builder.start_mgd(SwitchLocation::Switch0).await; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 09f13e55c7..54f7e03eef 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -90,8 +90,15 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). 
+inventory.period_secs = 600
+# Maximum number of past collections to keep in the database
+inventory.nkeep = 3
+# Disable inventory collection altogether (for emergencies)
+inventory.disable = false

 [default_region_allocation_strategy]
 # we only have one sled in the test environment, so we need to use the
 # `Random` strategy, instead of `RandomWithDistinctSleds`
-type = "random"
\ No newline at end of file
+type = "random"
diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml
index c499714c31..5722b065cf 100644
--- a/nexus/types/Cargo.toml
+++ b/nexus/types/Cargo.toml
@@ -23,6 +23,7 @@ uuid.workspace = true

 api_identity.workspace = true
 dns-service-client.workspace = true
+gateway-client.workspace = true
 omicron-common.workspace = true
 omicron-passwords.workspace = true
 omicron-workspace-hack.workspace = true
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs
new file mode 100644
index 0000000000..112eec3a65
--- /dev/null
+++ b/nexus/types/src/inventory.rs
@@ -0,0 +1,179 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Types representing collection of hardware/software inventory
+//!
+//! This lives in nexus/types because it's used by both nexus/db-model and
+//! nexus/inventory.  (It could as well just live in nexus/db-model, but
+//! nexus/inventory does not currently know about nexus/db-model and it's
+//! convenient to separate these concerns.)
+
+use chrono::DateTime;
+use chrono::Utc;
+pub use gateway_client::types::PowerState;
+pub use gateway_client::types::RotSlot;
+pub use gateway_client::types::SpType;
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
+use std::sync::Arc;
+use strum::EnumIter;
+use uuid::Uuid;
+
+/// Results of collecting hardware/software inventory from various Omicron
+/// components
+///
+/// This type is structured so that it's both easy to collect and easy to insert
+/// into the database.  This means items that are represented with separate
+/// database tables (like service processors and roots of trust) are represented
+/// with separate records, even though they might come from the same source
+/// (in this case, a single MGS request).
+///
+/// We make heavy use of maps, sets, and Arcs here because some of these objects
+/// are pointed-to by many other objects in the same Collection.  This approach
+/// ensures clear ownership.  It also reflects how things will wind up in the
+/// database.
+///
+/// See the documentation in the database schema for more background.
+#[derive(Debug, Eq, PartialEq)]
+pub struct Collection {
+ /// unique identifier for this collection
+ pub id: Uuid,
+ /// errors encountered during collection
+ pub errors: Vec<String>,
+ /// time the collection started
+ pub time_started: DateTime<Utc>,
+ /// time the collection ended
+ pub time_done: DateTime<Utc>,
+ /// name of the agent doing the collecting (generally, this Nexus's uuid)
+ pub collector: String,
+
+ /// unique baseboard ids that were found in this collection
+ ///
+ /// In practice, these will be inserted into the `hw_baseboard_id` table.
+ pub baseboards: BTreeSet<Arc<BaseboardId>>,
+ /// unique caboose contents that were found in this collection
+ ///
+ /// In practice, these will be inserted into the `sw_caboose` table.
+ pub cabooses: BTreeSet<Arc<Caboose>>,
+
+ /// all service processors, keyed by baseboard id
+ ///
+ /// In practice, these will be inserted into the `inv_service_processor`
+ /// table.
+ pub sps: BTreeMap<Arc<BaseboardId>, ServiceProcessor>,
+ /// all roots of trust, keyed by baseboard id
+ ///
+ /// In practice, these will be inserted into the `inv_root_of_trust` table.
+ pub rots: BTreeMap<Arc<BaseboardId>, RotState>,
+ /// all caboose contents found, keyed first by the kind of caboose
+ /// (`CabooseWhich`), then the baseboard id of the sled where they were
+ /// found
+ ///
+ /// In practice, these will be inserted into the `inv_caboose` table.
+ pub cabooses_found:
+ BTreeMap<CabooseWhich, BTreeMap<Arc<BaseboardId>, CabooseFound>>,
+}
+
+impl Collection {
+ pub fn caboose_for(
+ &self,
+ which: CabooseWhich,
+ baseboard_id: &BaseboardId,
+ ) -> Option<&CabooseFound> {
+ self.cabooses_found
+ .get(&which)
+ .and_then(|by_bb| by_bb.get(baseboard_id))
+ }
+}
+
+/// A unique baseboard id found during a collection
+///
+/// Baseboard ids are the keys used to link up information from disparate
+/// sources (like a service processor and a sled agent).
+///
+/// These are normalized in the database.  Each distinct baseboard id is
+/// assigned a uuid and shared across the many possible collections that
+/// reference it.
+///
+/// Usually, the part number and serial number are combined with a revision
+/// number.  We do not include that here.  If we ever did find a baseboard with
+/// the same part number and serial number but a new revision number, we'd want
+/// to treat that as the same baseboard as one with a different revision number.
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
+pub struct BaseboardId {
+ /// Oxide Part Number
+ pub part_number: String,
+ /// Serial number (unique for a given part number)
+ pub serial_number: String,
+}
+
+/// Caboose contents found during a collection
+///
+/// These are normalized in the database.  Each distinct `Caboose` is assigned a
+/// uuid and shared across many possible collections that reference it.
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
+pub struct Caboose {
+ pub board: String,
+ pub git_commit: String,
+ pub name: String,
+ pub version: String,
+}
+
+impl From<gateway_client::types::SpComponentCaboose> for Caboose {
+ fn from(c: gateway_client::types::SpComponentCaboose) -> Self {
+ Caboose {
+ board: c.board,
+ git_commit: c.git_commit,
+ name: c.name,
+ version: c.version,
+ }
+ }
+}
+
+/// Indicates that a particular `Caboose` was found (at a particular time from a
+/// particular source, but these are only for debugging)
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
+pub struct CabooseFound {
+ pub time_collected: DateTime<Utc>,
+ pub source: String,
+ pub caboose: Arc<Caboose>,
+}
+
+/// Describes a service processor found during collection
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
+pub struct ServiceProcessor {
+ pub time_collected: DateTime<Utc>,
+ pub source: String,
+
+ pub sp_type: SpType,
+ pub sp_slot: u16,
+
+ pub baseboard_revision: u32,
+ pub hubris_archive: String,
+ pub power_state: PowerState,
+}
+
+/// Describes the root of trust state found (from a service processor) during
+/// collection
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
+pub struct RotState {
+ pub time_collected: DateTime<Utc>,
+ pub source: String,
+
+ pub active_slot: RotSlot,
+ pub persistent_boot_preference: RotSlot,
+ pub pending_persistent_boot_preference: Option<RotSlot>,
+ pub transient_boot_preference: Option<RotSlot>,
+ pub slot_a_sha3_256_digest: Option<String>,
+ pub slot_b_sha3_256_digest: Option<String>,
+}
+
+/// Describes which caboose this is (which component, which slot)
+#[derive(Clone, Copy, Debug, EnumIter, PartialEq, Eq, PartialOrd, Ord)]
+pub enum CabooseWhich {
+ SpSlot0,
+ SpSlot1,
+ RotSlotA,
+ RotSlotB,
+}
diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs
index 3f864b0f17..a48c4d3b00 100644
--- a/nexus/types/src/lib.rs
+++ b/nexus/types/src/lib.rs
@@ -32,3 +32,4 @@ pub mod external_api;
 pub mod identity;
 pub mod internal_api;
+pub mod inventory;
diff --git a/openapi/gateway.json b/openapi/gateway.json
index 67cc2bd634..97cb7994aa 100644
--- a/openapi/gateway.json
+++ b/openapi/gateway.json
@@ -2385,14 +2385,14 @@
 "type": "string"
 },
 "version": {
- "nullable": true,
 "type": "string"
 }
 },
 "required": [
 "board",
 "git_commit",
- "name"
+ "name",
+ "version"
 ]
 },
 "SpComponentDetails": {
diff --git a/openapi/wicketd.json b/openapi/wicketd.json
index 75db82e8e1..a75c965ad8 100644
--- a/openapi/wicketd.json
+++ b/openapi/wicketd.json
@@ -2517,14 +2517,14 @@
 "type": "string"
 },
 "version": {
- "nullable": true,
 "type": "string"
 }
 },
 "required": [
 "board",
 "git_commit",
- "name"
+ "name",
+ "version"
 ]
 },
 "SpComponentInfo": {
diff --git a/schema/crdb/9.0.0/up01.sql b/schema/crdb/9.0.0/up01.sql
new file mode 100644
index 0000000000..88439c433b
--- /dev/null
+++ b/schema/crdb/9.0.0/up01.sql
@@ -0,0 +1,5 @@
+CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id (
+ id UUID PRIMARY KEY,
+ part_number TEXT NOT NULL,
+ serial_number TEXT NOT NULL
+);
diff --git a/schema/crdb/9.0.0/up02.sql b/schema/crdb/9.0.0/up02.sql
new file mode 100644
index 0000000000..d98f896fb0
--- /dev/null
+++ b/schema/crdb/9.0.0/up02.sql
@@ -0,0 +1,2 @@
+CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props
+ ON omicron.public.hw_baseboard_id (part_number, serial_number);
diff --git a/schema/crdb/9.0.0/up03.sql b/schema/crdb/9.0.0/up03.sql
new file mode 100644
index 0000000000..3bd036be7e
--- /dev/null
+++ b/schema/crdb/9.0.0/up03.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM (
+ 'A0',
+ 'A1',
diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs
index 3f864b0f17..a48c4d3b00 100644
--- a/nexus/types/src/lib.rs
+++ b/nexus/types/src/lib.rs
@@ -32,3 +32,4 @@ pub mod external_api;
 pub mod identity;
 pub mod internal_api;
+pub mod inventory;
diff --git a/openapi/gateway.json b/openapi/gateway.json
index 67cc2bd634..97cb7994aa 100644
--- a/openapi/gateway.json
+++ b/openapi/gateway.json
@@ -2385,14 +2385,14 @@
           "type": "string"
         },
         "version": {
-          "nullable": true,
           "type": "string"
         }
       },
       "required": [
         "board",
         "git_commit",
-        "name"
+        "name",
+        "version"
       ]
     },
     "SpComponentDetails": {
diff --git a/openapi/wicketd.json b/openapi/wicketd.json
index 75db82e8e1..a75c965ad8 100644
--- a/openapi/wicketd.json
+++ b/openapi/wicketd.json
@@ -2517,14 +2517,14 @@
           "type": "string"
         },
         "version": {
-          "nullable": true,
           "type": "string"
         }
       },
       "required": [
         "board",
         "git_commit",
-        "name"
+        "name",
+        "version"
       ]
     },
     "SpComponentInfo": {
diff --git a/schema/crdb/9.0.0/up01.sql b/schema/crdb/9.0.0/up01.sql
new file mode 100644
index 0000000000..88439c433b
--- /dev/null
+++ b/schema/crdb/9.0.0/up01.sql
@@ -0,0 +1,5 @@
+CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id (
+    id UUID PRIMARY KEY,
+    part_number TEXT NOT NULL,
+    serial_number TEXT NOT NULL
+);
diff --git a/schema/crdb/9.0.0/up02.sql b/schema/crdb/9.0.0/up02.sql
new file mode 100644
index 0000000000..d98f896fb0
--- /dev/null
+++ b/schema/crdb/9.0.0/up02.sql
@@ -0,0 +1,2 @@
+CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props
+    ON omicron.public.hw_baseboard_id (part_number, serial_number);
diff --git a/schema/crdb/9.0.0/up03.sql b/schema/crdb/9.0.0/up03.sql
new file mode 100644
index 0000000000..3bd036be7e
--- /dev/null
+++ b/schema/crdb/9.0.0/up03.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM (
+    'A0',
+    'A1',
+    'A2'
+);
diff --git a/schema/crdb/9.0.0/up04.sql b/schema/crdb/9.0.0/up04.sql
new file mode 100644
index 0000000000..1590ec4e88
--- /dev/null
+++ b/schema/crdb/9.0.0/up04.sql
@@ -0,0 +1,4 @@
+CREATE TYPE IF NOT EXISTS omicron.public.hw_rot_slot AS ENUM (
+    'A',
+    'B'
+);
diff --git a/schema/crdb/9.0.0/up05.sql b/schema/crdb/9.0.0/up05.sql
new file mode 100644
index 0000000000..1042282fb0
--- /dev/null
+++ b/schema/crdb/9.0.0/up05.sql
@@ -0,0 +1,9 @@
+CREATE TABLE IF NOT EXISTS omicron.public.sw_caboose (
+    id UUID PRIMARY KEY,
+    board TEXT NOT NULL,
+    git_commit TEXT NOT NULL,
+    name TEXT NOT NULL,
+    -- The MGS response that provides this field indicates that it can be NULL.
+    -- But that's only to support old software that we no longer support.
+    version TEXT NOT NULL
+);
diff --git a/schema/crdb/9.0.0/up06.sql b/schema/crdb/9.0.0/up06.sql
new file mode 100644
index 0000000000..aa614fa2fb
--- /dev/null
+++ b/schema/crdb/9.0.0/up06.sql
@@ -0,0 +1,2 @@
+CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties
+    on omicron.public.sw_caboose (board, git_commit, name, version);
diff --git a/schema/crdb/9.0.0/up07.sql b/schema/crdb/9.0.0/up07.sql
new file mode 100644
index 0000000000..945f5a44c8
--- /dev/null
+++ b/schema/crdb/9.0.0/up07.sql
@@ -0,0 +1,6 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_collection (
+    id UUID PRIMARY KEY,
+    time_started TIMESTAMPTZ NOT NULL,
+    time_done TIMESTAMPTZ NOT NULL,
+    collector TEXT NOT NULL
+);
diff --git a/schema/crdb/9.0.0/up08.sql b/schema/crdb/9.0.0/up08.sql
new file mode 100644
index 0000000000..1abeb9203f
--- /dev/null
+++ b/schema/crdb/9.0.0/up08.sql
@@ -0,0 +1,2 @@
+CREATE INDEX IF NOT EXISTS inv_collection_by_time_started
+    ON omicron.public.inv_collection (time_started);
diff --git a/schema/crdb/9.0.0/up09.sql b/schema/crdb/9.0.0/up09.sql
new file mode 100644
index 0000000000..770c771775
--- /dev/null
+++ b/schema/crdb/9.0.0/up09.sql
@@ -0,0 +1,5 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_collection_error (
+    inv_collection_id UUID NOT NULL,
+    idx INT4 NOT NULL,
+    message TEXT
+);
diff --git a/schema/crdb/9.0.0/up10.sql b/schema/crdb/9.0.0/up10.sql
new file mode 100644
index 0000000000..57665ee468
--- /dev/null
+++ b/schema/crdb/9.0.0/up10.sql
@@ -0,0 +1,2 @@
+CREATE INDEX IF NOT EXISTS errors_by_collection
+    ON omicron.public.inv_collection_error (inv_collection_id, idx);
diff --git a/schema/crdb/9.0.0/up11.sql b/schema/crdb/9.0.0/up11.sql
new file mode 100644
index 0000000000..40da69af5b
--- /dev/null
+++ b/schema/crdb/9.0.0/up11.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.sp_type AS ENUM (
+    'sled',
+    'switch',
+    'power'
+);
diff --git a/schema/crdb/9.0.0/up12.sql b/schema/crdb/9.0.0/up12.sql
new file mode 100644
index 0000000000..9089ac93ba
--- /dev/null
+++ b/schema/crdb/9.0.0/up12.sql
@@ -0,0 +1,15 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor (
+    inv_collection_id UUID NOT NULL,
+    hw_baseboard_id UUID NOT NULL,
+    time_collected TIMESTAMPTZ NOT NULL,
+    source TEXT NOT NULL,
+
+    sp_type omicron.public.sp_type NOT NULL,
+    sp_slot INT4 NOT NULL,
+
+    baseboard_revision INT8 NOT NULL,
+    hubris_archive_id TEXT NOT NULL,
+    power_state omicron.public.hw_power_state NOT NULL,
+
+    PRIMARY KEY (inv_collection_id, hw_baseboard_id)
+);
diff --git a/schema/crdb/9.0.0/up13.sql b/schema/crdb/9.0.0/up13.sql
new file mode 100644
index 0000000000..241c5d9e80
--- /dev/null
+++ b/schema/crdb/9.0.0/up13.sql
@@ -0,0 +1,15 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust (
+    inv_collection_id UUID NOT NULL,
+    hw_baseboard_id UUID NOT NULL,
+    time_collected TIMESTAMPTZ NOT NULL,
+    source TEXT NOT NULL,
+
+    slot_active omicron.public.hw_rot_slot NOT NULL,
+    slot_boot_pref_transient omicron.public.hw_rot_slot,
+    slot_boot_pref_persistent omicron.public.hw_rot_slot NOT NULL,
+    slot_boot_pref_persistent_pending omicron.public.hw_rot_slot,
+    slot_a_sha3_256 TEXT,
+    slot_b_sha3_256 TEXT,
+
+    PRIMARY KEY (inv_collection_id, hw_baseboard_id)
+);
diff --git a/schema/crdb/9.0.0/up14.sql b/schema/crdb/9.0.0/up14.sql
new file mode 100644
index 0000000000..6725d35acf
--- /dev/null
+++ b/schema/crdb/9.0.0/up14.sql
@@ -0,0 +1,6 @@
+CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM (
+    'sp_slot_0',
+    'sp_slot_1',
+    'rot_slot_A',
+    'rot_slot_B'
+);
diff --git a/schema/crdb/9.0.0/up15.sql b/schema/crdb/9.0.0/up15.sql
new file mode 100644
index 0000000000..48a68d167a
--- /dev/null
+++ b/schema/crdb/9.0.0/up15.sql
@@ -0,0 +1,11 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose (
+    inv_collection_id UUID NOT NULL,
+    hw_baseboard_id UUID NOT NULL,
+    time_collected TIMESTAMPTZ NOT NULL,
+    source TEXT NOT NULL,
+
+    which omicron.public.caboose_which NOT NULL,
+    sw_caboose_id UUID NOT NULL,
+
+    PRIMARY KEY (inv_collection_id, hw_baseboard_id, which)
+);
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 0fdaf5083c..da842cbfeb 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -2514,6 +2514,222 @@ CREATE TABLE IF NOT EXISTS omicron.public.bootstore_keys (
     generation INT8 NOT NULL
 );
 
+/*
+ * Hardware/software inventory
+ *
+ * See RFD 433 for details. Here are the highlights.
+ *
+ * Omicron periodically collects hardware/software inventory data from the
+ * running system and stores it into the database. Each discrete set of data is
+ * called a **collection**. Each collection contains lots of different kinds of
+ * data, so there are many tables here. For clarity, these tables are prefixed
+ * with:
+ *
+ *     `inv_*` (examples: `inv_collection`, `inv_service_processor`)
+ *
+ *         Describes the complete set of hardware and software in the system.
+ *         Rows in these tables are immutable, but they describe mutable facts
+ *         about hardware and software (e.g., the slot that a disk is in). When
+ *         these facts change (e.g., a disk moves between slots), a new set of
+ *         records is written.
+ *
+ * All rows in the `inv_*` tables point back to a particular collection. They
+ * represent the state observed at some particular time. Generally, if two
+ * observations came from two different places, they're not put into the same
+ * row of the same table. For example, caboose information comes from the SP,
+ * but it doesn't go into the `inv_service_processor` table. It goes in a
+ * separate `inv_caboose` table. This is debatable, but it preserves a clearer
+ * record of exactly what information came from where, since the separate
+ * record has its own "source" and "time_collected".
+ *
+ * Information about service processors and roots of trust is joined with
+ * information reported by sled agents via the baseboard id.
+ *
+ * Hardware and software identifiers are normalized for the usual database
+ * design reasons. This means instead of storing hardware and software
+ * identifiers directly in the `inv_*` tables, these tables instead store
+ * foreign keys into one of these groups of tables, whose names are also
+ * prefixed for clarity:
+ *
+ *     `hw_*` (example: `hw_baseboard_id`)
+ *
+ *         Maps hardware-provided identifiers to UUIDs that are used as
+ *         foreign keys in the rest of the schema. (Avoids embedding these
+ *         identifiers into all the other tables.)
+ *
+ *     `sw_*` (example: `sw_caboose`)
+ *
+ *         Maps software-provided identifiers to UUIDs that are used as
+ *         foreign keys in the rest of the schema. (Avoids embedding these
+ *         identifiers into all the other tables.)
+ *
+ * Records in these tables are shared across potentially many collections. To
+ * see why this is useful, consider that `sw_caboose` records contain several
+ * long identifiers (e.g., git commit, SHA sums) and in practice, most of the
+ * time, we expect that all components of a given type will have the exact
+ * same cabooses. Rather than store the caboose contents in each
+ * `inv_service_processor` row (for example), often replicating the exact same
+ * contents for each SP for each collection, these rows just have pointers
+ * into the `sw_caboose` table that stores this data once. (This also makes it
+ * much easier to determine that these components _do_ have the same
+ * cabooses.)
+ *
+ * On PC systems (i.e., non-Oxide hardware), most of these tables will be
+ * empty because we do not support hardware inventory on these systems.
+ *
+ * Again, see RFD 433 for more on all this.
+ */
+
+/*
+ * baseboard ids: this table assigns uuids to distinct part/serial values
+ *
+ * Usually we include the baseboard revision number when we reference the part
+ * number and serial number. The revision number is deliberately left out
+ * here. If we happened to see the same baseboard part number and serial
+ * number with different revisions, that's the same baseboard.
+ */
+CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id (
+    id UUID PRIMARY KEY,
+    part_number TEXT NOT NULL,
+    serial_number TEXT NOT NULL
+);
+CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props
+    ON omicron.public.hw_baseboard_id (part_number, serial_number);
+
+/* power states reportable by the SP */
+CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM (
+    'A0',
+    'A1',
+    'A2'
+);
+
+/* root of trust firmware slots */
+CREATE TYPE IF NOT EXISTS omicron.public.hw_rot_slot AS ENUM (
+    'A',
+    'B'
+);
+
+/* cabooses: this table assigns unique ids to distinct caboose contents */
+CREATE TABLE IF NOT EXISTS omicron.public.sw_caboose (
+    id UUID PRIMARY KEY,
+    board TEXT NOT NULL,
+    git_commit TEXT NOT NULL,
+    name TEXT NOT NULL,
+    -- The MGS response that provides this field indicates that it can be NULL.
+    -- But that's only to support old software that we no longer support.
+ version TEXT NOT NULL +); +CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties + on omicron.public.sw_caboose (board, git_commit, name, version); + +/* Inventory Collections */ + +-- list of all collections +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection ( + id UUID PRIMARY KEY, + time_started TIMESTAMPTZ NOT NULL, + time_done TIMESTAMPTZ NOT NULL, + collector TEXT NOT NULL +); +-- Supports finding latest collection (to use) or the oldest collection (to +-- clean up) +CREATE INDEX IF NOT EXISTS inv_collection_by_time_started + ON omicron.public.inv_collection (time_started); + +-- list of errors generated during a collection +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection_error ( + inv_collection_id UUID NOT NULL, + idx INT4 NOT NULL, + message TEXT +); +CREATE INDEX IF NOT EXISTS errors_by_collection + ON omicron.public.inv_collection_error (inv_collection_id, idx); + +/* what kind of slot MGS reported a device in */ +CREATE TYPE IF NOT EXISTS omicron.public.sp_type AS ENUM ( + 'sled', + 'switch', + 'power' +); + +-- observations from and about service processors +-- also see `inv_root_of_trust` +CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + -- identity of this device according to MGS + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- Data from MGS "Get SP Info" API. See MGS API documentation. + baseboard_revision INT8 NOT NULL, + hubris_archive_id TEXT NOT NULL, + power_state omicron.public.hw_power_state NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); + +-- root of trust information reported by SP +-- There's usually one row here for each row in inv_service_processor, but not +-- necessarily. 
+CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust (
+    -- where this observation came from
+    -- (foreign key into `inv_collection` table)
+    inv_collection_id UUID NOT NULL,
+    -- which system this SP reports it is part of
+    -- (foreign key into `hw_baseboard_id` table)
+    hw_baseboard_id UUID NOT NULL,
+    -- when this observation was made
+    time_collected TIMESTAMPTZ NOT NULL,
+    -- which MGS instance reported this data
+    source TEXT NOT NULL,
+
+    slot_active omicron.public.hw_rot_slot NOT NULL,
+    slot_boot_pref_transient omicron.public.hw_rot_slot, -- nullable
+    slot_boot_pref_persistent omicron.public.hw_rot_slot NOT NULL,
+    slot_boot_pref_persistent_pending omicron.public.hw_rot_slot, -- nullable
+    slot_a_sha3_256 TEXT, -- nullable
+    slot_b_sha3_256 TEXT, -- nullable
+
+    PRIMARY KEY (inv_collection_id, hw_baseboard_id)
+);
+
+CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM (
+    'sp_slot_0',
+    'sp_slot_1',
+    'rot_slot_A',
+    'rot_slot_B'
+);
+
+-- cabooses found
+CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose (
+    -- where this observation came from
+    -- (foreign key into `inv_collection` table)
+    inv_collection_id UUID NOT NULL,
+    -- which system this SP reports it is part of
+    -- (foreign key into `hw_baseboard_id` table)
+    hw_baseboard_id UUID NOT NULL,
+    -- when this observation was made
+    time_collected TIMESTAMPTZ NOT NULL,
+    -- which MGS instance reported this data
+    source TEXT NOT NULL,
+
+    which omicron.public.caboose_which NOT NULL,
+    sw_caboose_id UUID NOT NULL,
+
+    PRIMARY KEY (inv_collection_id, hw_baseboard_id, which)
+);
+
+/*******************************************************************/
+
 /*
  * The `sled_instance` view's definition needs to be modified in a separate
  * transaction from the transaction that created it.
@@ -2522,6 +2738,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.bootstore_keys (
 COMMIT;
 BEGIN;
 
+/*******************************************************************/
+
 /*
  * Metadata for the schema itself. This version number isn't great, as there's
  * nothing to ensure it gets bumped when it should be, but it's a start.
@@ -2620,7 +2838,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    ( TRUE, NOW(), NOW(), '8.0.0', NULL)
+    ( TRUE, NOW(), NOW(), '9.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
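To make the normalization concrete, here is an illustrative query (not part of the change) showing how a reader joins the `inv_*` tables back through the `hw_*`/`sw_*` tables. The join shape follows directly from the schema above; the scalar subquery is just one way to pick the latest collection:

    -- Hypothetical read-side query: report each caboose found in the most
    -- recent collection, along with the baseboard it was found on.
    SELECT
        bb.part_number,
        bb.serial_number,
        ic.which,
        sc.board,
        sc.version
    FROM omicron.public.inv_caboose ic
    JOIN omicron.public.hw_baseboard_id bb ON bb.id = ic.hw_baseboard_id
    JOIN omicron.public.sw_caboose sc ON sc.id = ic.sw_caboose_id
    WHERE ic.inv_collection_id = (
        -- uses the index on time_started to find the latest collection
        SELECT id FROM omicron.public.inv_collection
        ORDER BY time_started DESC
        LIMIT 1
    );
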
-type = "random_with_distinct_sleds" \ No newline at end of file +type = "random_with_distinct_sleds" diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index aff0a8a25f..be8683be54 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -38,8 +38,15 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 3 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. # seed is omitted so a new seed will be chosen with every allocation. -type = "random" \ No newline at end of file +type = "random" diff --git a/wicket/src/state/inventory.rs b/wicket/src/state/inventory.rs index 3a561167b1..23a0e244cf 100644 --- a/wicket/src/state/inventory.rs +++ b/wicket/src/state/inventory.rs @@ -147,7 +147,7 @@ pub enum Component { } fn version_or_unknown(caboose: Option<&SpComponentCaboose>) -> String { - caboose.and_then(|c| c.version.as_deref()).unwrap_or("UNKNOWN").to_string() + caboose.map(|c| c.version.as_str()).unwrap_or("UNKNOWN").to_string() } impl Component { diff --git a/wicket/src/ui/panes/overview.rs b/wicket/src/ui/panes/overview.rs index 7de0171e41..e8cf50bb32 100644 --- a/wicket/src/ui/panes/overview.rs +++ b/wicket/src/ui/panes/overview.rs @@ -885,7 +885,6 @@ fn append_caboose( } = caboose; let label_style = style::text_label(); let ok_style = style::text_success(); - let bad_style = style::text_failure(); spans.push( vec![ @@ -905,9 +904,5 @@ fn append_caboose( ); let mut version_spans = vec![prefix.clone(), Span::styled("Version: ", label_style)]; - if let Some(v) = version.as_ref() { - version_spans.push(Span::styled(v.clone(), ok_style)); - } else { - version_spans.push(Span::styled("Unknown", bad_style)); - } + version_spans.push(Span::styled(version, ok_style)); } diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index 18b692703c..bd8e187fe9 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -839,25 +839,21 @@ impl UpdateDriver { let message = format!( "SP board {}, version {} (git commit {})", - caboose.board, - caboose.version.as_deref().unwrap_or("unknown"), - caboose.git_commit + caboose.board, caboose.version, caboose.git_commit ); - match caboose.version.map(|v| v.parse::()) { - Some(Ok(version)) => { + match caboose.version.parse::() { + Ok(version) => { StepSuccess::new((sp_artifact, Some(version))) .with_message(message) .into() } - Some(Err(err)) => StepWarning::new( + Err(err) => StepWarning::new( (sp_artifact, None), format!( "{message} (failed to parse SP version: {err})" ), ) .into(), - None => StepWarning::new((sp_artifact, None), message) - .into(), } }, ) @@ -1769,8 +1765,7 @@ impl UpdateContext { let message = format!( "RoT slot {active_slot_name} version {} (git commit {})", - caboose.version.as_deref().unwrap_or("unknown"), - caboose.git_commit + caboose.version, caboose.git_commit ); let make_result = |active_version| RotInterrogation { @@ -1779,16 +1774,15 @@ impl UpdateContext { active_version, }; - 
-        match caboose.version.map(|v| v.parse::<SemverVersion>()) {
-            Some(Ok(version)) => StepSuccess::new(make_result(Some(version)))
+        match caboose.version.parse::<SemverVersion>() {
+            Ok(version) => StepSuccess::new(make_result(Some(version)))
                 .with_message(message)
                 .into(),
-            Some(Err(err)) => StepWarning::new(
+            Err(err) => StepWarning::new(
                 make_result(None),
                 format!("{message} (failed to parse RoT version: {err})"),
             )
             .into(),
-            None => StepWarning::new(make_result(None), message).into(),
         }