From 0f01172693b25e7b513ed02a2f743e0eec530949 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 17 Oct 2023 19:48:49 -0700 Subject: [PATCH 01/20] initial inventory for automated update This commit matches commit 581e9024b596137ede27df7f638e622dbbaf08d0 in branch dap/nexus-inventory. I've just rebased the changes onto "main" here. --- Cargo.lock | 19 + Cargo.toml | 5 +- common/src/nexus_config.rs | 41 +- dev-tools/omdb/src/bin/omdb/db.rs | 632 +++++++++++- dev-tools/omdb/src/bin/omdb/nexus.rs | 33 + dev-tools/omdb/tests/env.out | 12 + dev-tools/omdb/tests/successes.out | 13 + dev-tools/omdb/tests/usage_errors.out | 10 +- dev-tools/omicron-dev/src/bin/omicron-dev.rs | 4 + nexus/Cargo.toml | 1 + nexus/db-model/src/inventory.rs | 354 +++++++ nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 85 ++ nexus/db-model/src/unsigned.rs | 2 + nexus/db-queries/src/authz/api_resources.rs | 55 ++ nexus/db-queries/src/authz/omicron.polar | 10 + nexus/db-queries/src/authz/oso_generic.rs | 1 + .../src/authz/policy_test/resource_builder.rs | 3 +- .../src/authz/policy_test/resources.rs | 1 + .../db-queries/src/db/datastore/inventory.rs | 914 ++++++++++++++++++ nexus/db-queries/src/db/datastore/mod.rs | 24 +- nexus/db-queries/src/db/pool.rs | 2 + nexus/db-queries/tests/output/authz-roles.out | 14 + nexus/examples/config.toml | 5 + nexus/inventory/Cargo.toml | 16 + nexus/inventory/src/builder.rs | 278 ++++++ nexus/inventory/src/collector.rs | 202 ++++ nexus/inventory/src/lib.rs | 23 + nexus/src/app/background/common.rs | 6 +- nexus/src/app/background/init.rs | 36 +- .../app/background/inventory_collection.rs | 132 +++ nexus/src/app/background/mod.rs | 1 + nexus/src/app/mod.rs | 2 + nexus/src/app/rack.rs | 1 + nexus/test-utils/Cargo.toml | 2 + nexus/test-utils/src/lib.rs | 45 +- nexus/tests/config.test.toml | 7 +- nexus/types/Cargo.toml | 1 + nexus/types/src/inventory.rs | 170 ++++ nexus/types/src/lib.rs | 1 + schema/crdb/dbinit.sql | 211 ++++ smf/nexus/multi-sled/config-partial.toml | 7 +- smf/nexus/single-sled/config-partial.toml | 7 +- 43 files changed, 3338 insertions(+), 52 deletions(-) create mode 100644 nexus/db-model/src/inventory.rs create mode 100644 nexus/db-queries/src/db/datastore/inventory.rs create mode 100644 nexus/inventory/Cargo.toml create mode 100644 nexus/inventory/src/builder.rs create mode 100644 nexus/inventory/src/collector.rs create mode 100644 nexus/inventory/src/lib.rs create mode 100644 nexus/src/app/background/inventory_collection.rs create mode 100644 nexus/types/src/inventory.rs diff --git a/Cargo.lock b/Cargo.lock index 3265ed19de..d03a9b61e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4463,6 +4463,21 @@ dependencies = [ "serde_json", ] +[[package]] +name = "nexus-inventory" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "futures", + "gateway-client", + "gateway-messages", + "nexus-types", + "slog", + "strum", + "uuid", +] + [[package]] name = "nexus-test-interface" version = "0.1.0" @@ -4490,6 +4505,8 @@ dependencies = [ "dns-server", "dns-service-client 0.1.0", "dropshot", + "gateway-messages", + "gateway-test-utils", "headers", "http", "hyper", @@ -4537,6 +4554,7 @@ dependencies = [ "chrono", "dns-service-client 0.1.0", "futures", + "gateway-client", "newtype_derive", "omicron-common 0.1.0", "omicron-passwords 0.1.0", @@ -5079,6 +5097,7 @@ dependencies = [ "nexus-db-model", "nexus-db-queries", "nexus-defaults", + "nexus-inventory", "nexus-test-interface", "nexus-test-utils", "nexus-test-utils-macros", diff --git a/Cargo.toml b/Cargo.toml index 
72a7f6157e..5173f331a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,6 +40,7 @@ members = [
     "nexus/db-model",
     "nexus/db-queries",
     "nexus/defaults",
+    "nexus/inventory",
     "nexus/test-interface",
     "nexus/test-utils-macros",
     "nexus/test-utils",
@@ -106,6 +107,7 @@ default-members = [
     "nexus/db-model",
     "nexus/db-queries",
     "nexus/defaults",
+    "nexus/inventory",
     "nexus/types",
     "oximeter/collector",
     "oximeter/db",
@@ -231,6 +233,7 @@ nexus-client = { path = "clients/nexus-client" }
 nexus-db-model = { path = "nexus/db-model" }
 nexus-db-queries = { path = "nexus/db-queries" }
 nexus-defaults = { path = "nexus/defaults" }
+nexus-inventory = { path = "nexus/inventory" }
 omicron-certificates = { path = "certificates" }
 omicron-passwords = { path = "passwords" }
 omicron-workspace-hack = "0.1.0"
@@ -369,8 +372,8 @@ tufaceous = { path = "tufaceous" }
 tufaceous-lib = { path = "tufaceous-lib" }
 unicode-width = "0.1.10"
 update-engine = { path = "update-engine" }
-uuid = { version = "1.4.1", features = ["serde", "v4"] }
 usdt = "0.3"
+uuid = { version = "1.4.1", features = ["serde", "v4"] }
 walkdir = "2.4"
 wicket = { path = "wicket" }
 wicket-common = { path = "wicket-common" }
diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs
index ad62c34f92..44de433603 100644
--- a/common/src/nexus_config.rs
+++ b/common/src/nexus_config.rs
@@ -311,6 +311,8 @@ pub struct BackgroundTaskConfig {
     pub dns_external: DnsTasksConfig,
     /// configuration for external endpoint list watcher
     pub external_endpoints: ExternalEndpointsConfig,
+    /// configuration for inventory tasks
+    pub inventory: InventoryConfig,
 }
 
 #[serde_as]
@@ -345,6 +347,24 @@ pub struct ExternalEndpointsConfig {
     // allow/disallow wildcard certs, don't serve expired certs, etc.)
 }
 
+#[serde_as]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
+pub struct InventoryConfig {
+    /// period (in seconds) for periodic activations of this background task
+    ///
+    /// Each activation fetches information about all hardware and software in
+    /// the system and inserts it into the database. This generates a moderate
+    /// amount of data.
+    #[serde_as(as = "DurationSeconds<u64>")]
+    pub period_secs: Duration,
+
+    /// maximum number of past collections to keep in the database
+    ///
+    /// This is a very coarse mechanism to keep the system from overwhelming
+    /// itself with inventory data.
+ pub nkeep: u32, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -440,18 +460,15 @@ impl std::fmt::Display for SchemeName { #[cfg(test)] mod test { - use super::Tunables; use super::{ - AuthnConfig, Config, ConsoleConfig, LoadError, PackageConfig, - SchemeName, TimeseriesDbConfig, UpdatesConfig, + AuthnConfig, BackgroundTaskConfig, Config, ConfigDropshotWithTls, + ConsoleConfig, Database, DeploymentConfig, DnsTasksConfig, DpdConfig, + ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError, + LoadErrorKind, PackageConfig, SchemeName, TimeseriesDbConfig, Tunables, + UpdatesConfig, }; use crate::address::{Ipv6Subnet, RACK_PREFIX}; use crate::api::internal::shared::SwitchLocation; - use crate::nexus_config::{ - BackgroundTaskConfig, ConfigDropshotWithTls, Database, - DeploymentConfig, DnsTasksConfig, DpdConfig, ExternalEndpointsConfig, - InternalDns, LoadErrorKind, - }; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingIfExists; @@ -596,6 +613,8 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + inventory.period_secs = 10 + inventory.nkeep = 11 [default_region_allocation_strategy] type = "random" seed = 0 @@ -680,6 +699,10 @@ mod test { }, external_endpoints: ExternalEndpointsConfig { period_secs: Duration::from_secs(9), + }, + inventory: InventoryConfig { + period_secs: Duration::from_secs(10), + nkeep: 11, } }, default_region_allocation_strategy: @@ -733,6 +756,8 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + inventory.period_secs = 10 + inventory.nkeep = 3 [default_region_allocation_strategy] type = "random" "##, diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 881b5831ba..4546a6e543 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -30,6 +30,7 @@ use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; use diesel::JoinOnDsl; use diesel::NullableExpressionMethods; +use nexus_db_model::CabooseWhich; use nexus_db_model::Dataset; use nexus_db_model::Disk; use nexus_db_model::DnsGroup; @@ -37,14 +38,23 @@ use nexus_db_model::DnsName; use nexus_db_model::DnsVersion; use nexus_db_model::DnsZone; use nexus_db_model::ExternalIp; +use nexus_db_model::HwBaseboardId; use nexus_db_model::Instance; +use nexus_db_model::InvCaboose; +use nexus_db_model::InvCollection; +use nexus_db_model::InvCollectionError; +use nexus_db_model::InvRootOfTrust; +use nexus_db_model::InvServiceProcessor; use nexus_db_model::Project; use nexus_db_model::Region; use nexus_db_model::Sled; +use nexus_db_model::SpType; +use nexus_db_model::SwCaboose; use nexus_db_model::Vmm; use nexus_db_model::Zpool; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::DataStoreConnection; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::lookup::LookupPath; @@ -58,6 +68,7 @@ use omicron_common::api::external::Generation; use omicron_common::postgres_config::PostgresConfigWithUrl; use std::cmp::Ordering; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::collections::HashSet; use std::fmt::Display; use std::num::NonZeroU32; @@ -128,14 +139,16 @@ enum DbCommands { Disks(DiskArgs), /// Print information 
about internal and external DNS Dns(DnsArgs), - /// Print information about control plane services - Services(ServicesArgs), - /// Print information about sleds - Sleds, /// Print information about customer instances Instances, + /// Print information about collected hardware/software inventory + Inventory(InventoryArgs), /// Print information about the network Network(NetworkArgs), + /// Print information about control plane services + Services(ServicesArgs), + /// Print information about sleds + Sleds, } #[derive(Debug, Args)] @@ -206,6 +219,42 @@ impl CliDnsGroup { } } +#[derive(Debug, Args)] +struct InventoryArgs { + #[command(subcommand)] + command: InventoryCommands, +} + +#[derive(Debug, Subcommand)] +enum InventoryCommands { + /// list all baseboards ever found + BaseboardIds, + /// list all cabooses ever found + Cabooses, + /// list and show details from particular collections + Collections(CollectionsArgs), +} + +#[derive(Debug, Args)] +struct CollectionsArgs { + #[command(subcommand)] + command: CollectionsCommands, +} + +#[derive(Debug, Subcommand)] +enum CollectionsCommands { + /// list collections + List, + /// show what was found in a particular collection + Show(CollectionsShowArgs), +} + +#[derive(Debug, Args)] +struct CollectionsShowArgs { + /// id of the collection + id: Uuid, +} + #[derive(Debug, Args)] struct ServicesArgs { #[command(subcommand)] @@ -309,6 +358,20 @@ impl DbArgs { cmd_db_dns_names(&opctx, &datastore, self.fetch_limit, args) .await } + DbCommands::Instances => { + cmd_db_instances(&datastore, self.fetch_limit).await + } + DbCommands::Inventory(inventory_args) => { + cmd_db_inventory(&datastore, self.fetch_limit, inventory_args) + .await + } + DbCommands::Network(NetworkArgs { + command: NetworkCommands::ListEips, + verbose, + }) => { + cmd_db_eips(&opctx, &datastore, self.fetch_limit, *verbose) + .await + } DbCommands::Services(ServicesArgs { command: ServicesCommands::ListInstances, }) => { @@ -332,16 +395,6 @@ impl DbArgs { DbCommands::Sleds => { cmd_db_sleds(&opctx, &datastore, self.fetch_limit).await } - DbCommands::Instances => { - cmd_db_instances(&datastore, self.fetch_limit).await - } - DbCommands::Network(NetworkArgs { - command: NetworkCommands::ListEips, - verbose, - }) => { - cmd_db_eips(&opctx, &datastore, self.fetch_limit, *verbose) - .await - } } } } @@ -1398,3 +1451,554 @@ fn format_record(record: &DnsRecord) -> impl Display { } } } + +// Inventory + +async fn cmd_db_inventory( + datastore: &DataStore, + limit: NonZeroU32, + inventory_args: &InventoryArgs, +) -> Result<(), anyhow::Error> { + let conn = datastore.pool_connection_for_tests().await?; + match inventory_args.command { + InventoryCommands::BaseboardIds => { + cmd_db_inventory_baseboard_ids(&conn, limit).await + } + InventoryCommands::Cabooses => { + cmd_db_inventory_cabooses(&conn, limit).await + } + InventoryCommands::Collections(CollectionsArgs { + command: CollectionsCommands::List, + }) => cmd_db_inventory_collections_list(&conn, limit).await, + InventoryCommands::Collections(CollectionsArgs { + command: CollectionsCommands::Show(CollectionsShowArgs { id }), + }) => cmd_db_inventory_collections_show(&conn, id, limit).await, + } +} + +async fn cmd_db_inventory_baseboard_ids( + conn: &DataStoreConnection<'_>, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct BaseboardRow { + id: Uuid, + part_number: String, + serial_number: String, + } + + use db::schema::hw_baseboard_id::dsl; + let 
baseboard_ids = dsl::hw_baseboard_id + .order_by((dsl::part_number, dsl::serial_number)) + .limit(i64::from(u32::from(limit))) + .select(HwBaseboardId::as_select()) + .load_async(&**conn) + .await + .context("loading baseboard ids")?; + check_limit(&baseboard_ids, limit, || "loading baseboard ids"); + + let rows = baseboard_ids.into_iter().map(|baseboard_id| BaseboardRow { + id: baseboard_id.id, + part_number: baseboard_id.part_number, + serial_number: baseboard_id.serial_number, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +async fn cmd_db_inventory_cabooses( + conn: &DataStoreConnection<'_>, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CabooseRow { + id: Uuid, + board: String, + git_commit: String, + name: String, + version: String, + } + + use db::schema::sw_caboose::dsl; + let mut cabooses = dsl::sw_caboose + .limit(i64::from(u32::from(limit))) + .select(SwCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading cabooses")?; + check_limit(&cabooses, limit, || "loading cabooses"); + cabooses.sort(); + + let rows = cabooses.into_iter().map(|caboose| CabooseRow { + id: caboose.id, + board: caboose.board, + name: caboose.name, + version: caboose.version, + git_commit: caboose.git_commit, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +async fn cmd_db_inventory_collections_list( + conn: &DataStoreConnection<'_>, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CollectionRow { + id: Uuid, + started: String, + took: String, + nsps: i64, + nerrors: i64, + } + + let collections = { + use db::schema::inv_collection::dsl; + dsl::inv_collection + .order_by(dsl::time_started) + .limit(i64::from(u32::from(limit))) + .select(InvCollection::as_select()) + .load_async(&**conn) + .await + .context("loading collections")? + }; + check_limit(&collections, limit, || "loading collections"); + + let mut rows = Vec::new(); + for collection in collections { + let nerrors = { + use db::schema::inv_collection_error::dsl; + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(collection.id)) + .select(diesel::dsl::count_star()) + .first_async(&**conn) + .await + .context("counting errors")? + }; + + let nsps = { + use db::schema::inv_service_processor::dsl; + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(collection.id)) + .select(diesel::dsl::count_star()) + .first_async(&**conn) + .await + .context("counting SPs")? 
+        };
+
+        let took = format!(
+            "{} ms",
+            collection
+                .time_done
+                .signed_duration_since(&collection.time_started)
+                .num_milliseconds()
+        );
+        rows.push(CollectionRow {
+            id: collection.id,
+            started: humantime::format_rfc3339_seconds(
+                collection.time_started.into(),
+            )
+            .to_string(),
+            took,
+            nsps,
+            nerrors,
+        });
+    }
+
+    let table = tabled::Table::new(rows)
+        .with(tabled::settings::Style::empty())
+        .with(tabled::settings::Padding::new(0, 1, 0, 0))
+        .to_string();
+
+    println!("{}", table);
+
+    Ok(())
+}
+
+async fn cmd_db_inventory_collections_show(
+    conn: &DataStoreConnection<'_>,
+    id: Uuid,
+    limit: NonZeroU32,
+) -> Result<(), anyhow::Error> {
+    inv_collection_print(conn, id).await?;
+    let nerrors = inv_collection_print_errors(conn, id, limit).await?;
+
+    // Load all the baseboards. We could select only the baseboards referenced
+    // by this collection. But it's simpler to fetch everything. And it's
+    // uncommon enough at this point to have unreferenced baseboards that it's
+    // worth calling them out.
+    let baseboard_ids = {
+        use db::schema::hw_baseboard_id::dsl;
+        let baseboard_ids = dsl::hw_baseboard_id
+            .limit(i64::from(u32::from(limit)))
+            .select(HwBaseboardId::as_select())
+            .load_async(&**conn)
+            .await
+            .context("loading baseboard ids")?;
+        check_limit(&baseboard_ids, limit, || "loading baseboard ids");
+        baseboard_ids
+            .into_iter()
+            .map(|b| (b.id, b))
+            .collect::<BTreeMap<_, _>>()
+    };
+
+    // Similarly, load cabooses that are referenced by this collection.
+    let cabooses = {
+        use db::schema::inv_caboose::dsl as inv_dsl;
+        use db::schema::sw_caboose::dsl as sw_dsl;
+        let unique_cabooses = inv_dsl::inv_caboose
+            .filter(inv_dsl::inv_collection_id.eq(id))
+            .select(inv_dsl::sw_caboose_id)
+            .distinct();
+        let cabooses = sw_dsl::sw_caboose
+            .filter(sw_dsl::id.eq_any(unique_cabooses))
+            .limit(i64::from(u32::from(limit)))
+            .select(SwCaboose::as_select())
+            .load_async(&**conn)
+            .await
+            .context("loading cabooses")?;
+        check_limit(&cabooses, limit, || "loading cabooses");
+        cabooses.into_iter().map(|c| (c.id, c)).collect::<BTreeMap<_, _>>()
+    };
+
+    inv_collection_print_devices(conn, id, limit, &baseboard_ids, &cabooses)
+        .await?;
+
+    if nerrors > 0 {
+        eprintln!(
+            "warning: {} collection error{} {} reported above",
+            nerrors,
+            if nerrors == 1 { "" } else { "s" },
+            if nerrors == 1 { "was" } else { "were" }
+        );
+    }
+
+    Ok(())
+}
+
+async fn inv_collection_print(
+    conn: &DataStoreConnection<'_>,
+    id: Uuid,
+) -> Result<(), anyhow::Error> {
+    use db::schema::inv_collection::dsl;
+    let collections = dsl::inv_collection
+        .filter(dsl::id.eq(id))
+        .limit(2)
+        .select(InvCollection::as_select())
+        .load_async(&**conn)
+        .await
+        .context("loading collection")?;
+    anyhow::ensure!(
+        collections.len() == 1,
+        "expected exactly one collection with id {}, found {}",
+        id,
+        collections.len()
+    );
+    let c = collections.into_iter().next().unwrap();
+    println!("collection: {}", c.id);
+    println!(
+        "collector: {}{}",
+        c.collector,
+        if c.collector.parse::<Uuid>().is_ok() {
+            " (likely a Nexus instance)"
+        } else {
+            ""
+        }
+    );
+    println!(
+        "started: {}",
+        humantime::format_rfc3339_millis(c.time_started.into())
+    );
+    println!(
+        "done: {}",
+        humantime::format_rfc3339_millis(c.time_done.into())
+    );
+
+    Ok(())
+}
+
+async fn inv_collection_print_errors(
+    conn: &DataStoreConnection<'_>,
+    id: Uuid,
+    limit: NonZeroU32,
+) -> Result<u32, anyhow::Error> {
+    use db::schema::inv_collection_error::dsl;
+    let errors = dsl::inv_collection_error
+        .filter(dsl::inv_collection_id.eq(id))
+        .limit(i64::from(u32::from(limit)))
+        .select(InvCollectionError::as_select())
+        .load_async(&**conn)
+        .await
+        .context("loading collection errors")?;
+    check_limit(&errors, limit, || "loading collection errors");
+
+    println!("errors: {}", errors.len());
+    for e in &errors {
+        println!(" error {}: {}", e.idx, e.message);
+    }
+
+    Ok(errors
+        .len()
+        .try_into()
+        .expect("could not convert error count into u32 (yikes)"))
+}
+
+async fn inv_collection_print_devices(
+    conn: &DataStoreConnection<'_>,
+    id: Uuid,
+    limit: NonZeroU32,
+    baseboard_ids: &BTreeMap<Uuid, HwBaseboardId>,
+    sw_cabooses: &BTreeMap<Uuid, SwCaboose>,
+) -> Result<(), anyhow::Error> {
+    // Load the service processors, grouped by baseboard id.
+    let sps: BTreeMap<Uuid, InvServiceProcessor> = {
+        use db::schema::inv_service_processor::dsl;
+        let sps = dsl::inv_service_processor
+            .filter(dsl::inv_collection_id.eq(id))
+            .limit(i64::from(u32::from(limit)))
+            .select(InvServiceProcessor::as_select())
+            .load_async(&**conn)
+            .await
+            .context("loading service processors")?;
+        check_limit(&sps, limit, || "loading service processors");
+        sps.into_iter().map(|s| (s.hw_baseboard_id, s)).collect()
+    };
+
+    // Load the roots of trust, grouped by baseboard id.
+    let rots: BTreeMap<Uuid, InvRootOfTrust> = {
+        use db::schema::inv_root_of_trust::dsl;
+        let rots = dsl::inv_root_of_trust
+            .filter(dsl::inv_collection_id.eq(id))
+            .limit(i64::from(u32::from(limit)))
+            .select(InvRootOfTrust::as_select())
+            .load_async(&**conn)
+            .await
+            .context("loading roots of trust")?;
+        check_limit(&rots, limit, || "loading roots of trust");
+        rots.into_iter().map(|s| (s.hw_baseboard_id, s)).collect()
+    };
+
+    // Load cabooses found, grouped by baseboard id.
+    let inv_cabooses = {
+        use db::schema::inv_caboose::dsl;
+        let cabooses_found = dsl::inv_caboose
+            .filter(dsl::inv_collection_id.eq(id))
+            .limit(i64::from(u32::from(limit)))
+            .select(InvCaboose::as_select())
+            .load_async(&**conn)
+            .await
+            .context("loading cabooses found")?;
+        check_limit(&cabooses_found, limit, || "loading cabooses found");
+
+        let mut cabooses: BTreeMap<Uuid, Vec<InvCaboose>> = BTreeMap::new();
+        for ic in cabooses_found {
+            cabooses
+                .entry(ic.hw_baseboard_id)
+                .or_insert_with(Vec::new)
+                .push(ic);
+        }
+        cabooses
+    };
+
+    // Assemble a list of baseboard ids, sorted first by device type (sled,
+    // switch, power), then by slot number. This is the order in which we will
+    // print everything out.
+    let mut sorted_baseboard_ids: Vec<_> = sps.keys().cloned().collect();
+    sorted_baseboard_ids.sort_by(|s1, s2| {
+        let sp1 = sps.get(s1).unwrap();
+        let sp2 = sps.get(s2).unwrap();
+        sp1.sp_type.cmp(&sp2.sp_type).then(sp1.sp_slot.cmp(&sp2.sp_slot))
+    });
+
+    // Now print them.
+    for baseboard_id in &sorted_baseboard_ids {
+        // This unwrap should not fail because the collection we're iterating
+        // over came from the one we're looking into now.
+        let sp = sps.get(baseboard_id).unwrap();
+        let baseboard = baseboard_ids.get(baseboard_id);
+        let rot = rots.get(baseboard_id);
+
+        println!("");
+        match baseboard {
+            None => {
+                // It should be impossible to find an SP whose baseboard
+                // information we didn't previously fetch. That's either a bug
+                // in this tool (for failing to fetch or find the right
+                // baseboard information) or the inventory system (for failing
+                // to insert a record into the hw_baseboard_id table).
+                println!(
+                    "{:?} (serial number unknown -- this is a bug)",
+                    sp.sp_type
+                );
+                println!(" part number: unknown");
+            }
+            Some(baseboard) => {
+                println!("{:?} {}", sp.sp_type, baseboard.serial_number);
+                println!(" part number: {}", baseboard.part_number);
+            }
+        };
+
+        println!(" power: {:?}", sp.power_state);
+        println!(" revision: {}", sp.baseboard_revision);
+        print!(" MGS slot: {:?} {}", sp.sp_type, sp.sp_slot);
+        if let SpType::Sled = sp.sp_type {
+            print!(" (cubby {})", sp.sp_slot);
+        }
+        println!("");
+        println!(" found at: {} from {}", sp.time_collected, sp.source);
+
+        println!(" cabooses:");
+        if let Some(my_inv_cabooses) = inv_cabooses.get(baseboard_id) {
+            #[derive(Tabled)]
+            #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+            struct CabooseRow<'a> {
+                slot: &'static str,
+                board: &'a str,
+                name: &'a str,
+                version: &'a str,
+                git_commit: &'a str,
+            }
+            let mut nbugs = 0;
+            let rows = my_inv_cabooses.iter().map(|ic| {
+                let slot = match ic.which {
+                    CabooseWhich::SpSlot0 => " SP slot 0",
+                    CabooseWhich::SpSlot1 => " SP slot 1",
+                    CabooseWhich::RotSlotA => "RoT slot A",
+                    CabooseWhich::RotSlotB => "RoT slot B",
+                };
+
+                let (board, name, version, git_commit) =
+                    match sw_cabooses.get(&ic.sw_caboose_id) {
+                        None => {
+                            nbugs += 1;
+                            ("-", "-", "-", "-")
+                        }
+                        Some(c) => (
+                            c.board.as_str(),
+                            c.name.as_str(),
+                            c.version.as_str(),
+                            c.git_commit.as_str(),
+                        ),
+                    };
+
+                CabooseRow { slot, board, name, version, git_commit }
+            });
+
+            let table = tabled::Table::new(rows)
+                .with(tabled::settings::Style::empty())
+                .with(tabled::settings::Padding::new(0, 1, 0, 0))
+                .to_string();
+
+            println!("{}", textwrap::indent(&table.to_string(), " "));
+
+            if nbugs > 0 {
+                // Similar to above, if we don't have the sw_caboose for some
+                // inv_caboose, then it's a bug in either this tool (if we
+                // failed to fetch it) or the inventory system (if it failed to
+                // insert it).
+                println!(
+                    "error: at least one caboose above was missing data \
+                    -- this is a bug"
+                );
+            }
+        }
+
+        if let Some(rot) = rot {
+            println!(" RoT: active slot: slot {:?}", rot.slot_active);
+            println!(
+                " RoT: persistent boot preference: slot {:?}",
+                rot.slot_boot_pref_persistent
+            );
+            println!(
+                " RoT: pending persistent boot preference: {}",
+                rot.slot_boot_pref_persistent_pending
+                    .map(|s| format!("slot {:?}", s))
+                    .unwrap_or_else(|| String::from("-"))
+            );
+            println!(
+                " RoT: transient boot preference: {}",
+                rot.slot_boot_pref_transient
+                    .map(|s| format!("slot {:?}", s))
+                    .unwrap_or_else(|| String::from("-"))
+            );
+
+            println!(
+                " RoT: slot A SHA3-256: {}",
+                rot.slot_a_sha3_256
+                    .clone()
+                    .unwrap_or_else(|| String::from("-"))
+            );
+
+            println!(
+                " RoT: slot B SHA3-256: {}",
+                rot.slot_b_sha3_256
+                    .clone()
+                    .unwrap_or_else(|| String::from("-"))
+            );
+        } else {
+            println!(" RoT: no information found");
+        }
+    }
+
+    println!("");
+    for unused_baseboard in baseboard_ids
+        .keys()
+        .collect::<BTreeSet<_>>()
+        .difference(&sps.keys().collect::<BTreeSet<_>>())
+    {
+        // It's not a bug in either omdb or the inventory system to find a
+        // baseboard not referenced in the collection. It might just mean a
+        // sled was removed from the system. But at this point it's uncommon
+        // enough to call out.
+        let b = baseboard_ids.get(unused_baseboard).unwrap();
+        eprintln!(
+            "note: baseboard previously found, but not in this \
+            collection: part {} serial {}",
+            b.part_number, b.serial_number
+        );
+    }
+    for sp_missing_rot in sps
+        .keys()
+        .collect::<BTreeSet<_>>()
+        .difference(&rots.keys().collect::<BTreeSet<_>>())
+    {
+        // It's not a bug in either omdb or the inventory system to find an SP
+        // with no RoT. It just means that when we collected inventory from the
+        // SP, it couldn't communicate with its RoT.
+        let sp = sps.get(sp_missing_rot).unwrap();
+        println!(
+            "warning: found SP with no RoT: {:?} slot {}",
+            sp.sp_type, sp.sp_slot
+        );
+    }
+    for rot_missing_sp in rots
+        .keys()
+        .collect::<BTreeSet<_>>()
+        .difference(&sps.keys().collect::<BTreeSet<_>>())
+    {
+        // It *is* a bug in the inventory system (or omdb) to find an RoT with
+        // no SP, since we get the RoT information from the SP in the first
+        // place.
+        let rot = rots.get(rot_missing_sp).unwrap();
+        println!(
+            "error: found RoT with no SP: \
+            hw_baseboard_id {:?} -- this is a bug",
+            rot.hw_baseboard_id
+        );
+    }
+
+    Ok(())
+}
diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index 7599fc209d..cbfb6f91b8 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -6,6 +6,7 @@ use crate::Omdb;
 use anyhow::Context;
+use chrono::DateTime;
 use chrono::SecondsFormat;
 use chrono::Utc;
 use clap::Args;
@@ -478,6 +479,38 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
                 }
             }
         }
+    } else if name == "inventory_collection" {
+        #[derive(Deserialize)]
+        struct InventorySuccess {
+            collection_id: Uuid,
+            time_started: DateTime<Utc>,
+            time_done: DateTime<Utc>,
+        }
+
+        match serde_json::from_value::<InventorySuccess>(details.clone()) {
+            Err(error) => eprintln!(
+                "warning: failed to interpret task details: {:?}: {:?}",
+                error, details
+            ),
+            Ok(found_inventory) => {
+                println!(
+                    " last collection id: {}",
+                    found_inventory.collection_id
+                );
+                println!(
+                    " last collection started: {}",
+                    found_inventory
+                        .time_started
+                        .to_rfc3339_opts(SecondsFormat::Secs, true),
+                );
+                println!(
+                    " last collection done: {}",
+                    found_inventory
+                        .time_done
+                        .to_rfc3339_opts(SecondsFormat::Secs, true),
+                );
+            }
+        };
     } else {
         println!(
             "warning: unknown background task: {:?} \
diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out
index 8e345b78d1..2cf42c1b22 100644
--- a/dev-tools/omdb/tests/env.out
+++ b/dev-tools/omdb/tests/env.out
@@ -57,6 +57,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: using Nexus URL http://127.0.0.1:REDACTED_PORT
@@ -113,6 +117,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: Nexus URL not specified.  Will pick one from DNS.
@@ -156,6 +164,10 @@ task: "external_endpoints"
     on each one
 
 
+task: "inventory_collection"
+    collects hardware and software inventory data from the whole system
+
+
 ---------------------------------------------
 stderr:
 note: Nexus URL not specified.  Will pick one from DNS.
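Aside: the `inventory_collection` task details consumed by the omdb change above are just a small JSON object with a collection id and two timestamps. The following standalone sketch shows the expected shape; it is not part of the patch, the field values are invented, and it assumes the `serde` (with derive), `serde_json`, `chrono` (with its `serde` feature), `uuid` (with `serde`), and `anyhow` crates already used elsewhere in this workspace:

    // Sketch only: round-trips a hand-written `details` value through the
    // same shape that `print_task_details` expects for "inventory_collection".
    use chrono::{DateTime, Utc};
    use serde::Deserialize;
    use uuid::Uuid;

    #[derive(Deserialize)]
    struct InventorySuccess {
        collection_id: Uuid,
        time_started: DateTime<Utc>,
        time_done: DateTime<Utc>,
    }

    fn main() -> anyhow::Result<()> {
        // Illustrative values; the real object comes from Nexus's background
        // task status endpoint.
        let details = serde_json::json!({
            "collection_id": "f3b957a9-1f4e-4f3e-8d2a-7a2d5e7a2b10",
            "time_started": "2023-10-17T19:48:49Z",
            "time_done": "2023-10-17T19:48:52Z",
        });
        let parsed: InventorySuccess = serde_json::from_value(details)?;
        println!(
            "collection {} ran from {} to {}",
            parsed.collection_id, parsed.time_started, parsed.time_done
        );
        Ok(())
    }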
diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 6fd84c5eb3..2d03c697f5 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -227,6 +227,10 @@ task: "external_endpoints" on each one +task: "inventory_collection" + collects hardware and software inventory data from the whole system + + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ @@ -308,6 +312,15 @@ task: "external_endpoints" TLS certificates: 0 +task: "inventory_collection" + configured period: every 10m + currently executing: no + last completed activation: iter 3, triggered by an explicit signal + started at (s ago) and ran for ms + last collection id: REDACTED_UUID_REDACTED_UUID_REDACTED + last collection started: + last collection done: + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 7bedc3ecbc..dc75278fc3 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -92,10 +92,11 @@ Usage: omdb db [OPTIONS] Commands: disks Print information about disks dns Print information about internal and external DNS - services Print information about control plane services - sleds Print information about sleds instances Print information about customer instances + inventory Print information about collected hardware/software inventory network Print information about the network + services Print information about control plane services + sleds Print information about sleds help Print this message or the help of the given subcommand(s) Options: @@ -114,10 +115,11 @@ Usage: omdb db [OPTIONS] Commands: disks Print information about disks dns Print information about internal and external DNS - services Print information about control plane services - sleds Print information about sleds instances Print information about customer instances + inventory Print information about collected hardware/software inventory network Print information about the network + services Print information about control plane services + sleds Print information about sleds help Print this message or the help of the given subcommand(s) Options: diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index e79184f7e5..66778d96e7 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -403,6 +403,10 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { cptestctx.silo_name, cptestctx.external_dns_zone_name, ); + println!( + "omicron-dev: management gateway: http://{}", + cptestctx.gateway.client.bind_address, + ); println!("omicron-dev: silo name: {}", cptestctx.silo_name,); println!( "omicron-dev: privileged user name: {}", diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 3de6dac7c0..65a16b0d35 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -83,6 +83,7 @@ usdt.workspace = true nexus-defaults.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true +nexus-inventory.workspace = true nexus-types.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs new file mode 100644 index 0000000000..1e8b3e3ea2 --- /dev/null +++ b/nexus/db-model/src/inventory.rs @@ -0,0 +1,354 @@ +// This 
Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for representing the hardware/software inventory in the database + +use crate::schema::{ + hw_baseboard_id, inv_caboose, inv_collection, inv_collection_error, + inv_root_of_trust, inv_service_processor, sw_caboose, +}; +use crate::{impl_enum_type, SqlU16, SqlU32}; +use chrono::DateTime; +use chrono::Utc; +use diesel::backend::Backend; +use diesel::deserialize::{self, FromSql}; +use diesel::expression::AsExpression; +use diesel::pg::Pg; +use diesel::serialize::ToSql; +use diesel::{serialize, sql_types}; +use nexus_types::inventory::{ + BaseboardId, Caboose, Collection, PowerState, RotSlot, +}; +use uuid::Uuid; + +// See [`nexus_types::inventory::PowerState`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "hw_power_state"))] + pub struct HwPowerStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = HwPowerStateEnum)] + pub enum HwPowerState; + + // Enum values + A0 => b"A0" + A1 => b"A1" + A2 => b"A2" +); + +impl From for HwPowerState { + fn from(p: PowerState) -> Self { + match p { + PowerState::A0 => HwPowerState::A0, + PowerState::A1 => HwPowerState::A1, + PowerState::A2 => HwPowerState::A2, + } + } +} + +// See [`nexus_types::inventory::RotSlot`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "hw_rot_slot"))] + pub struct HwRotSlotEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = HwRotSlotEnum)] + pub enum HwRotSlot; + + // Enum values + A => b"A" + B => b"B" +); + +impl From for HwRotSlot { + fn from(value: RotSlot) -> Self { + match value { + RotSlot::A => HwRotSlot::A, + RotSlot::B => HwRotSlot::B, + } + } +} + +// See [`nexus_types::inventory::CabooseWhich`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "caboose_which"))] + pub struct CabooseWhichEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = CabooseWhichEnum)] + pub enum CabooseWhich; + + // Enum values + SpSlot0 => b"sp_slot_0" + SpSlot1 => b"sp_slot_1" + RotSlotA => b"rot_slot_A" + RotSlotB => b"rot_slot_B" +); + +impl From for CabooseWhich { + fn from(c: nexus_types::inventory::CabooseWhich) -> Self { + match c { + nexus_types::inventory::CabooseWhich::SpSlot0 => { + CabooseWhich::SpSlot0 + } + nexus_types::inventory::CabooseWhich::SpSlot1 => { + CabooseWhich::SpSlot1 + } + nexus_types::inventory::CabooseWhich::RotSlotA => { + CabooseWhich::RotSlotA + } + nexus_types::inventory::CabooseWhich::RotSlotB => { + CabooseWhich::RotSlotB + } + } + } +} + +// See [`nexus_types::inventory::SpType`]. 
+impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "sp_type"))] + pub struct SpTypeEnum; + + #[derive( + Copy, + Clone, + Debug, + AsExpression, + FromSqlRow, + PartialOrd, + Ord, + PartialEq, + Eq + )] + #[diesel(sql_type = SpTypeEnum)] + pub enum SpType; + + // Enum values + Sled => b"sled" + Switch => b"switch" + Power => b"power" +); + +impl From for SpType { + fn from(value: nexus_types::inventory::SpType) -> Self { + match value { + nexus_types::inventory::SpType::Sled => SpType::Sled, + nexus_types::inventory::SpType::Power => SpType::Power, + nexus_types::inventory::SpType::Switch => SpType::Switch, + } + } +} + +/// See [`nexus_types::inventory::Collection`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_collection)] +pub struct InvCollection { + pub id: Uuid, + pub time_started: DateTime, + pub time_done: DateTime, + pub collector: String, +} + +impl<'a> From<&'a Collection> for InvCollection { + fn from(c: &'a Collection) -> Self { + InvCollection { + id: c.id, + time_started: c.time_started, + time_done: c.time_done, + collector: c.collector.clone(), + } + } +} + +/// See [`nexus_types::inventory::HwBaseboardId`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = hw_baseboard_id)] +pub struct HwBaseboardId { + pub id: Uuid, + pub part_number: String, + pub serial_number: String, +} + +impl<'a> From<&'a BaseboardId> for HwBaseboardId { + fn from(c: &'a BaseboardId) -> Self { + HwBaseboardId { + id: Uuid::new_v4(), + part_number: c.part_number.clone(), + serial_number: c.serial_number.clone(), + } + } +} + +/// See [`nexus_types::inventory::SwCaboose`]. +#[derive( + Queryable, + Insertable, + Clone, + Debug, + Selectable, + Eq, + PartialEq, + Ord, + PartialOrd, +)] +#[diesel(table_name = sw_caboose)] +pub struct SwCaboose { + pub id: Uuid, + pub board: String, + pub git_commit: String, + pub name: String, + pub version: String, +} + +impl<'a> From<&'a Caboose> for SwCaboose { + fn from(c: &'a Caboose) -> Self { + SwCaboose { + id: Uuid::new_v4(), + board: c.board.clone(), + git_commit: c.git_commit.clone(), + name: c.name.clone(), + version: c.version.clone(), + } + } +} + +/// See [`nexus_types::inventory::Collection`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_collection_error)] +pub struct InvCollectionError { + pub inv_collection_id: Uuid, + pub idx: SqlU16, + pub message: String, +} + +impl InvCollectionError { + pub fn new(inv_collection_id: Uuid, idx: u16, message: String) -> Self { + InvCollectionError { + inv_collection_id, + idx: SqlU16::from(idx), + message, + } + } +} + +/// See [`nexus_types::inventory::ServiceProcessor`]. +#[derive(Queryable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_service_processor)] +pub struct InvServiceProcessor { + pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, + pub time_collected: DateTime, + pub source: String, + + pub sp_type: SpType, + pub sp_slot: SpMgsSlot, + + pub baseboard_revision: BaseboardRevision, + pub hubris_archive_id: String, + pub power_state: HwPowerState, +} + +/// Newtype wrapping the MGS-reported slot number for an SP +/// +/// Current racks only have 32 slots for any given SP type. MGS represents the +/// slot number with a u32. We truncate it to a u16 (which still requires +/// storing it as an i32 in the database, since the database doesn't natively +/// support signed integers). 
+#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow)]
+#[diesel(sql_type = sql_types::Int4)]
+pub struct SpMgsSlot(SqlU16);
+
+NewtypeFrom! { () pub struct SpMgsSlot(SqlU16); }
+NewtypeDeref! { () pub struct SpMgsSlot(SqlU16); }
+NewtypeDisplay! { () pub struct SpMgsSlot(SqlU16); }
+
+impl ToSql<sql_types::Int4, Pg> for SpMgsSlot {
+    fn to_sql<'a>(
+        &'a self,
+        out: &mut serialize::Output<'a, '_, Pg>,
+    ) -> serialize::Result {
+        <SqlU16 as ToSql<sql_types::Int4, Pg>>::to_sql(
+            &self.0,
+            &mut out.reborrow(),
+        )
+    }
+}
+
+impl<DB> FromSql<sql_types::Int4, DB> for SpMgsSlot
+where
+    DB: Backend,
+    SqlU16: FromSql<sql_types::Int4, DB>,
+{
+    fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result<Self> {
+        Ok(SpMgsSlot(SqlU16::from_sql(bytes)?))
+    }
+}
+
+/// Newtype wrapping the revision number for a particular baseboard
+///
+/// MGS reports this as a u32 and we represent it the same way, though that
+/// would be quite a lot of hardware revisions to go through!
+#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow)]
+#[diesel(sql_type = sql_types::Int8)]
+pub struct BaseboardRevision(SqlU32);
+
+NewtypeFrom! { () pub struct BaseboardRevision(SqlU32); }
+NewtypeDeref! { () pub struct BaseboardRevision(SqlU32); }
+NewtypeDisplay! { () pub struct BaseboardRevision(SqlU32); }
+
+impl ToSql<sql_types::Int8, Pg> for BaseboardRevision {
+    fn to_sql<'a>(
+        &'a self,
+        out: &mut serialize::Output<'a, '_, Pg>,
+    ) -> serialize::Result {
+        <SqlU32 as ToSql<sql_types::Int8, Pg>>::to_sql(
+            &self.0,
+            &mut out.reborrow(),
+        )
+    }
+}
+
+impl<DB> FromSql<sql_types::Int8, DB> for BaseboardRevision
+where
+    DB: Backend,
+    SqlU32: FromSql<sql_types::Int8, DB>,
+{
+    fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result<Self> {
+        Ok(BaseboardRevision(SqlU32::from_sql(bytes)?))
+    }
+}
+
+/// See [`nexus_types::inventory::RootOfTrust`].
+#[derive(Queryable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_root_of_trust)]
+pub struct InvRootOfTrust {
+    pub inv_collection_id: Uuid,
+    pub hw_baseboard_id: Uuid,
+    pub time_collected: DateTime<Utc>,
+    pub source: String,
+
+    pub slot_active: HwRotSlot,
+    pub slot_boot_pref_transient: Option<HwRotSlot>,
+    pub slot_boot_pref_persistent: HwRotSlot,
+    pub slot_boot_pref_persistent_pending: Option<HwRotSlot>,
+    pub slot_a_sha3_256: Option<String>,
+    pub slot_b_sha3_256: Option<String>,
+}
+
+/// See [`nexus_types::inventory::Caboose`].
+#[derive(Queryable, Clone, Debug, Selectable)]
+#[diesel(table_name = inv_caboose)]
+pub struct InvCaboose {
+    pub inv_collection_id: Uuid,
+    pub hw_baseboard_id: Uuid,
+    pub time_collected: DateTime<Utc>,
+    pub source: String,
+
+    pub which: CabooseWhich,
+    pub sw_caboose_id: Uuid,
+}
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index f1447fc503..a424551e7b 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -31,6 +31,7 @@ mod image;
 mod instance;
 mod instance_cpu_count;
 mod instance_state;
+mod inventory;
 mod ip_pool;
 mod ipv4net;
 mod ipv6;
@@ -119,6 +120,7 @@ pub use image::*;
 pub use instance::*;
 pub use instance_cpu_count::*;
 pub use instance_state::*;
+pub use inventory::*;
 pub use ip_pool::*;
 pub use ipv4net::*;
 pub use ipv6::*;
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index 61a05754c6..0b41733e6d 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -1127,6 +1127,87 @@ table! {
     }
 }
 
+/* hardware inventory */
+
+table! {
+    hw_baseboard_id (id) {
+        id -> Uuid,
+        part_number -> Text,
+        serial_number -> Text,
+    }
+}
+
+table! {
+    sw_caboose (id) {
+        id -> Uuid,
+        board -> Text,
+        git_commit -> Text,
+        name -> Text,
+        version -> Text,
+    }
+}
+
+table! {
+    inv_collection (id) {
+        id -> Uuid,
+        time_started -> Timestamptz,
+        time_done -> Timestamptz,
+        collector -> Text,
+    }
+}
+
+table! {
+    inv_collection_error (inv_collection_id, idx) {
+        inv_collection_id -> Uuid,
+        idx -> Int4,
+        message -> Text,
+    }
+}
+
+table! {
+    inv_service_processor (inv_collection_id, hw_baseboard_id) {
+        inv_collection_id -> Uuid,
+        hw_baseboard_id -> Uuid,
+        time_collected -> Timestamptz,
+        source -> Text,
+
+        sp_type -> crate::SpTypeEnum,
+        sp_slot -> Int4,
+
+        baseboard_revision -> Int8,
+        hubris_archive_id -> Text,
+        power_state -> crate::HwPowerStateEnum,
+    }
+}
+
+table! {
+    inv_root_of_trust (inv_collection_id, hw_baseboard_id) {
+        inv_collection_id -> Uuid,
+        hw_baseboard_id -> Uuid,
+        time_collected -> Timestamptz,
+        source -> Text,
+
+        slot_active -> crate::HwRotSlotEnum,
+        slot_boot_pref_transient -> Nullable<crate::HwRotSlotEnum>,
+        slot_boot_pref_persistent -> crate::HwRotSlotEnum,
+        slot_boot_pref_persistent_pending -> Nullable<crate::HwRotSlotEnum>,
+        slot_a_sha3_256 -> Nullable<Text>,
+        slot_b_sha3_256 -> Nullable<Text>,
+    }
+}
+
+table! {
+    inv_caboose (inv_collection_id, hw_baseboard_id, which) {
+        inv_collection_id -> Uuid,
+        hw_baseboard_id -> Uuid,
+        time_collected -> Timestamptz,
+        source -> Text,
+
+        which -> crate::CabooseWhichEnum,
+        sw_caboose_id -> Uuid,
+    }
+}
+
 table! {
     db_metadata (singleton) {
         singleton -> Bool,
@@ -1154,6 +1235,10 @@ joinable!(system_update_component_update -> component_update (component_update_i
 allow_tables_to_appear_in_same_query!(ip_pool_range, ip_pool);
 joinable!(ip_pool_range -> ip_pool (ip_pool_id));
 
+allow_tables_to_appear_in_same_query!(inv_collection, inv_collection_error);
+joinable!(inv_collection_error -> inv_collection (inv_collection_id));
+allow_tables_to_appear_in_same_query!(sw_caboose, inv_caboose);
+
 allow_tables_to_appear_in_same_query!(
     dataset,
     disk,
diff --git a/nexus/db-model/src/unsigned.rs b/nexus/db-model/src/unsigned.rs
index 7059c6bcad..b4e9db2308 100644
--- a/nexus/db-model/src/unsigned.rs
+++ b/nexus/db-model/src/unsigned.rs
@@ -83,6 +83,7 @@ pub struct SqlU16(pub u16);
 
 NewtypeFrom! { () pub struct SqlU16(u16); }
 NewtypeDeref! { () pub struct SqlU16(u16); }
+NewtypeDisplay! { () pub struct SqlU16(u16); }
 
 impl SqlU16 {
     pub fn new(value: u16) -> Self {
@@ -134,6 +135,7 @@ pub struct SqlU32(pub u32);
 
 NewtypeFrom! { () pub struct SqlU32(u32); }
 NewtypeDeref! { () pub struct SqlU32(u32); }
+NewtypeDisplay!
{ () pub struct SqlU32(u32); } impl SqlU32 { pub fn new(value: u32) -> Self { diff --git a/nexus/db-queries/src/authz/api_resources.rs b/nexus/db-queries/src/authz/api_resources.rs index ec959e2907..b22fe1ac25 100644 --- a/nexus/db-queries/src/authz/api_resources.rs +++ b/nexus/db-queries/src/authz/api_resources.rs @@ -473,6 +473,61 @@ impl AuthorizedResource for DeviceAuthRequestList { } } +/// Synthetic resource used for modeling access to low-level hardware inventory +/// data +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Inventory; +pub const INVENTORY: Inventory = Inventory {}; + +impl oso::PolarClass for Inventory { + fn get_polar_class_builder() -> oso::ClassBuilder { + // Roles are not directly attached to Inventory + oso::Class::builder() + .with_equality_check() + .add_method( + "has_role", + |_: &Inventory, _actor: AuthenticatedActor, _role: String| { + false + }, + ) + .add_attribute_getter("fleet", |_| FLEET) + } +} + +impl AuthorizedResource for Inventory { + fn load_roles<'a, 'b, 'c, 'd, 'e, 'f>( + &'a self, + opctx: &'b OpContext, + datastore: &'c DataStore, + authn: &'d authn::Context, + roleset: &'e mut RoleSet, + ) -> futures::future::BoxFuture<'f, Result<(), Error>> + where + 'a: 'f, + 'b: 'f, + 'c: 'f, + 'd: 'f, + 'e: 'f, + { + load_roles_for_resource_tree(&FLEET, opctx, datastore, authn, roleset) + .boxed() + } + + fn on_unauthorized( + &self, + _: &Authz, + error: Error, + _: AnyActor, + _: Action, + ) -> Error { + error + } + + fn polar_class(&self) -> oso::Class { + Self::get_polar_class() + } +} + /// Synthetic resource describing the list of Certificates associated with a /// Silo #[derive(Clone, Debug, Eq, PartialEq)] diff --git a/nexus/db-queries/src/authz/omicron.polar b/nexus/db-queries/src/authz/omicron.polar index 119eccc8e9..87fdf72f6a 100644 --- a/nexus/db-queries/src/authz/omicron.polar +++ b/nexus/db-queries/src/authz/omicron.polar @@ -365,6 +365,16 @@ resource DnsConfig { has_relation(fleet: Fleet, "parent_fleet", dns_config: DnsConfig) if dns_config.fleet = fleet; +# Describes the policy for reading and modifying low-level inventory +resource Inventory { + permissions = [ "read", "modify" ]; + relations = { parent_fleet: Fleet }; + "read" if "viewer" on "parent_fleet"; + "modify" if "admin" on "parent_fleet"; +} +has_relation(fleet: Fleet, "parent_fleet", inventory: Inventory) + if inventory.fleet = fleet; + # Describes the policy for accessing "/v1/system/ip-pools" in the API resource IpPoolList { permissions = [ diff --git a/nexus/db-queries/src/authz/oso_generic.rs b/nexus/db-queries/src/authz/oso_generic.rs index bcd7a42945..e642062ead 100644 --- a/nexus/db-queries/src/authz/oso_generic.rs +++ b/nexus/db-queries/src/authz/oso_generic.rs @@ -106,6 +106,7 @@ pub fn make_omicron_oso(log: &slog::Logger) -> Result { Database::get_polar_class(), DnsConfig::get_polar_class(), Fleet::get_polar_class(), + Inventory::get_polar_class(), IpPoolList::get_polar_class(), ConsoleSessionList::get_polar_class(), DeviceAuthRequestList::get_polar_class(), diff --git a/nexus/db-queries/src/authz/policy_test/resource_builder.rs b/nexus/db-queries/src/authz/policy_test/resource_builder.rs index a4c68ea000..f10c969038 100644 --- a/nexus/db-queries/src/authz/policy_test/resource_builder.rs +++ b/nexus/db-queries/src/authz/policy_test/resource_builder.rs @@ -244,9 +244,10 @@ macro_rules! 
impl_dyn_authorized_resource_for_global { impl_dyn_authorized_resource_for_global!(authz::oso_generic::Database); impl_dyn_authorized_resource_for_global!(authz::ConsoleSessionList); +impl_dyn_authorized_resource_for_global!(authz::DeviceAuthRequestList); impl_dyn_authorized_resource_for_global!(authz::DnsConfig); impl_dyn_authorized_resource_for_global!(authz::IpPoolList); -impl_dyn_authorized_resource_for_global!(authz::DeviceAuthRequestList); +impl_dyn_authorized_resource_for_global!(authz::Inventory); impl DynAuthorizedResource for authz::SiloCertificateList { fn do_authorize<'a, 'b>( diff --git a/nexus/db-queries/src/authz/policy_test/resources.rs b/nexus/db-queries/src/authz/policy_test/resources.rs index 054fe6430b..3049f3b9bf 100644 --- a/nexus/db-queries/src/authz/policy_test/resources.rs +++ b/nexus/db-queries/src/authz/policy_test/resources.rs @@ -67,6 +67,7 @@ pub async fn make_resources( builder.new_resource(authz::CONSOLE_SESSION_LIST); builder.new_resource(authz::DNS_CONFIG); builder.new_resource(authz::DEVICE_AUTH_REQUEST_LIST); + builder.new_resource(authz::INVENTORY); builder.new_resource(authz::IP_POOL_LIST); // Silo/organization/project hierarchy diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs new file mode 100644 index 0000000000..e58aae3d1e --- /dev/null +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -0,0 +1,914 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::TransactionError; +use async_bb8_diesel::AsyncConnection; +use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::DateTime; +use chrono::Utc; +use diesel::sql_types; +use diesel::sql_types::Nullable; +use diesel::Column; +use diesel::ExpressionMethods; +use diesel::IntoSql; +use diesel::NullableExpressionMethods; +use diesel::QueryDsl; +use diesel::QuerySource; +use diesel::Table; +use nexus_db_model::CabooseWhich; +use nexus_db_model::CabooseWhichEnum; +use nexus_db_model::HwBaseboardId; +use nexus_db_model::HwPowerState; +use nexus_db_model::HwPowerStateEnum; +use nexus_db_model::HwRotSlot; +use nexus_db_model::HwRotSlotEnum; +use nexus_db_model::InvCollection; +use nexus_db_model::InvCollectionError; +use nexus_db_model::SpType; +use nexus_db_model::SpTypeEnum; +use nexus_db_model::SwCaboose; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::CabooseFound; +use nexus_types::inventory::Collection; +use omicron_common::api::external::Error; +use omicron_common::api::external::InternalContext; +use uuid::Uuid; + +impl DataStore { + /// Store a complete inventory collection into the database + pub async fn inventory_insert_collection( + &self, + opctx: &OpContext, + collection: &Collection, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?; + + // In the database, the collection is represented essentially as a tree + // rooted at an `inv_collection` row. Other nodes in the tree point + // back at the `inv_collection` via `inv_collection_id`. + // + // It's helpful to assemble some values before entering the transaction + // so that we can produce the `Error` type that we want here. 
+ let row_collection = InvCollection::from(collection); + let collection_id = row_collection.id; + let baseboards = collection + .baseboards + .iter() + .map(|b| HwBaseboardId::from(b.as_ref())) + .collect::>(); + let cabooses = collection + .cabooses + .iter() + .map(|s| SwCaboose::from(s.as_ref())) + .collect::>(); + let error_values = collection + .errors + .iter() + .enumerate() + .map(|(i, error)| { + let index = u16::try_from(i).map_err(|e| { + Error::internal_error(&format!( + "failed to convert error index to u16 (too \ + many errors in inventory collection?): {}", + e + )) + })?; + let message = format!("{:#}", error); + Ok(InvCollectionError::new(collection_id, index, message)) + }) + .collect::, Error>>()?; + + // This implementation inserts all records associated with the + // collection in one transaction. This is primarily for simplicity. It + // means we don't have to worry about other readers seeing a + // half-inserted collection, nor leaving detritus around if we start + // inserting records and then crash. However, it does mean this is + // likely to be a big transaction and if that becomes a problem we could + // break this up as long as we address those problems. + // + // The SQL here is written so that it doesn't have to be an + // *interactive* transaction. That is, it should in principle be + // possible to generate all this SQL up front and send it as one big + // batch rather than making a bunch of round-trips to the database. + // We'd do that if we had an interface for doing that with bound + // parameters, etc. See oxidecomputer/omicron#973. + let pool = self.pool_connection_authorized(opctx).await?; + pool.transaction_async(|conn| async move { + // Insert records (and generate ids) for any baseboards that do not + // already exist in the database. These rows are not scoped to a + // particular collection. They contain only immutable data -- + // they're just a mapping between hardware-provided baseboard + // identifiers (part number and model number) and an + // Omicron-specific primary key (a UUID). + { + use db::schema::hw_baseboard_id::dsl; + let _ = diesel::insert_into(dsl::hw_baseboard_id) + .values(baseboards) + .on_conflict_do_nothing() + .execute_async(&conn) + .await?; + } + + // Insert records (and generate ids) for each distinct caboose that + // we've found. Like baseboards, these might already be present and + // rows in this table are not scoped to a particular collection + // because they only map (immutable) identifiers to UUIDs. + { + use db::schema::sw_caboose::dsl; + let _ = diesel::insert_into(dsl::sw_caboose) + .values(cabooses) + .on_conflict_do_nothing() + .execute_async(&conn) + .await?; + } + + // Insert a record describing the collection itself. + { + use db::schema::inv_collection::dsl; + let _ = diesel::insert_into(dsl::inv_collection) + .values(row_collection) + .execute_async(&conn) + .await?; + } + + // Insert rows for the service processors we found. These have a + // foreign key into the hw_baseboard_id table. We don't have those + // id values, though. We may have just inserted them, or maybe not + // (if they previously existed). To avoid dozens of unnecessary + // round-trips, we use INSERT INTO ... SELECT, which looks like + // this: + // + // INSERT INTO inv_service_processor + // SELECT + // id + // [other service_processor column values as literals] + // FROM hw_baseboard_id + // WHERE part_number = ... AND serial_number = ...; + // + // This way, we don't need to know the id. 
The database looks it up + // for us as it does the INSERT. + { + use db::schema::hw_baseboard_id::dsl as baseboard_dsl; + use db::schema::inv_service_processor::dsl as sp_dsl; + + for (baseboard_id, sp) in &collection.sps { + let selection = db::schema::hw_baseboard_id::table + .select(( + collection_id.into_sql::(), + baseboard_dsl::id, + sp.time_collected + .into_sql::(), + sp.source + .clone() + .into_sql::(), + SpType::from(sp.sp_type).into_sql::(), + i32::from(sp.sp_slot) + .into_sql::(), + i64::from(sp.baseboard_revision) + .into_sql::(), + sp.hubris_archive + .clone() + .into_sql::(), + HwPowerState::from(sp.power_state) + .into_sql::(), + )) + .filter( + baseboard_dsl::part_number + .eq(baseboard_id.part_number.clone()), + ) + .filter( + baseboard_dsl::serial_number + .eq(baseboard_id.serial_number.clone()), + ); + + let _ = diesel::insert_into( + db::schema::inv_service_processor::table, + ) + .values(selection) + .into_columns(( + sp_dsl::inv_collection_id, + sp_dsl::hw_baseboard_id, + sp_dsl::time_collected, + sp_dsl::source, + sp_dsl::sp_type, + sp_dsl::sp_slot, + sp_dsl::baseboard_revision, + sp_dsl::hubris_archive_id, + sp_dsl::power_state, + )) + .execute_async(&conn) + .await?; + + // This statement is just here to force a compilation error + // if the set of columns in `inv_service_processor` changes. + // The code above attempts to insert a row into + // `inv_service_processor` using an explicit list of columns + // and values. Without the following statement, If a new + // required column were added, this would only fail at + // runtime. + // + // If you're here because of a compile error, you might be + // changing the `inv_service_processor` table. Update the + // statement below and be sure to update the code above, + // too! + // + // See also similar comments in blocks below, near other + // uses of `all_columns(). + let ( + _inv_collection_id, + _hw_baseboard_id, + _time_collected, + _source, + _sp_type, + _sp_slot, + _baseboard_revision, + _hubris_archive_id, + _power_state, + ) = sp_dsl::inv_service_processor::all_columns(); + } + } + + // Insert rows for the roots of trust that we found. Like service + // processors, we do this using INSERT INTO ... SELECT. 
+ { + use db::schema::hw_baseboard_id::dsl as baseboard_dsl; + use db::schema::inv_root_of_trust::dsl as rot_dsl; + + for (baseboard_id, rot) in &collection.rots { + let selection = db::schema::hw_baseboard_id::table + .select(( + collection_id.into_sql::(), + baseboard_dsl::id, + rot.time_collected + .into_sql::(), + rot.source + .clone() + .into_sql::(), + HwRotSlot::from(rot.active_slot) + .into_sql::(), + HwRotSlot::from(rot.persistent_boot_preference) + .into_sql::(), + rot.pending_persistent_boot_preference + .map(HwRotSlot::from) + .into_sql::>(), + rot.transient_boot_preference + .map(HwRotSlot::from) + .into_sql::>(), + rot.slot_a_sha3_256_digest + .clone() + .into_sql::>( + ), + rot.slot_b_sha3_256_digest + .clone() + .into_sql::>( + ), + )) + .filter( + baseboard_dsl::part_number + .eq(baseboard_id.part_number.clone()), + ) + .filter( + baseboard_dsl::serial_number + .eq(baseboard_id.serial_number.clone()), + ); + + let _ = diesel::insert_into( + db::schema::inv_root_of_trust::table, + ) + .values(selection) + .into_columns(( + rot_dsl::inv_collection_id, + rot_dsl::hw_baseboard_id, + rot_dsl::time_collected, + rot_dsl::source, + rot_dsl::slot_active, + rot_dsl::slot_boot_pref_persistent, + rot_dsl::slot_boot_pref_persistent_pending, + rot_dsl::slot_boot_pref_transient, + rot_dsl::slot_a_sha3_256, + rot_dsl::slot_b_sha3_256, + )) + .execute_async(&conn) + .await?; + + // See the comment in the previous block (where we use + // `inv_service_processor::all_columns()`). The same + // applies here. + let ( + _inv_collection_id, + _hw_baseboard_id, + _time_collected, + _source, + _slot_active, + _slot_boot_pref_persistent, + _slot_boot_pref_persistent_pending, + _slot_boot_pref_transient, + _slot_a_sha3_256, + _slot_b_sha3_256, + ) = rot_dsl::inv_root_of_trust::all_columns(); + } + } + + // Insert rows for the cabooses that we found. Like service + // processors and roots of trust, we do this using INSERT INTO ... + // SELECT. But because there are two foreign keys, we need a more + // complicated `SELECT`, which requires using a CTE. + for (which, tree) in &collection.cabooses_found { + let db_which = nexus_db_model::CabooseWhich::from(*which); + for (baseboard_id, found_caboose) in tree { + InvCabooseInsert::new( + collection_id, + baseboard_id, + found_caboose, + db_which, + ) + .execute_async(&conn) + .await?; + } + } + + // Finally, insert the list of errors. + { + use db::schema::inv_collection_error::dsl as errors_dsl; + let _ = diesel::insert_into(errors_dsl::inv_collection_error) + .values(error_values) + .execute_async(&conn) + .await?; + } + + Ok(()) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + info!( + &opctx.log, + "inserted inventory collection"; + "collection_id" => collection.id.to_string(), + ); + + Ok(()) + } + + /// Prune inventory collections stored in the database, keeping at least + /// `nkeep`. + /// + /// This function removes as many collections as possible while preserving + /// the last `nkeep`. This will also preserve at least one "complete" + /// collection (i.e., one having zero errors). + // It might seem surprising that such a high-level application policy is + // embedded in the DataStore. The reason is that we want to push a bunch of + // the logic into the SQL to avoid interactive queries. 
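For orientation before the pruning code: the background task added later in this patch drives these datastore entry points in a fixed order. A condensed, illustrative sketch, mirroring `inventory_activate()` below (which additionally resolves the MGS clients from internal DNS):

```rust
use anyhow::Context;

// Condensed, illustrative version of the background task's activation.
async fn collect_and_store(
    opctx: &OpContext,
    datastore: &DataStore,
    mgs_clients: &[std::sync::Arc<gateway_client::Client>],
    creator: &str,
    nkeep: u32,
) -> Result<(), anyhow::Error> {
    // Prune first so the set of stored collections stays bounded even if
    // collection outpaces cleanup.
    datastore
        .inventory_prune_collections(opctx, nkeep)
        .await
        .context("pruning old collections")?;

    // Gather a fresh snapshot from MGS ...
    let collection =
        nexus_inventory::Collector::new(creator, mgs_clients, opctx.log.clone())
            .collect_all()
            .await
            .context("collecting inventory")?;

    // ... and persist it with the function implemented above.
    datastore
        .inventory_insert_collection(opctx, &collection)
        .await
        .context("saving inventory to database")
}
```

The pruning step invoked first above is what the next function implements.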
+ pub async fn inventory_prune_collections( + &self, + opctx: &OpContext, + nkeep: u32, + ) -> Result<(), Error> { + // Assumptions: + // + // - Most of the time, there will be about `nkeep + 1` collections in + // the database. That's because the normal expected case is: we had + // `nkeep`, we created another one, and now we're pruning the oldest + // one. + // + // - There could be fewer collections in the database, early in the + // system's lifetime (before we've accumulated `nkeep` of them). + // + // - There could be many more collections in the database, if something + // has gone wrong and we've fallen behind in our cleanup. + // + // - Due to transient errors during the collection process, it's + // possible that a collection is known to be potentially incomplete. + // We can tell this because it has rows in `inv_collection_errors`. + // (It's possible that a collection can be incomplete with zero + // errors, but we can't know that here and so we can't do anything + // about it.) + // + // Goals: + // + // - When this function returns without error, there were at most + // `nkeep` collections in the database. + // + // - If we have to remove any collections, we want to start from the + // oldest ones. (We want to maintain a window of the last `nkeep`, + // not the first `nkeep - 1` from the beginning of time plus the most + // recent one.) + // + // - We want to avoid removing the last collection that had zero errors. + // (If we weren't careful, we might do this if there were `nkeep` + // collections with errors that were newer than the last complete + // collection.) + // + // Here's the plan: + // + // - Select from the database the `nkeep + 1` oldest collections and the + // number of errors associated with each one. + // + // - If we got fewer than `nkeep + 1` back, we're done. We shouldn't + // prune anything. + // + // - Otherwise, if the oldest collection is the only complete one, + // remove the next-oldest collection and go back to the top (repeat). + // + // - Otherwise, remove the oldest collection and go back to the top + // (repeat). + // + // This seems surprisingly complicated. It's designed to meet the above + // goals. + // + // Is this going to work if multiple Nexuses are doing it concurrently? + // This cannot remove the last complete collection because a given Nexus + // will only remove a complete collection if it has seen a newer + // complete one. This cannot result in keeping fewer than "nkeep" + // collections because any Nexus will only remove a collection if there + // are "nkeep" newer ones. In both of these cases, another Nexus might + // remove one of the ones that the first Nexus was counting on keeping, + // but only if there was a newer one to replace it. + + opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?; + + loop { + match self.inventory_find_pruneable(opctx, nkeep).await? { + None => break, + Some(collection_id) => { + self.inventory_delete_collection(opctx, collection_id) + .await? + } + } + } + + Ok(()) + } + + /// Return the oldest inventory collection that's eligible for pruning, + /// if any + /// + /// The caller of this (non-pub) function is responsible for authz. + async fn inventory_find_pruneable( + &self, + opctx: &OpContext, + nkeep: u32, + ) -> Result, Error> { + let conn = self.pool_connection_authorized(opctx).await?; + // Diesel requires us to use aliases in order to refer to the + // `inv_collection` table twice in the same query. 
+ let (inv_collection1, inv_collection2) = diesel::alias!( + db::schema::inv_collection as inv_collection1, + db::schema::inv_collection as inv_collection2 + ); + + // This subquery essentially generates: + // + // SELECT id FROM inv_collection ORDER BY time_started" ASC LIMIT $1 + // + // where $1 becomes `nkeep + 1`. This just lists the `nkeep + 1` oldest + // collections. + let subquery = inv_collection1 + .select(inv_collection1.field(db::schema::inv_collection::id)) + .order_by( + inv_collection1 + .field(db::schema::inv_collection::time_started) + .asc(), + ) + .limit(i64::from(nkeep) + 1); + + // This essentially generates: + // + // SELECT + // inv_collection.id, + // count(inv_collection_error.inv_collection_id) + // FROM ( + // inv_collection + // LEFT OUTER JOIN + // inv_collection_error + // ON ( + // inv_collection_error.inv_collection_id = inv_collection.id + // ) + // ) WHERE ( + // inv_collection.id = ANY( <> ) + // ) + // GROUP BY inv_collection.id + // ORDER BY inv_collection.time_started ASC + // + // This looks a lot scarier than it is. The goal is to produce a + // two-column table that looks like this: + // + // collection_id1 count of errors from collection_id1 + // collection_id2 count of errors from collection_id2 + // collection_id3 count of errors from collection_id3 + // ... + // + let candidates: Vec<(Uuid, i64)> = inv_collection2 + .left_outer_join(db::schema::inv_collection_error::table) + .filter( + inv_collection2 + .field(db::schema::inv_collection::id) + .eq_any(subquery), + ) + .group_by(inv_collection2.field(db::schema::inv_collection::id)) + .select(( + inv_collection2.field(db::schema::inv_collection::id), + diesel::dsl::count( + db::schema::inv_collection_error::inv_collection_id + .nullable(), + ), + )) + .order_by( + inv_collection2 + .field(db::schema::inv_collection::time_started) + .asc(), + ) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .internal_context("listing oldest collections")?; + + if u32::try_from(candidates.len()).unwrap() <= nkeep { + debug!( + &opctx.log, + "inventory_prune_one: nothing eligible for removal (too few)"; + "candidates" => ?candidates, + ); + return Ok(None); + } + + // We've now got up to "nkeep + 1" oldest collections, starting with the + // very oldest. We can get rid of the oldest one unless it's the only + // complete one. Another way to think about it: find the _last_ + // complete one. Remove it from the list of candidates. Now mark the + // first item in the remaining list for deletion. + let last_completed_idx = candidates + .iter() + .enumerate() + .rev() + .find(|(_i, (_collection_id, nerrors))| *nerrors == 0); + let candidate = match last_completed_idx { + Some((i, _)) if i == 0 => candidates.iter().skip(1).next(), + _ => candidates.iter().next(), + } + .map(|(collection_id, _nerrors)| *collection_id); + if let Some(c) = candidate { + debug!( + &opctx.log, + "inventory_prune_one: eligible for removal"; + "collection_id" => c.to_string(), + "candidates" => ?candidates, + ); + } else { + debug!( + &opctx.log, + "inventory_prune_one: nothing eligible for removal"; + "candidates" => ?candidates, + ); + } + Ok(candidate) + } + + /// Removes an inventory collection from the database + /// + /// The caller of this (non-pub) function is responsible for authz. 
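The selection above boils down to a small pure function over the `(collection id, error count)` pairs. A standalone sketch of that decision, with an illustrative input in the trailing comment:

```rust
use uuid::Uuid;

/// Sketch of the decision made by `inventory_find_pruneable()`: given the
/// `nkeep + 1` (or fewer) oldest collections with their error counts,
/// ordered oldest first, pick the collection to delete, if any.
fn choose_victim(candidates: &[(Uuid, i64)], nkeep: u32) -> Option<Uuid> {
    if candidates.len() <= nkeep as usize {
        // Too few collections; nothing needs to be pruned.
        return None;
    }
    // Find the newest error-free collection among the candidates.
    let last_complete = candidates
        .iter()
        .enumerate()
        .rev()
        .find(|(_i, (_id, nerrors))| *nerrors == 0);
    match last_complete {
        // The oldest candidate is the only complete one: spare it and
        // remove the next-oldest instead.
        Some((0, _)) => candidates.get(1).map(|(id, _)| *id),
        // Otherwise, remove the oldest candidate.
        _ => candidates.first().map(|(id, _)| *id),
    }
}

// Example: with nkeep = 3 and oldest-first candidates
//   [(c1, 0 errors), (c2, 2), (c3, 1), (c4, 4)]
// c1 is the only complete collection, so it is spared and c2 is returned.
```

The deletion path taken for whichever collection is chosen comes next.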
+ async fn inventory_delete_collection( + &self, + opctx: &OpContext, + collection_id: Uuid, + ) -> Result<(), Error> { + // As with inserting a whole collection, we remove it in one big + // transaction for simplicity. Similar considerations apply. We could + // break it up if these transactions become too big. But we'd need a + // way to stop other clients from discovering a collection after we + // start removing it and we'd also need to make sure we didn't leak a + // collection if we crash while deleting it. + let conn = self.pool_connection_authorized(opctx).await?; + let (ncollections, nsps, nrots, ncabooses, nerrors) = conn + .transaction_async(|conn| async move { + // Remove the record describing the collection itself. + let ncollections = { + use db::schema::inv_collection::dsl; + diesel::delete( + dsl::inv_collection.filter(dsl::id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for service processors. + let nsps = { + use db::schema::inv_service_processor::dsl; + diesel::delete( + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for service processors. + let nrots = { + use db::schema::inv_root_of_trust::dsl; + diesel::delete( + dsl::inv_root_of_trust + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for cabooses found. + let ncabooses = { + use db::schema::inv_caboose::dsl; + diesel::delete( + dsl::inv_caboose + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for errors encountered. + let nerrors = { + use db::schema::inv_collection_error::dsl; + diesel::delete( + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + Ok((ncollections, nsps, nrots, ncabooses, nerrors)) + }) + .await + .map_err(|error| match error { + TransactionError::CustomError(e) => e, + TransactionError::Database(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + } + })?; + + info!(&opctx.log, "removed inventory collection"; + "collection_id" => collection_id.to_string(), + "ncollections" => ncollections, + "nsps" => nsps, + "nrots" => nrots, + "ncabooses" => ncabooses, + "nerrors" => nerrors, + ); + + Ok(()) + } +} + +/// A SQL common table expression (CTE) used to insert into `inv_caboose` +/// +/// Concretely, we have these three tables: +/// +/// - `hw_baseboard` with an "id" primary key and lookup columns "part_number" +/// and "serial_number" +/// - `sw_caboose` with an "id" primary key and lookup columns "board", +/// "git_commit", "name", and "version" +/// - `inv_caboose` with foreign keys "hw_baseboard_id", "sw_caboose_id", and +/// various other columns +/// +/// We want to INSERT INTO `inv_caboose` a row with: +/// +/// - hw_baseboard_id (foreign key) the result of looking up an hw_baseboard row +/// by part number and serial number provided by the caller +/// +/// - sw_caboose_id (foreign key) the result of looking up a sw_caboose row by +/// board, git_commit, name, and version provided by the caller +/// +/// - the other columns being literals provided by the caller +/// +/// To achieve this, we're going to generate something like: +/// +/// WITH +/// my_new_row +/// AS ( +/// SELECT +/// hw_baseboard.id, /* `hw_baseboard` foreign key */ +/// sw_caboose.id, /* `sw_caboose` foreign key */ +/// ... 
/* caller-provided literal values for the rest */ +/// /* of the new inv_caboose row */ +/// FROM +/// hw_baseboard, +/// sw_caboose +/// WHERE +/// hw_baseboard.part_number = ... /* caller-provided part number */ +/// hw_baseboard.serial_number = ... /* caller-provided serial number */ +/// sw_caboose.board = ... /* caller-provided board */ +/// sw_caboose.git_commit = ... /* caller-provided git_commit */ +/// sw_caboose.name = ... /* caller-provided name */ +/// sw_caboose.version = ... /* caller-provided version */ +/// ) INSERT INTO +/// inv_caboose (... /* inv_caboose columns */) +/// SELECT * from my_new_row; +/// +/// The whole point is to avoid back-and-forth between the client and the +/// database. Those back-and-forth interactions can significantly increase +/// latency and the probability of transaction conflicts. See RFD 192 for +/// details. +#[must_use = "Queries must be executed"] +struct InvCabooseInsert { + // fields used to look up baseboard id + baseboard_part_number: String, + baseboard_serial_number: String, + + // fields used to look up caboose id + caboose_board: String, + caboose_git_commit: String, + caboose_name: String, + caboose_version: String, + + // literal values for the rest of the inv_caboose columns + collection_id: Uuid, + time_collected: DateTime, + source: String, + which: CabooseWhich, + + // These are Diesel structures representing table names in the "from" or + // "into" parts of queries (e.g., "SELECT FROM tablename" or "INSERT INTO + // tablename"). We need this in `walk_ast()` below, but they must outlive + // `walk_ast()`, so they need to be created ahead of time. + // + // TODO-cleanup These Diesel-internal types are nasty. It's not clear how + // else to do this. + from_hw_baseboard_id: + diesel::internal::table_macro::StaticQueryFragmentInstance< + db::schema::hw_baseboard_id::table, + >, + from_sw_caboose: diesel::internal::table_macro::StaticQueryFragmentInstance< + db::schema::sw_caboose::table, + >, + into_inv_caboose: + diesel::internal::table_macro::StaticQueryFragmentInstance< + db::schema::inv_caboose::table, + >, +} + +impl InvCabooseInsert { + pub fn new( + collection_id: Uuid, + baseboard: &BaseboardId, + found_caboose: &CabooseFound, + which: CabooseWhich, + ) -> InvCabooseInsert { + InvCabooseInsert { + baseboard_part_number: baseboard.part_number.clone(), + baseboard_serial_number: baseboard.serial_number.clone(), + caboose_board: found_caboose.caboose.board.clone(), + caboose_git_commit: found_caboose.caboose.git_commit.clone(), + caboose_name: found_caboose.caboose.name.clone(), + caboose_version: found_caboose.caboose.version.clone(), + collection_id, + time_collected: found_caboose.time_collected, + source: found_caboose.source.clone(), + which, + from_hw_baseboard_id: db::schema::hw_baseboard_id::table + .from_clause(), + from_sw_caboose: db::schema::sw_caboose::table.from_clause(), + // It sounds a little goofy to use "from_clause()" when this is + // really part of an INSERT. But really this just produces the + // table name as an identifier. This is the same for both "FROM" + // and "INSERT" clauses. And diesel internally does the same thing + // here (see the type of `InsertStatement::into_clause`). 
+ into_inv_caboose: db::schema::inv_caboose::table.from_clause(), + } + } +} + +impl diesel::query_builder::QueryFragment for InvCabooseInsert { + fn walk_ast<'b>( + &'b self, + mut pass: diesel::query_builder::AstPass<'_, 'b, diesel::pg::Pg>, + ) -> diesel::QueryResult<()> { + use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id; + use db::schema::inv_caboose::dsl as dsl_inv_caboose; + use db::schema::sw_caboose::dsl as dsl_sw_caboose; + + pass.unsafe_to_cache_prepared(); + pass.push_sql("WITH my_new_row AS ("); + + pass.push_sql("SELECT "); + + // Emit the values that we're going to insert into `inv_caboose`. + // First, emit the looked-up foreign keys. + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_baseboard_id::id::NAME)?; + pass.push_sql(", "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::id::NAME)?; + pass.push_sql(", "); + // Next, emit the literal values used for the rest of the columns. + pass.push_bind_param::(&self.collection_id)?; + pass.push_sql(", "); + pass.push_bind_param::( + &self.time_collected, + )?; + pass.push_sql(", "); + pass.push_bind_param::(&self.source)?; + pass.push_sql(", "); + pass.push_bind_param::(&self.which)?; + + // Finish the SELECT by adding the list of tables and the WHERE to pick + // out only the relevant row from each tables. + pass.push_sql(" FROM "); + + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql(", "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + + pass.push_sql(" WHERE "); + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_baseboard_id::part_number::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::( + &self.baseboard_part_number, + )?; + pass.push_sql(" AND "); + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_baseboard_id::serial_number::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::( + &self.baseboard_serial_number, + )?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::board::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_board)?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::git_commit::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_git_commit)?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::name::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_name)?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::version::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_version)?; + + pass.push_sql(")\n"); // end of the SELECT query within the WITH + + pass.push_sql("INSERT INTO "); + self.into_inv_caboose.walk_ast(pass.reborrow())?; + + pass.push_sql("("); + pass.push_identifier(dsl_inv_caboose::hw_baseboard_id::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::sw_caboose_id::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::inv_collection_id::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::time_collected::NAME)?; + pass.push_sql(", "); + 
pass.push_identifier(dsl_inv_caboose::source::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::which::NAME)?; + pass.push_sql(")\n"); + pass.push_sql("SELECT * FROM my_new_row"); + + // See the comment in inventory_insert_collection() where we use + // `inv_service_processor::all_columns()`. The same applies here. + // If you update the statement below because the schema for + // `inv_caboose` has changed, be sure to update the code above, too! + let ( + _hw_baseboard_id, + _sw_caboose_id, + _inv_collection_id, + _time_collected, + _source, + _which, + ) = dsl_inv_caboose::inv_caboose::all_columns(); + + Ok(()) + } +} + +// This is required to be able to call `inv_caboose_insert.execute_async()`. +impl diesel::RunQueryDsl for InvCabooseInsert {} + +// This is required to be able to call `inv_caboose_insert.execute_async()`. +impl diesel::query_builder::QueryId for InvCabooseInsert { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index a77e20647a..12959db827 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -59,6 +59,7 @@ mod external_ip; mod identity_provider; mod image; mod instance; +mod inventory; mod ip_pool; mod network_interface; mod oximeter; @@ -135,6 +136,9 @@ impl RunnableQuery for T where { } +pub type DataStoreConnection<'a> = + bb8::PooledConnection<'a, ConnectionManager>; + pub struct DataStore { pool: Arc, virtual_provisioning_collection_producer: crate::provisioning::Producer, @@ -202,21 +206,13 @@ impl DataStore { .unwrap(); } - async fn pool_authorized( - &self, - opctx: &OpContext, - ) -> Result<&bb8::Pool>, Error> { - opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; - Ok(self.pool.pool()) - } - /// Returns a connection to a connection from the database connection pool. pub(super) async fn pool_connection_authorized( &self, opctx: &OpContext, - ) -> Result>, Error> - { - let pool = self.pool_authorized(opctx).await?; + ) -> Result { + opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; + let pool = self.pool.pool(); let connection = pool.get().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; @@ -230,8 +226,7 @@ impl DataStore { /// "pool_connection_authorized". pub(super) async fn pool_connection_unauthorized( &self, - ) -> Result>, Error> - { + ) -> Result { let connection = self.pool.pool().get().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; @@ -242,8 +237,7 @@ impl DataStore { #[doc(hidden)] pub async fn pool_connection_for_tests( &self, - ) -> Result>, Error> - { + ) -> Result { self.pool_connection_unauthorized().await } diff --git a/nexus/db-queries/src/db/pool.rs b/nexus/db-queries/src/db/pool.rs index 73c95f4e91..249852d832 100644 --- a/nexus/db-queries/src/db/pool.rs +++ b/nexus/db-queries/src/db/pool.rs @@ -45,6 +45,8 @@ pub struct Pool { impl Pool { pub fn new(log: &slog::Logger, db_config: &DbConfig) -> Self { + // Make sure diesel-dtrace's USDT probes are enabled. 
+ usdt::register_probes().expect("Failed to register USDT DTrace probes"); Self::new_builder(log, db_config, bb8::Builder::new()) } diff --git a/nexus/db-queries/tests/output/authz-roles.out b/nexus/db-queries/tests/output/authz-roles.out index 72031c567e..963f00f7e8 100644 --- a/nexus/db-queries/tests/output/authz-roles.out +++ b/nexus/db-queries/tests/output/authz-roles.out @@ -68,6 +68,20 @@ resource: authz::DeviceAuthRequestList silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ unauthenticated ! ! ! ! ! ! ! ! +resource: authz::Inventory + + USER Q R LC RP M MP CC D + fleet-admin ✘ ✔ ✘ ✔ ✔ ✔ ✘ ✔ + fleet-collaborator ✘ ✔ ✘ ✔ ✘ ✘ ✘ ✘ + fleet-viewer ✘ ✔ ✘ ✔ ✘ ✘ ✘ ✘ + silo1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + unauthenticated ! ! ! ! ! ! ! ! + resource: authz::IpPoolList USER Q R LC RP M MP CC D diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 1a9afbc6bd..c7345156a7 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -92,6 +92,11 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 5 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml new file mode 100644 index 0000000000..12208a3467 --- /dev/null +++ b/nexus/inventory/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "nexus-inventory" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-messages.workspace = true +nexus-types.workspace = true +slog.workspace = true +strum.workspace = true +uuid.workspace = true diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs new file mode 100644 index 0000000000..d987527024 --- /dev/null +++ b/nexus/inventory/src/builder.rs @@ -0,0 +1,278 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interface for building inventory [`Collection`] dynamically +//! +//! This separates the concerns of _collection_ (literally just fetching data +//! from sources like MGS) from assembling a representation of what was +//! collected. 
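A rough usage sketch of the builder defined in this file. This is illustrative only: the MGS URL and collector name are made up, `sp_state` and `caboose` are assumed to have been fetched from MGS already, and the types are the same ones imported just below:

```rust
// Illustrative only.
fn example_usage(sp_state: SpState, caboose: SpComponentCaboose) -> Collection {
    let mgs_url = "http://[::1]:12225"; // hypothetical MGS address
    let mut builder = CollectionBuilder::new("example-nexus-uuid");

    // Record one sled SP; this also normalizes and returns its baseboard id.
    if let Some(baseboard) =
        builder.found_sp_state(mgs_url, SpType::Sled, 0, sp_state)
    {
        // Attach a caboose to that baseboard, unless another MGS instance
        // already reported it.
        if !builder.sp_found_caboose_already(&baseboard, CabooseWhich::SpSlot0) {
            let _ = builder.found_sp_caboose(
                &baseboard,
                CabooseWhich::SpSlot0,
                mgs_url,
                caboose,
            );
        }
    }

    // Freeze everything into an immutable `Collection`.
    builder.build()
}
```

The real caller is the `Collector` in collector.rs, added later in this patch.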
+ +use anyhow::anyhow; +use chrono::DateTime; +use chrono::Utc; +use gateway_client::types::SpComponentCaboose; +use gateway_client::types::SpState; +use gateway_client::types::SpType; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::Caboose; +use nexus_types::inventory::CabooseFound; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; +use nexus_types::inventory::RotState; +use nexus_types::inventory::ServiceProcessor; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use uuid::Uuid; + +/// Build an inventory [`Collection`] +/// +/// This interface is oriented around the interfaces used by an actual +/// collector. Where possible, it accepts types directly provided by the data +/// sources (e.g., `gateway_client`). +#[derive(Debug)] +pub struct CollectionBuilder { + // For field documentation, see the corresponding fields in `Collection`. + errors: Vec, + time_started: DateTime, + collector: String, + baseboards: BTreeSet>, + cabooses: BTreeSet>, + sps: BTreeMap, ServiceProcessor>, + rots: BTreeMap, RotState>, + cabooses_found: + BTreeMap, CabooseFound>>, +} + +impl CollectionBuilder { + /// Start building a new `Collection` + /// + /// `collector` is an arbitrary string describing the agent that collected + /// this data. It's generally a Nexus instance uuid but it can be anything. + /// It's just for debugging. + pub fn new(collector: &str) -> Self { + CollectionBuilder { + errors: vec![], + time_started: Utc::now(), + collector: collector.to_owned(), + baseboards: BTreeSet::new(), + cabooses: BTreeSet::new(), + sps: BTreeMap::new(), + rots: BTreeMap::new(), + cabooses_found: BTreeMap::new(), + } + } + + /// Assemble a complete `Collection` representation + pub fn build(self) -> Collection { + Collection { + id: Uuid::new_v4(), + errors: self.errors, + time_started: self.time_started, + time_done: Utc::now(), + collector: self.collector, + baseboards: self.baseboards, + cabooses: self.cabooses, + sps: self.sps, + rots: self.rots, + cabooses_found: self.cabooses_found, + } + } + + /// Record service processor state `sp_state` reported by MGS + /// + /// `sp_type` and `slot` identify which SP this was. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_sp_state( + &mut self, + source: &str, + sp_type: SpType, + slot: u32, + sp_state: SpState, + ) -> Option> { + // Much ado about very little: MGS reports that "slot" is a u32, though + // in practice this seems very unlikely to be bigger than a u8. (How + // many slots can there be within one rack?) The database only supports + // signed integers, so if we assumed this really could span the range of + // a u32, we'd need to store it in an i64. Instead, assume here that we + // can stick it into a u16 (which still seems generous). This will + // allow us to store it into an Int32 in the database. + let Ok(sp_slot) = u16::try_from(slot) else { + self.found_error(anyhow!( + "MGS {:?}: SP {:?} slot {}: slot number did not fit into u16", + source, + sp_type, + slot + )); + return None; + }; + + // Normalize the baseboard id: i.e., if we've seen this baseboard + // before, use the same baseboard id record. Otherwise, make a new one. 
+ let baseboard = Self::enum_item( + &mut self.baseboards, + BaseboardId { + serial_number: sp_state.serial_number, + part_number: sp_state.model, + }, + ); + + // Separate the SP state into the SP-specific state and the RoT state, + // if any. + let now = Utc::now(); + let _ = self.sps.entry(baseboard.clone()).or_insert_with(|| { + ServiceProcessor { + time_collected: now, + source: source.to_owned(), + + sp_type, + sp_slot, + + baseboard_revision: sp_state.revision, + hubris_archive: sp_state.hubris_archive_id, + power_state: sp_state.power_state, + } + }); + + match sp_state.rot { + gateway_client::types::RotState::Enabled { + active, + pending_persistent_boot_preference, + persistent_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + transient_boot_preference, + } => { + let _ = + self.rots.entry(baseboard.clone()).or_insert_with(|| { + RotState { + time_collected: now, + source: source.to_owned(), + active_slot: active, + persistent_boot_preference, + pending_persistent_boot_preference, + transient_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + } + }); + } + gateway_client::types::RotState::CommunicationFailed { + message, + } => { + self.found_error(anyhow!( + "MGS {:?}: reading RoT state for {:?}: {}", + source, + baseboard, + message + )); + } + } + + Some(baseboard) + } + + /// Returns true if we already found the caboose for `which` for baseboard + /// `baseboard` + /// + /// This is used to avoid requesting it multiple times (from multiple MGS + /// instances). + pub fn sp_found_caboose_already( + &self, + baseboard: &BaseboardId, + which: CabooseWhich, + ) -> bool { + self.cabooses_found + .get(&which) + .map(|map| map.contains_key(baseboard)) + .unwrap_or(false) + } + + /// Record the given caboose information found for the given baseboard + /// + /// The baseboard must previously have been reported using + /// `found_sp_state()`. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_sp_caboose( + &mut self, + baseboard: &BaseboardId, + which: CabooseWhich, + source: &str, + caboose: SpComponentCaboose, + ) -> Result<(), anyhow::Error> { + // Normalize the caboose contents: i.e., if we've seen this exact caboose + // contents before, use the same record from before. Otherwise, make a + // new one. + let sw_caboose = + Self::enum_item(&mut self.cabooses, Caboose::from(caboose)); + let (baseboard, _) = + self.sps.get_key_value(baseboard).ok_or_else(|| { + anyhow!( + "reporting caboose for unknown baseboard: {:?} ({:?})", + baseboard, + sw_caboose + ) + })?; + let by_id = + self.cabooses_found.entry(which).or_insert_with(|| BTreeMap::new()); + if let Some(previous) = by_id.insert( + baseboard.clone(), + CabooseFound { + time_collected: Utc::now(), + source: source.to_owned(), + caboose: sw_caboose.clone(), + }, + ) { + let error = if *previous.caboose == *sw_caboose { + anyhow!("reported multiple times (same value)",) + } else { + anyhow!( + "reported caboose multiple times (previously {:?}, \ + now {:?})", + previous, + sw_caboose + ) + }; + Err(error.context(format!( + "baseboard {:?} caboose {:?}", + baseboard, which + ))) + } else { + Ok(()) + } + } + + /// Helper function for normalizing items + /// + /// If `item` (or its equivalent) is not already in `items`, insert it. + /// Either way, return the item from `items`. (This will either be `item` + /// itself or whatever was already in `items`.) 
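One property of this normalization worth spelling out: reporting the same baseboard from two different sources yields the same `Arc`, so `sps`, `rots`, and `cabooses_found` all share a single entry per baseboard. An illustrative check, assuming `state1` and `state2` are `SpState` values from two MGS instances that report the same part and serial number:

```rust
// Illustrative only.
fn same_baseboard_normalizes(state1: SpState, state2: SpState) {
    let mut builder = CollectionBuilder::new("example");
    let a = builder
        .found_sp_state("http://mgs-one", SpType::Sled, 0, state1)
        .unwrap();
    let b = builder
        .found_sp_state("http://mgs-two", SpType::Sled, 0, state2)
        .unwrap();
    // Both calls resolve to the same normalized record.
    assert!(Arc::ptr_eq(&a, &b));
}
```

The helper that implements this normalization follows.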
+ fn enum_item( + items: &mut BTreeSet>, + item: T, + ) -> Arc { + match items.get(&item) { + Some(found_item) => found_item.clone(), + None => { + let new_item = Arc::new(item); + items.insert(new_item.clone()); + new_item + } + } + } + + /// Record a collection error + /// + /// This is used for operational errors encountered during the collection + /// process (e.g., a down MGS instance). It's not intended for mis-uses of + /// this API, which are conveyed instead through returned errors (and should + /// probably cause the caller to stop collection altogether). + pub fn found_error(&mut self, error: anyhow::Error) { + self.errors.push(error); + } +} diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs new file mode 100644 index 0000000000..b410326904 --- /dev/null +++ b/nexus/inventory/src/collector.rs @@ -0,0 +1,202 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collection of inventory from Omicron components + +use crate::builder::CollectionBuilder; +use anyhow::Context; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; +use slog::{debug, error}; +use std::sync::Arc; +use strum::IntoEnumIterator; + +pub struct Collector { + log: slog::Logger, + mgs_clients: Vec>, + in_progress: CollectionBuilder, +} + +impl Collector { + pub fn new( + creator: &str, + mgs_clients: &[Arc], + log: slog::Logger, + ) -> Self { + Collector { + log, + mgs_clients: mgs_clients.to_vec(), + in_progress: CollectionBuilder::new(creator), + } + } + + /// Begin the process of collecting a complete hardware/software inventory + /// of the rack + /// + /// The collection process makes a bunch of requests to a bunch of + /// components. This can take a while and produce any number of errors. + /// Such errors generally don't cause this function to fail. Rather, the + /// returned `Collection` keeps track of these errors. + pub async fn collect_all(mut self) -> Result { + // We're about to do a bunch of asynchronous operations. With a + // combination of async, futures, and some cleverness, we could do much + // of this in parallel. But this code path is not remotely + // latency-sensitive. And there's real risk of overloading our + // downstream services. So we just do one step at a time. This also + // keeps the code simpler. + + debug!(&self.log, "begin collection"); + + // When we add stages to collect from other components (e.g., sled + // agents), those will go here. + self.collect_all_mgs().await; + + debug!(&self.log, "finished collection"); + + Ok(self.in_progress.build()) + } + + /// Collect inventory from all MGS instances + async fn collect_all_mgs(&mut self) { + let clients = self.mgs_clients.clone(); + for client in &clients { + self.collect_one_mgs(&client).await; + } + } + + async fn collect_one_mgs(&mut self, client: &gateway_client::Client) { + debug!(&self.log, "begin collection from MGS"; + "mgs_url" => client.baseurl() + ); + + // First, see which SPs MGS can see via Ignition. + let ignition_result = client.ignition_list().await.with_context(|| { + format!("MGS {:?}: listing ignition targets", client.baseurl()) + }); + + // Select only the SPs that appear powered on. + // + // This choice is debatable. It's conceivable that an SP could be + // functioning but not visible to ignition. 
In that case, we'd be + // better off trying to ask MGS about it even though ignition reports it + // powered off. But in practice, if ignition can't see it, it's much + // more likely that there's just nothing plugged in. And in that case, + // if we try to ask MGS about it, we have to wait for MGS to time out + // its attempt to reach it (currently several seconds). This choice + // enables inventory to complete much faster, at the expense of not + // being able to identify this particular condition. + let sps = match ignition_result { + Err(error) => { + self.in_progress.found_error(error); + return; + } + + Ok(targets) => { + targets.into_inner().into_iter().filter_map(|sp_ignition| { + match sp_ignition.details { + gateway_client::types::SpIgnition::No => None, + gateway_client::types::SpIgnition::Yes { + power: false, + .. + } => None, + gateway_client::types::SpIgnition::Yes { + power: true, + .. + } => Some(sp_ignition.id), + } + }) + } + }; + + // For each SP that ignition reports up, fetch the state and caboose + // information. + for sp in sps { + // First, fetch the state of the SP. If that fails, report the + // error but continue. + let result = + client.sp_get(sp.type_, sp.slot).await.with_context(|| { + format!( + "MGS {:?}: fetching state of SP {:?}", + client.baseurl(), + sp + ) + }); + let sp_state = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + + // Record the state that we found. + let Some(baseboard_id) = self.in_progress.found_sp_state( + client.baseurl(), + sp.type_, + sp.slot, + sp_state, + ) else { + // We failed to parse this SP for some reason. The error was + // reported already. Move on. + continue; + }; + + // For each kind of caboose that we care about, if it hasn't been + // fetched already, fetch it and record it. Generally, we'd only + // get here for the first MGS client. Assuming that one succeeds, + // the other(s) will skip this loop. + for which in CabooseWhich::iter() { + if self + .in_progress + .sp_found_caboose_already(&baseboard_id, which) + { + continue; + } + + let (component, slot) = match which { + CabooseWhich::SpSlot0 => ("sp", 0), + CabooseWhich::SpSlot1 => ("sp", 1), + CabooseWhich::RotSlotA => ("rot", 0), + CabooseWhich::RotSlotB => ("rot", 1), + }; + + let result = client + .sp_component_caboose_get( + sp.type_, sp.slot, component, slot, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {:?}: caboose {:?}", + client.baseurl(), + sp, + which + ) + }); + let caboose = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + if let Err(error) = self.in_progress.found_sp_caboose( + &baseboard_id, + which, + client.baseurl(), + caboose, + ) { + error!( + &self.log, + "error reporting caboose: {:?} {:?} {:?}: {:#}", + baseboard_id, + which, + client.baseurl(), + error + ); + } + } + } + } +} diff --git a/nexus/inventory/src/lib.rs b/nexus/inventory/src/lib.rs new file mode 100644 index 0000000000..c30c25369a --- /dev/null +++ b/nexus/inventory/src/lib.rs @@ -0,0 +1,23 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Omicron component inventory +//! +//! This module provides [`Collector`], an interface for collecting a complete +//! hardware/software inventory in a running Omicron deployment +//! +//! 
This is really just the collection part. For separation of concerns, this +//! module doesn't know anything about storing these collections into the +//! database. That's provided by the datastore. The types associated with +//! collections are in `nexus_types::inventory` so they can be shared with other +//! parts of Nexus (like the datastore). +//! +//! This module lives inside Nexus but it has few dependencies on other parts of +//! Nexus. It could be incorporated into other components. (The corresponding +//! types in `nexus_types` might have to move, too) + +mod builder; +mod collector; + +pub use collector::Collector; diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index 3fcf0483a5..7b05eab61b 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -177,7 +177,7 @@ pub struct Driver { /// /// This is returned by [`Driver::register()`] to identify the corresponding /// background task. It's then accepted by functions like -/// [`Driver::activate()`] and [`Driver::status()`] to identify the task. +/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. #[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] pub struct TaskHandle(String); @@ -277,8 +277,8 @@ impl Driver { /// Enumerate all registered background tasks /// /// This is aimed at callers that want to get the status of all background - /// tasks. You'd call [`Driver::status()`] with each of the items produced - /// by the iterator. + /// tasks. You'd call [`Driver::task_status()`] with each of the items + /// produced by the iterator. pub fn tasks(&self) -> impl Iterator { self.tasks.keys() } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index aa949bbc9f..1c178175fe 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -9,6 +9,7 @@ use super::dns_config; use super::dns_propagation; use super::dns_servers; use super::external_endpoints; +use super::inventory_collection; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -16,6 +17,7 @@ use omicron_common::nexus_config::BackgroundTaskConfig; use omicron_common::nexus_config::DnsTasksConfig; use std::collections::BTreeMap; use std::sync::Arc; +use uuid::Uuid; /// Describes ongoing background tasks and provides interfaces for working with /// them @@ -42,6 +44,9 @@ pub struct BackgroundTasks { pub external_endpoints: tokio::sync::watch::Receiver< Option, >, + + /// task handle for the task that collects inventory + pub task_inventory_collection: common::TaskHandle, } impl BackgroundTasks { @@ -50,6 +55,8 @@ impl BackgroundTasks { opctx: &OpContext, datastore: Arc, config: &BackgroundTaskConfig, + nexus_id: Uuid, + resolver: internal_dns::resolver::Resolver, ) -> BackgroundTasks { let mut driver = common::Driver::new(); @@ -70,8 +77,9 @@ impl BackgroundTasks { // Background task: External endpoints list watcher let (task_external_endpoints, external_endpoints) = { - let watcher = - external_endpoints::ExternalEndpointsWatcher::new(datastore); + let watcher = external_endpoints::ExternalEndpointsWatcher::new( + datastore.clone(), + ); let watcher_channel = watcher.watcher(); let task = driver.register( String::from("external_endpoints"), @@ -88,6 +96,29 @@ impl BackgroundTasks { (task, watcher_channel) }; + // Background task: inventory collector + let task_inventory_collection = { + let watcher = inventory_collection::InventoryCollector::new( + datastore, + 
resolver, + &nexus_id.to_string(), + config.inventory.nkeep, + ); + let task = driver.register( + String::from("inventory_collection"), + String::from( + "collects hardware and software inventory data from the \ + whole system", + ), + config.inventory.period_secs, + Box::new(watcher), + opctx.child(BTreeMap::new()), + vec![], + ); + + task + }; + BackgroundTasks { driver, task_internal_dns_config, @@ -96,6 +127,7 @@ impl BackgroundTasks { task_external_dns_servers, task_external_endpoints, external_endpoints, + task_inventory_collection, } } diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs new file mode 100644 index 0000000000..93e7db2697 --- /dev/null +++ b/nexus/src/app/background/inventory_collection.rs @@ -0,0 +1,132 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for reading inventory for the rack + +use super::common::BackgroundTask; +use anyhow::Context; +use futures::future::BoxFuture; +use futures::FutureExt; +use internal_dns::ServiceName; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::inventory::Collection; +use serde_json::json; +use std::sync::Arc; + +/// Background task that reads inventory for the rack +pub struct InventoryCollector { + datastore: Arc, + resolver: internal_dns::resolver::Resolver, + creator: String, + nkeep: u32, +} + +impl InventoryCollector { + pub fn new( + datastore: Arc, + resolver: internal_dns::resolver::Resolver, + creator: &str, + nkeep: u32, + ) -> InventoryCollector { + InventoryCollector { + datastore, + resolver, + creator: creator.to_owned(), + nkeep, + } + } +} + +impl BackgroundTask for InventoryCollector { + fn activate<'a, 'b, 'c>( + &'a mut self, + opctx: &'b OpContext, + ) -> BoxFuture<'c, serde_json::Value> + where + 'a: 'c, + 'b: 'c, + { + async { + match inventory_activate( + opctx, + &self.datastore, + &self.resolver, + &self.creator, + self.nkeep, + ) + .await + .context("failed to collect inventory") + { + Err(error) => { + let message = format!("{:#}", error); + warn!(opctx.log, "inventory collection failed"; + "error" => message.clone()); + json!({ "error": message }) + } + Ok(collection) => { + debug!(opctx.log, "inventory collection complete"; + "collection_id" => collection.id.to_string(), + "time_started" => collection.time_started.to_string(), + ); + json!({ + "collection_id": collection.id.to_string(), + "time_started": collection.time_started.to_string(), + "time_done": collection.time_done.to_string() + }) + } + } + } + .boxed() + } +} + +async fn inventory_activate( + opctx: &OpContext, + datastore: &DataStore, + resolver: &internal_dns::resolver::Resolver, + creator: &str, + nkeep: u32, +) -> Result { + // Prune old collections. We do this first, here, to ensure that we never + // develop an unbounded backlog of collections. (If this process were done + // by a separate task, it would be possible for the backlog to grow + // unbounded if that task were simply slower than the collection process, + // let alone if there were some kind of extended operational issue + // blocking deletion.) + datastore + .inventory_prune_collections(opctx, nkeep) + .await + .context("pruning old collections")?; + + // Find MGS clients. 
+ let mgs_clients = resolver + .lookup_all_socket_v6(ServiceName::ManagementGatewayService) + .await + .context("looking up MGS addresses")? + .into_iter() + .map(|sockaddr| { + let url = format!("http://{}", sockaddr); + let log = opctx.log.new(o!("gateway_url" => url.clone())); + Arc::new(gateway_client::Client::new(&url, log)) + }) + .collect::>(); + + // Run a collection. + let inventory = nexus_inventory::Collector::new( + creator, + &mgs_clients, + opctx.log.clone(), + ); + let collection = + inventory.collect_all().await.context("collecting inventory")?; + + // Write it to the database. + datastore + .inventory_insert_collection(opctx, &collection) + .await + .context("saving inventory to database")?; + + Ok(collection) +} diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 9ba0780246..e1f474b41a 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -10,6 +10,7 @@ mod dns_propagation; mod dns_servers; mod external_endpoints; mod init; +mod inventory_collection; mod status; pub use common::Driver; diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 354df0ead3..8d61832d2a 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -287,6 +287,8 @@ impl Nexus { &background_ctx, Arc::clone(&db_datastore), &config.pkg.background_tasks, + config.deployment.id, + resolver.clone(), ); let external_resolver = { diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 3ac4b9063d..67da485c46 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -222,6 +222,7 @@ impl super::Nexus { &self.background_tasks.task_external_dns_config, &self.background_tasks.task_external_dns_servers, &self.background_tasks.task_external_endpoints, + &self.background_tasks.task_inventory_collection, ] { self.background_tasks.activate(task); } diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 8cd25582be..56cee27b37 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -14,6 +14,8 @@ crucible-agent-client.workspace = true dns-server.workspace = true dns-service-client.workspace = true dropshot.workspace = true +gateway-messages.workspace = true +gateway-test-utils.workspace = true headers.workspace = true http.workspace = true hyper.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 34c218b3e2..4ac64082b9 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -14,6 +14,7 @@ use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::HandlerTaskMode; +use gateway_test_utils::setup::GatewayTestContext; use nexus_test_interface::NexusServer; use nexus_types::external_api::params::UserId; use nexus_types::internal_api::params::Certificate; @@ -85,6 +86,7 @@ pub struct ControlPlaneTestContext { pub sled_agent: sim::Server, pub oximeter: Oximeter, pub producer: ProducerServer, + pub gateway: GatewayTestContext, pub dendrite: HashMap, pub external_dns_zone_name: String, pub external_dns: dns_server::TransientServer, @@ -105,6 +107,7 @@ impl ControlPlaneTestContext { self.sled_agent.http_server.close().await.unwrap(); self.oximeter.close().await.unwrap(); self.producer.close().await.unwrap(); + self.gateway.teardown().await; for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -221,6 +224,7 @@ impl RackInitRequestBuilder { pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub config: &'a mut 
omicron_common::nexus_config::Config, + test_name: &'a str, rack_init_builder: RackInitRequestBuilder, pub start_time: chrono::DateTime, @@ -236,6 +240,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub sled_agent: Option, pub oximeter: Option, pub producer: Option, + pub gateway: Option, pub dendrite: HashMap, // NOTE: Only exists after starting Nexus, until external Nexus is @@ -253,7 +258,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { pub fn new( - test_name: &str, + test_name: &'a str, config: &'a mut omicron_common::nexus_config::Config, ) -> Self { let start_time = chrono::Utc::now(); @@ -261,6 +266,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { Self { config, + test_name, rack_init_builder: RackInitRequestBuilder::new(), start_time, logctx, @@ -273,6 +279,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { sled_agent: None, oximeter: None, producer: None, + gateway: None, dendrite: HashMap::new(), nexus_internal: None, nexus_internal_addr: None, @@ -370,6 +377,37 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .set_port(port); } + pub async fn start_gateway(&mut self) { + // For now, this MGS is not configured to match up in any way with + // either the simulated sled agent or the Dendrite instances. It's + // useful for testing stuff unrelated to that. But at some point we + // will probably want the reported data to match up better. + debug!(&self.logctx.log, "Starting Management Gateway"); + let gateway = gateway_test_utils::setup::test_setup( + self.test_name, + gateway_messages::SpPort::One, + ) + .await; + let fake_mgs_zone_id = Uuid::new_v4(); + let SocketAddr::V6(v6addr) = gateway.client.bind_address else { + panic!("MGS unexpectedly listening on IPv4?"); + }; + let zone = self + .rack_init_builder + .internal_dns_config + .host_zone(fake_mgs_zone_id, *v6addr.ip()) + .expect("Failed to add DNS for MGS zone"); + self.rack_init_builder + .internal_dns_config + .service_backend_zone( + internal_dns::ServiceName::ManagementGatewayService, + &zone, + v6addr.port(), + ) + .expect("Failed to add DNS for MGS service"); + self.gateway = Some(gateway); + } + pub async fn start_dendrite(&mut self, switch_location: SwitchLocation) { let log = &self.logctx.log; debug!(log, "Starting Dendrite for {switch_location}"); @@ -741,6 +779,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { oximeter: self.oximeter.unwrap(), producer: self.producer.unwrap(), logctx: self.logctx, + gateway: self.gateway.unwrap(), dendrite: self.dendrite, external_dns_zone_name: self.external_dns_zone_name.unwrap(), external_dns: self.external_dns.unwrap(), @@ -769,6 +808,9 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { if let Some(producer) = self.producer { producer.close().await.unwrap(); } + if let Some(gateway) = self.gateway { + gateway.teardown().await; + } for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -860,6 +902,7 @@ async fn setup_with_config_impl( ) -> ControlPlaneTestContext { builder.start_crdb_impl(populate).await; builder.start_clickhouse().await; + builder.start_gateway().await; builder.start_dendrite(SwitchLocation::Switch0).await; builder.start_dendrite(SwitchLocation::Switch1).await; builder.start_internal_dns().await; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 1b1ae2c912..3e50a1ef18 100644 --- 
a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -89,8 +89,13 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 3 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the # `Random` strategy, instead of `RandomWithDistinctSleds` -type = "random" \ No newline at end of file +type = "random" diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index c499714c31..5722b065cf 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -23,6 +23,7 @@ uuid.workspace = true api_identity.workspace = true dns-service-client.workspace = true +gateway-client.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs new file mode 100644 index 0000000000..53dd59aef2 --- /dev/null +++ b/nexus/types/src/inventory.rs @@ -0,0 +1,170 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types representing collection of hardware/software inventory +//! +//! This lives in nexus/types because it's used by both nexus/db-model and +//! nexus/inventory. (It could as well just live in nexus/db-model, but +//! nexus/inventory does not currently know about nexus/db-model and it's +//! convenient to separate these concerns.) + +use chrono::DateTime; +use chrono::Utc; +pub use gateway_client::types::PowerState; +pub use gateway_client::types::RotSlot; +pub use gateway_client::types::SpType; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use strum::EnumIter; +use uuid::Uuid; + +/// Results of collecting hardware/software inventory from various Omicron +/// components +/// +/// This type is structured so that it's both easy to collect and easy to insert +/// into the database. This means items that are represented with separate +/// database tables (like service processors and roots of trust) are represented +/// with separate records, even though they might come from the same source +/// (in this case, a single MGS request). +/// +/// We make heavy use of maps, sets, and Arcs here because many of these things +/// point to each other and this approach to representing relationships ensures +/// clear ownership. (It also reflects how things will wind up in the +/// database.) +/// +/// See the documentation in the database schema for more background. +#[derive(Debug)] +pub struct Collection { + /// unique identifier for this collection + pub id: Uuid, + /// errors encountered during collection + pub errors: Vec, + /// time the collection started + pub time_started: DateTime, + /// time the collection eneded + pub time_done: DateTime, + /// name of the agent doing the collecting (generally, this Nexus's uuid) + pub collector: String, + + /// unique baseboard ids that were found in this collection + /// + /// In practice, these will be inserted into the `hw_baseboard_id` table. 
+ pub baseboards: BTreeSet>, + /// unique caboose contents that were found in this collection + /// + /// In practice, these will be inserted into the `sw_caboose` table. + pub cabooses: BTreeSet>, + + /// all service processors, keyed by baseboard id + /// + /// In practice, these will be inserted into the `inv_service_processor` + /// table. + pub sps: BTreeMap, ServiceProcessor>, + /// all roots of trust, keyed by baseboard id + /// + /// In practice, these will be inserted into the `inv_root_of_trust` table. + pub rots: BTreeMap, RotState>, + /// all caboose contents found, keyed first by the kind of caboose + /// (`CabooseWhich`), then the baseboard id of the sled where they were + /// found + /// + /// In practice, these will be inserted into the `inv_caboose` table. + pub cabooses_found: + BTreeMap, CabooseFound>>, +} + +/// A unique baseboard id found during a collection +/// +/// Baseboard ids are the keys used to link up information from disparate +/// sources (like a service processor and a sled agent). +/// +/// These are normalized in the database. Each distinct baseboard id is +/// assigned a uuid and shared across the many possible collections that +/// reference it. +/// +/// Usually, the part number and serial number are combined with a revision +/// number. We do not include that here. If we ever did find a baseboard with +/// the same part number and serial number but a new revision number, we'd want +/// to treat that as the same baseboard as one with a different revision number. +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct BaseboardId { + /// Oxide Part Number + pub part_number: String, + /// Serial number (unique for a given part number) + pub serial_number: String, +} + +/// Caboose contents found during a collection +/// +/// These are normalized in the database. Each distinct `Caboose` is assigned a +/// uuid and shared across many possible collections that reference it. +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct Caboose { + pub board: String, + pub git_commit: String, + pub name: String, + pub version: String, +} + +impl From for Caboose { + fn from(c: gateway_client::types::SpComponentCaboose) -> Self { + Caboose { + board: c.board, + git_commit: c.git_commit, + name: c.name, + // The MGS API uses an `Option` here because old SP versions did not + // supply it. But modern SP versions do. So we should never hit + // this `unwrap_or()`. 
+ version: c.version.unwrap_or(String::from("")), + } + } +} + +/// Indicates that a particular `Caboose` was found (at a particular time from a +/// particular source, but these are only for debugging) +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct CabooseFound { + pub time_collected: DateTime, + pub source: String, + pub caboose: Arc, +} + +/// Describes a service processor found during collection +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct ServiceProcessor { + pub time_collected: DateTime, + pub source: String, + + pub sp_type: SpType, + pub sp_slot: u16, + + pub baseboard_revision: u32, + pub hubris_archive: String, + pub power_state: PowerState, +} + +/// Describes the root of trust state found (from a service processor) during +/// collection +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct RotState { + pub time_collected: DateTime, + pub source: String, + + pub active_slot: RotSlot, + pub persistent_boot_preference: RotSlot, + pub pending_persistent_boot_preference: Option, + pub transient_boot_preference: Option, + pub slot_a_sha3_256_digest: Option, + pub slot_b_sha3_256_digest: Option, +} + +/// Describes which caboose this is (which component, which slot) +#[derive(Clone, Copy, Debug, EnumIter, PartialEq, Eq, PartialOrd, Ord)] +pub enum CabooseWhich { + SpSlot0, + SpSlot1, + RotSlotA, + RotSlotB, +} diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 3f864b0f17..a48c4d3b00 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -32,3 +32,4 @@ pub mod external_api; pub mod identity; pub mod internal_api; +pub mod inventory; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9f5f78326c..19b164b240 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2508,6 +2508,217 @@ CREATE TABLE IF NOT EXISTS omicron.public.switch_port_settings_address_config ( COMMIT; BEGIN; +/* + * Hardware/software inventory + * + * See RFD 433 for details. Here are the highlights. + * + * Omicron periodically collects hardware/software inventory data from the + * running system and stores it into the database. Each discrete set of data is + * called a **collection**. Each collection contains lots of different kinds of + * data, so there are many tables here. For clarity, these tables are prefixed + * with: + * + * `inv_*` (examples: `inv_collection`, `inv_service_processor`) + * + * Describes the complete set of hardware and software in the system. + * Rows in these tables are immutable, but they describe mutable facts + * about hardware and software (e.g., the slot that a disk is in). When + * these facts change (e.g., a disk moves between slots), a new set of + * records is written. + * + * All rows in the `inv_*` tables point back to a particular collection. They + * represent the state observed at some particular time. + * + * Information about service processors and roots of trust are joined with + * information reported by sled agents via the baseboard id. + * + * Hardware and software identifiers are normalized for the usual database + * design reasons. This means instead of storing hardware and software + * identifiers directly in the `inv_*` tables, these tables instead store + * foreign keys into one of these groups of tables, whose names are also + * prefixed for clarity: + * + * `hw_*` (example: `hw_baseboard_id`) + * + * Maps hardware-provided identifiers to UUIDs that are used as foreign + * keys in the rest of the schema. 
(Avoids embedding these identifiers + * into all the other tables.) + * + * `sw_*` (example: `sw_caboose`) + * + * Maps software-provided identifiers to UUIDs that are used as foreign + * keys in the rest of the schema. (Avoids embedding these identifiers + * into all the other tables.) + * + * Records in these tables are shared across potentially many collections. To + * see why this is useful, consider that `sw_caboose` records contain several + * long identifiers (e.g., git commit, SHA sums) and in practice, most of the + * time, we expect that all components of a given type will have the exact same + * cabooses. Rather than store the caboose contents in each + * `inv_service_processor` row (for example), often replicating the exact same + * contents for each SP for each collection, these rows just have pointers into + * the `sw_caboose` table that stores this data once. (This also makes it much + * easier to determine that these components _do_ have the same cabooses.) + * + * On PC systems (i.e., non-Oxide hardware), most of these tables will be empty + * because we do not support hardware inventory on these systems. + * + * Again, see RFD 433 for more on all this. + */ + +/* + * baseboard ids: this table assigns uuids to distinct part/serial values + * + * Usually we include the baseboard revision number when we reference the part + * number and serial number. The revision number is deliberately left out here. + * If we happened to see the same baseboard part number and serial number with + * different revisions, that's the same baseboard. + */ +CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id ( + id UUID PRIMARY KEY, + part_number TEXT NOT NULL, + serial_number TEXT NOT NULL +); +CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props + ON omicron.public.hw_baseboard_id (part_number, serial_number); + +/* power states reportable by the SP */ +CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM ( + 'A0', + 'A1', + 'A2' +); + +/* root of trust firmware slots */ +CREATE TYPE IF NOT EXISTS omicron.public.hw_rot_slot AS ENUM ( + 'A', + 'B' +); + +/* cabooses: this table assigns unique ids to distinct caboose contents */ +CREATE TABLE IF NOT EXISTS omicron.public.sw_caboose ( + id UUID PRIMARY KEY, + board TEXT NOT NULL, + git_commit TEXT NOT NULL, + name TEXT NOT NULL, + -- The MGS response that provides this field indicates that it can be NULL. + -- But that's only to support old software that we no longer support. 
+ version TEXT NOT NULL +); +CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties + on omicron.public.sw_caboose (board, git_commit, name, version); + +/* Inventory Collections */ + +-- list of all collections +CREATE TABLE IF NOT EXISTS inv_collection ( + id UUID PRIMARY KEY, + time_started TIMESTAMPTZ NOT NULL, + time_done TIMESTAMPTZ NOT NULL, + collector TEXT NOT NULL +); +-- Supports finding latest collection (to use) or the oldest collection (to +-- clean up) +CREATE INDEX IF NOT EXISTS inv_collection_by_time_started + ON omicron.public.inv_collection (time_started); + +-- list of errors generated during a collection +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection_error ( + inv_collection_id UUID NOT NULL, + idx INT4 NOT NULL, + message TEXT +); +CREATE INDEX IF NOT EXISTS errors_by_collection + ON omicron.public.inv_collection_error (inv_collection_id, idx); + +/* what kind of slot MGS reported a device in */ +CREATE TYPE IF NOT EXISTS omicron.public.sp_type AS ENUM ( + 'sled', + 'switch', + 'power' +); + +-- observations from and about service processors +-- also see `inv_root_of_trust` +CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + -- identity of this device according to MGS + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- Data from MGS "Get SP Info" API. See MGS API documentation. + baseboard_revision INT8 NOT NULL, + hubris_archive_id TEXT NOT NULL, + power_state omicron.public.hw_power_state NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); + +-- root of trust information reported by SP +-- There's usually one row here for each row in inv_service_processor, but not +-- necessarily. 
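Both `inv_service_processor` above and the `inv_root_of_trust` table that follows are keyed by (inv_collection_id, hw_baseboard_id), so a consumer typically loads each into a map and joins them in memory. A minimal sketch of that join, using plain strings as stand-ins for the UUID keys (illustrative only, not the real model types):

    use std::collections::BTreeMap;

    fn main() {
        // Stand-in rows keyed by (collection id, baseboard id); the real tables
        // use UUID foreign keys into inv_collection and hw_baseboard_id.
        let mut sps: BTreeMap<(String, String), &str> = BTreeMap::new();
        let mut rots: BTreeMap<(String, String), &str> = BTreeMap::new();

        let key = (String::from("collection-1"), String::from("baseboard-1"));
        sps.insert(key.clone(), "sled SP, slot 3");
        rots.insert(key, "RoT active in slot A");

        // An SP row without a matching RoT row is allowed ("usually one row
        // here for each row in inv_service_processor, but not necessarily").
        let other = (String::from("collection-1"), String::from("baseboard-2"));
        sps.insert(other, "switch SP, slot 0");

        for (k, sp) in &sps {
            match rots.get(k) {
                Some(rot) => println!("{k:?}: {sp}; {rot}"),
                None => println!("{k:?}: {sp}; no RoT information found"),
            }
        }
    }
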
+CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + slot_active omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_transient omicron.public.hw_rot_slot, -- nullable + slot_boot_pref_persistent omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_persistent_pending omicron.public.hw_rot_slot, -- nullable + slot_a_sha3_256 TEXT, -- nullable + slot_b_sha3_256 TEXT, -- nullable + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); + +CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( + 'sp_slot_0', + 'sp_slot_1', + 'rot_slot_A', + 'rot_slot_B' +); + +-- cabooses found +CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + which omicron.public.caboose_which NOT NULL, + sw_caboose_id UUID NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) +); + + +/*******************************************************************/ + /* * Metadata for the schema itself. This version number isn't great, as there's * nothing to ensure it gets bumped when it should be, but it's a start. diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 2dfee81d02..c9b2f3fdc2 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -38,8 +38,13 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 3 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds # seed is omitted so a new seed will be chosen with every allocation. -type = "random_with_distinct_sleds" \ No newline at end of file +type = "random_with_distinct_sleds" diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index aff0a8a25f..65bd020e0b 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -38,8 +38,13 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). 
+inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 3 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. # seed is omitted so a new seed will be chosen with every allocation. -type = "random" \ No newline at end of file +type = "random" From 54f83e0bc3e942962d8cf571e4991ce76d2a854e Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 17 Oct 2023 20:28:49 -0700 Subject: [PATCH 02/20] fix docs --- nexus/db-model/src/inventory.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 1e8b3e3ea2..e9c1ee1f98 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -165,7 +165,7 @@ impl<'a> From<&'a Collection> for InvCollection { } } -/// See [`nexus_types::inventory::HwBaseboardId`]. +/// See [`nexus_types::inventory::BaseboardId`]. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = hw_baseboard_id)] pub struct HwBaseboardId { @@ -184,7 +184,7 @@ impl<'a> From<&'a BaseboardId> for HwBaseboardId { } } -/// See [`nexus_types::inventory::SwCaboose`]. +/// See [`nexus_types::inventory::Caboose`]. #[derive( Queryable, Insertable, @@ -323,7 +323,7 @@ where } } -/// See [`nexus_types::inventory::RootOfTrust`]. +/// See [`nexus_types::inventory::RotState`]. #[derive(Queryable, Clone, Debug, Selectable)] #[diesel(table_name = inv_root_of_trust)] pub struct InvRootOfTrust { @@ -340,7 +340,7 @@ pub struct InvRootOfTrust { pub slot_b_sha3_256: Option, } -/// See [`nexus_types::inventory::Caboose`]. +/// See [`nexus_types::inventory::CabooseFound`]. #[derive(Queryable, Clone, Debug, Selectable)] #[diesel(table_name = inv_caboose)] pub struct InvCaboose { From 58c010f5f238f6ae5f17b93c16e045d0a4843aba Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 19 Oct 2023 15:35:13 -0700 Subject: [PATCH 03/20] replace `CabooseWhich` with optional fields in SP, RoT tables --- Cargo.lock | 1 + clients/gateway-client/Cargo.toml | 1 + clients/gateway-client/src/lib.rs | 2 +- dev-tools/omdb/src/bin/omdb/db.rs | 141 +++---- nexus/db-model/src/inventory.rs | 45 +-- nexus/db-model/src/schema.rs | 11 +- .../db-queries/src/db/datastore/inventory.rs | 371 +++++------------- nexus/inventory/src/builder.rs | 177 ++++++--- nexus/inventory/src/collector.rs | 78 +++- nexus/types/src/inventory.rs | 22 +- schema/crdb/dbinit.sql | 47 ++- 11 files changed, 422 insertions(+), 474 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d03a9b61e0..8f53bfac2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2783,6 +2783,7 @@ dependencies = [ "serde", "serde_json", "slog", + "strum", "uuid", ] diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index fc33174107..7458453660 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -14,5 +14,6 @@ serde.workspace = true serde_json.workspace = true schemars.workspace = true slog.workspace = true +strum.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index b071d34975..27a45ba5ab 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -54,7 +54,7 @@ progenitor::generate_api!( SpState = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, RotState = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, 
Deserialize] }, RotImageDetails = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, - RotSlot = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, + RotSlot = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, strum::EnumIter ] }, ImageVersion = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, HostPhase2RecoveryImageId = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, }, diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 4546a6e543..471ec01cb3 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -12,7 +12,7 @@ //! would be the only consumer -- and in that case it's okay to query the //! database directly. -// NOTE: eminates from Tabled macros +// NOTE: emanates from Tabled macros #![allow(clippy::useless_vec)] use crate::Omdb; @@ -30,7 +30,6 @@ use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; use diesel::JoinOnDsl; use diesel::NullableExpressionMethods; -use nexus_db_model::CabooseWhich; use nexus_db_model::Dataset; use nexus_db_model::Disk; use nexus_db_model::DnsGroup; @@ -1791,7 +1790,7 @@ async fn inv_collection_print_devices( rots.into_iter().map(|s| (s.hw_baseboard_id, s)).collect() }; - // Load cabooses found, grouped by baseboard id. + // Load cabooses found, grouped by id. let inv_cabooses = { use db::schema::inv_caboose::dsl; let cabooses_found = dsl::inv_caboose @@ -1802,15 +1801,7 @@ async fn inv_collection_print_devices( .await .context("loading cabooses found")?; check_limit(&cabooses_found, limit, || "loading cabooses found"); - - let mut cabooses: BTreeMap> = BTreeMap::new(); - for ic in cabooses_found { - cabooses - .entry(ic.hw_baseboard_id) - .or_insert_with(Vec::new) - .push(ic); - } - cabooses + cabooses_found.into_iter().map(|c| (c.id, c)).collect() }; // Assemble a list of baseboard ids, sorted first by device type (sled, @@ -1860,61 +1851,11 @@ async fn inv_collection_print_devices( println!(""); println!(" found at: {} from {}", sp.time_collected, sp.source); - println!(" cabooses:"); - if let Some(my_inv_cabooses) = inv_cabooses.get(baseboard_id) { - #[derive(Tabled)] - #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct CabooseRow<'a> { - slot: &'static str, - board: &'a str, - name: &'a str, - version: &'a str, - git_commit: &'a str, - } - let mut nbugs = 0; - let rows = my_inv_cabooses.iter().map(|ic| { - let slot = match ic.which { - CabooseWhich::SpSlot0 => " SP slot 0", - CabooseWhich::SpSlot1 => " SP slot 1", - CabooseWhich::RotSlotA => "RoT slot A", - CabooseWhich::RotSlotB => "RoT slot B", - }; - - let (board, name, version, git_commit) = - match sw_cabooses.get(&ic.sw_caboose_id) { - None => { - nbugs += 1; - ("-", "-", "-", "-") - } - Some(c) => ( - c.board.as_str(), - c.name.as_str(), - c.version.as_str(), - c.git_commit.as_str(), - ), - }; - - CabooseRow { slot, board, name, version, git_commit } - }); - - let table = tabled::Table::new(rows) - .with(tabled::settings::Style::empty()) - .with(tabled::settings::Padding::new(0, 1, 0, 0)) - .to_string(); - - println!("{}", textwrap::indent(&table.to_string(), " ")); - - if nbugs > 0 { - // Similar to above, if we don't have the sw_caboose for some - // inv_caboose, then it's a bug in either this tool (if we - // failed to fetch it) or the inventory system (if it failed to - // insert it). 
- println!( - "error: at least one caboose above was missing data \ - -- this is a bug" - ); - } - } + let sp_cabooses = &[ + ("SP slot 0", sp.slot0_inv_caboose_id), + ("SP slot 1", sp.slot1_inv_caboose_id), + ]; + inv_collection_print_cabooses(sp_cabooses, &inv_cabooses, &sw_cabooses); if let Some(rot) = rot { println!(" RoT: active slot: slot {:?}", rot.slot_active); @@ -1948,6 +1889,16 @@ async fn inv_collection_print_devices( .clone() .unwrap_or_else(|| String::from("-")) ); + + let rot_cabooses = &[ + ("RoT slot 0", rot.slot_a_inv_caboose_id), + ("RoT slot 1", rot.slot_b_inv_caboose_id), + ]; + inv_collection_print_cabooses( + rot_cabooses, + &inv_cabooses, + &sw_cabooses, + ); } else { println!(" RoT: no information found"); } @@ -2002,3 +1953,59 @@ async fn inv_collection_print_devices( Ok(()) } + +fn inv_collection_print_cabooses( + component_cabooses: &[(&'static str, Option)], + inv_cabooses: &BTreeMap, + sw_cabooses: &BTreeMap, +) { + println!(" cabooses:"); + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CabooseRow<'a> { + slot: &'static str, + board: &'a str, + name: &'a str, + version: &'a str, + git_commit: &'a str, + } + let mut nbugs = 0; + + let rows = component_cabooses.iter().map(|(slot, inv_caboose_id)| { + let sw_caboose = inv_caboose_id + .and_then(|inv_caboose_id| inv_cabooses.get(&inv_caboose_id)) + .and_then(|inv_caboose| { + sw_cabooses.get(&inv_caboose.sw_caboose_id) + }); + let (board, name, version, git_commit) = match sw_caboose { + None => { + nbugs += 1; + ("-", "-", "-", "-") + } + Some(c) => ( + c.board.as_str(), + c.name.as_str(), + c.version.as_str(), + c.git_commit.as_str(), + ), + }; + CabooseRow { slot, board, name, version, git_commit } + }); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", textwrap::indent(&table.to_string(), " ")); + + if nbugs > 0 { + // If we don't have the sw_caboose for some inv_caboose, then + // it's a bug in either this tool (if we failed to fetch it) or + // the inventory system (if it failed to insert it). + println!( + "error: at least one caboose above was missing data \ + -- this is a bug" + ); + } +} diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index e9c1ee1f98..1fbdce570b 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -72,42 +72,6 @@ impl From for HwRotSlot { } } -// See [`nexus_types::inventory::CabooseWhich`]. -impl_enum_type!( - #[derive(SqlType, Debug, QueryId)] - #[diesel(postgres_type(name = "caboose_which"))] - pub struct CabooseWhichEnum; - - #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] - #[diesel(sql_type = CabooseWhichEnum)] - pub enum CabooseWhich; - - // Enum values - SpSlot0 => b"sp_slot_0" - SpSlot1 => b"sp_slot_1" - RotSlotA => b"rot_slot_A" - RotSlotB => b"rot_slot_B" -); - -impl From for CabooseWhich { - fn from(c: nexus_types::inventory::CabooseWhich) -> Self { - match c { - nexus_types::inventory::CabooseWhich::SpSlot0 => { - CabooseWhich::SpSlot0 - } - nexus_types::inventory::CabooseWhich::SpSlot1 => { - CabooseWhich::SpSlot1 - } - nexus_types::inventory::CabooseWhich::RotSlotA => { - CabooseWhich::RotSlotA - } - nexus_types::inventory::CabooseWhich::RotSlotB => { - CabooseWhich::RotSlotB - } - } - } -} - // See [`nexus_types::inventory::SpType`]. 
impl_enum_type!( #[derive(SqlType, Debug, QueryId)] @@ -251,6 +215,9 @@ pub struct InvServiceProcessor { pub baseboard_revision: BaseboardRevision, pub hubris_archive_id: String, pub power_state: HwPowerState, + + pub slot0_inv_caboose_id: Option, + pub slot1_inv_caboose_id: Option, } /// Newtype wrapping the MGS-reported slot number for an SP @@ -338,6 +305,9 @@ pub struct InvRootOfTrust { pub slot_boot_pref_persistent_pending: Option, pub slot_a_sha3_256: Option, pub slot_b_sha3_256: Option, + + pub slot_a_inv_caboose_id: Option, + pub slot_b_inv_caboose_id: Option, } /// See [`nexus_types::inventory::CabooseFound`]. @@ -345,10 +315,9 @@ pub struct InvRootOfTrust { #[diesel(table_name = inv_caboose)] pub struct InvCaboose { pub inv_collection_id: Uuid, - pub hw_baseboard_id: Uuid, pub time_collected: DateTime, pub source: String, - pub which: CabooseWhich, + pub id: Uuid, pub sw_caboose_id: Uuid, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 0b41733e6d..13e104251b 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1177,6 +1177,9 @@ table! { baseboard_revision -> Int8, hubris_archive_id -> Text, power_state -> crate::HwPowerStateEnum, + + slot0_inv_caboose_id -> Nullable, + slot1_inv_caboose_id -> Nullable, } } @@ -1193,17 +1196,19 @@ table! { slot_boot_pref_persistent_pending -> Nullable, slot_a_sha3_256 -> Nullable, slot_b_sha3_256 -> Nullable, + + slot_a_inv_caboose_id -> Nullable, + slot_b_inv_caboose_id -> Nullable, } } table! { - inv_caboose (inv_collection_id, hw_baseboard_id, which) { + inv_caboose (id) { + id -> Uuid, inv_collection_id -> Uuid, - hw_baseboard_id -> Uuid, time_collected -> Timestamptz, source -> Text, - which -> crate::CabooseWhichEnum, sw_caboose_id -> Uuid, } } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index e58aae3d1e..664a06cd8e 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -11,19 +11,12 @@ use crate::db::error::ErrorHandler; use crate::db::TransactionError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; -use chrono::DateTime; -use chrono::Utc; -use diesel::sql_types; use diesel::sql_types::Nullable; -use diesel::Column; use diesel::ExpressionMethods; use diesel::IntoSql; use diesel::NullableExpressionMethods; use diesel::QueryDsl; -use diesel::QuerySource; use diesel::Table; -use nexus_db_model::CabooseWhich; -use nexus_db_model::CabooseWhichEnum; use nexus_db_model::HwBaseboardId; use nexus_db_model::HwPowerState; use nexus_db_model::HwPowerStateEnum; @@ -34,8 +27,6 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::SpType; use nexus_db_model::SpTypeEnum; use nexus_db_model::SwCaboose; -use nexus_types::inventory::BaseboardId; -use nexus_types::inventory::CabooseFound; use nexus_types::inventory::Collection; use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; @@ -68,6 +59,15 @@ impl DataStore { .iter() .map(|s| SwCaboose::from(s.as_ref())) .collect::>(); + let cabooses_found: Vec<_> = collection + .sps + .iter() + .flat_map(|(_, sp)| [&sp.slot0_caboose, &sp.slot1_caboose]) + .chain(collection.rots.iter().flat_map(|(_, rot)| { + [&rot.slot_a_caboose, &rot.slot_b_caboose] + })) + .flatten() + .collect(); let error_values = collection .errors .iter() @@ -178,6 +178,18 @@ impl DataStore { .into_sql::(), HwPowerState::from(sp.power_state) .into_sql::(), + sp.slot0_caboose + 
.as_ref() + .map(|c| c.id) + .into_sql::>(), + sp.slot1_caboose + .as_ref() + .map(|c| c.id) + .into_sql::>(), )) .filter( baseboard_dsl::part_number @@ -202,6 +214,8 @@ impl DataStore { sp_dsl::baseboard_revision, sp_dsl::hubris_archive_id, sp_dsl::power_state, + sp_dsl::slot0_inv_caboose_id, + sp_dsl::slot1_inv_caboose_id, )) .execute_async(&conn) .await?; @@ -231,6 +245,8 @@ impl DataStore { _baseboard_revision, _hubris_archive_id, _power_state, + _slot0_inv_caboose_id, + _slot1_inv_caboose_id, ) = sp_dsl::inv_service_processor::all_columns(); } } @@ -269,6 +285,18 @@ impl DataStore { .clone() .into_sql::>( ), + rot.slot_a_caboose + .as_ref() + .map(|c| c.id) + .into_sql::>(), + rot.slot_b_caboose + .as_ref() + .map(|c| c.id) + .into_sql::>(), )) .filter( baseboard_dsl::part_number @@ -294,6 +322,8 @@ impl DataStore { rot_dsl::slot_boot_pref_transient, rot_dsl::slot_a_sha3_256, rot_dsl::slot_b_sha3_256, + rot_dsl::slot_a_inv_caboose_id, + rot_dsl::slot_b_inv_caboose_id, )) .execute_async(&conn) .await?; @@ -312,26 +342,71 @@ impl DataStore { _slot_boot_pref_transient, _slot_a_sha3_256, _slot_b_sha3_256, + _slot_a_inv_caboose_id, + _slot_b_inv_caboose_id, ) = rot_dsl::inv_root_of_trust::all_columns(); } } - // Insert rows for the cabooses that we found. Like service - // processors and roots of trust, we do this using INSERT INTO ... - // SELECT. But because there are two foreign keys, we need a more - // complicated `SELECT`, which requires using a CTE. - for (which, tree) in &collection.cabooses_found { - let db_which = nexus_db_model::CabooseWhich::from(*which); - for (baseboard_id, found_caboose) in tree { - InvCabooseInsert::new( - collection_id, - baseboard_id, - found_caboose, - db_which, - ) - .execute_async(&conn) - .await?; + // Insert records for cabooses found. Like the others, we do this + // using INSERT INTO ... SELECT because we need ids from the + // `sw_caboose` table that we may not have. + { + use db::schema::inv_caboose::dsl as inv_dsl; + use db::schema::sw_caboose::dsl as sw_dsl; + + for caboose_found in &cabooses_found { + let selection = db::schema::sw_caboose::table + .select(( + caboose_found + .id + .into_sql::(), + collection_id.into_sql::(), + caboose_found + .time_collected + .into_sql::(), + caboose_found + .source + .clone() + .into_sql::(), + sw_dsl::id, + )) + .filter( + sw_dsl::board + .eq(caboose_found.caboose.board.clone()), + ) + .filter( + sw_dsl::git_commit + .eq(caboose_found.caboose.git_commit.clone()), + ) + .filter( + sw_dsl::name.eq(caboose_found.caboose.name.clone()), + ) + .filter( + sw_dsl::version + .eq(caboose_found.caboose.version.clone()), + ); + + let _ = diesel::insert_into(db::schema::inv_caboose::table) + .values(selection) + .into_columns(( + inv_dsl::id, + inv_dsl::inv_collection_id, + inv_dsl::time_collected, + inv_dsl::source, + inv_dsl::sw_caboose_id, + )) + .execute_async(&conn) + .await?; } + + let ( + _id, + _inv_collection_id, + _time_collected, + _source, + _sw_caboose_id, + ) = inv_dsl::inv_caboose::all_columns(); } // Finally, insert the list of errors. 
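The `cabooses_found` list assembled at the top of this function flat-maps each SP's and RoT's optional per-slot cabooses into one flat list before inserting them. The same iterator pattern in miniature, with stand-in types rather than the real inventory structs:

    use std::sync::Arc;

    // Stand-ins for ServiceProcessor / RotState, each with optional per-slot
    // caboose records (None when that slot's caboose couldn't be fetched).
    struct Sp {
        slot0_caboose: Option<Arc<String>>,
        slot1_caboose: Option<Arc<String>>,
    }
    struct Rot {
        slot_a_caboose: Option<Arc<String>>,
        slot_b_caboose: Option<Arc<String>>,
    }

    fn main() {
        let sps = vec![Sp {
            slot0_caboose: Some(Arc::new(String::from("sp slot 0 caboose"))),
            slot1_caboose: None,
        }];
        let rots = vec![Rot {
            slot_a_caboose: Some(Arc::new(String::from("rot slot A caboose"))),
            slot_b_caboose: Some(Arc::new(String::from("rot slot B caboose"))),
        }];

        // Each SP and RoT contributes its two Option-valued slots; `flatten`
        // drops the Nones, leaving one flat list of cabooses to insert.
        let cabooses_found: Vec<&Arc<String>> = sps
            .iter()
            .flat_map(|sp| [&sp.slot0_caboose, &sp.slot1_caboose])
            .chain(
                rots.iter()
                    .flat_map(|rot| [&rot.slot_a_caboose, &rot.slot_b_caboose]),
            )
            .flatten()
            .collect();

        assert_eq!(cabooses_found.len(), 3);
    }
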
@@ -664,251 +739,3 @@ impl DataStore { Ok(()) } } - -/// A SQL common table expression (CTE) used to insert into `inv_caboose` -/// -/// Concretely, we have these three tables: -/// -/// - `hw_baseboard` with an "id" primary key and lookup columns "part_number" -/// and "serial_number" -/// - `sw_caboose` with an "id" primary key and lookup columns "board", -/// "git_commit", "name", and "version" -/// - `inv_caboose` with foreign keys "hw_baseboard_id", "sw_caboose_id", and -/// various other columns -/// -/// We want to INSERT INTO `inv_caboose` a row with: -/// -/// - hw_baseboard_id (foreign key) the result of looking up an hw_baseboard row -/// by part number and serial number provided by the caller -/// -/// - sw_caboose_id (foreign key) the result of looking up a sw_caboose row by -/// board, git_commit, name, and version provided by the caller -/// -/// - the other columns being literals provided by the caller -/// -/// To achieve this, we're going to generate something like: -/// -/// WITH -/// my_new_row -/// AS ( -/// SELECT -/// hw_baseboard.id, /* `hw_baseboard` foreign key */ -/// sw_caboose.id, /* `sw_caboose` foreign key */ -/// ... /* caller-provided literal values for the rest */ -/// /* of the new inv_caboose row */ -/// FROM -/// hw_baseboard, -/// sw_caboose -/// WHERE -/// hw_baseboard.part_number = ... /* caller-provided part number */ -/// hw_baseboard.serial_number = ... /* caller-provided serial number */ -/// sw_caboose.board = ... /* caller-provided board */ -/// sw_caboose.git_commit = ... /* caller-provided git_commit */ -/// sw_caboose.name = ... /* caller-provided name */ -/// sw_caboose.version = ... /* caller-provided version */ -/// ) INSERT INTO -/// inv_caboose (... /* inv_caboose columns */) -/// SELECT * from my_new_row; -/// -/// The whole point is to avoid back-and-forth between the client and the -/// database. Those back-and-forth interactions can significantly increase -/// latency and the probability of transaction conflicts. See RFD 192 for -/// details. -#[must_use = "Queries must be executed"] -struct InvCabooseInsert { - // fields used to look up baseboard id - baseboard_part_number: String, - baseboard_serial_number: String, - - // fields used to look up caboose id - caboose_board: String, - caboose_git_commit: String, - caboose_name: String, - caboose_version: String, - - // literal values for the rest of the inv_caboose columns - collection_id: Uuid, - time_collected: DateTime, - source: String, - which: CabooseWhich, - - // These are Diesel structures representing table names in the "from" or - // "into" parts of queries (e.g., "SELECT FROM tablename" or "INSERT INTO - // tablename"). We need this in `walk_ast()` below, but they must outlive - // `walk_ast()`, so they need to be created ahead of time. - // - // TODO-cleanup These Diesel-internal types are nasty. It's not clear how - // else to do this. 
- from_hw_baseboard_id: - diesel::internal::table_macro::StaticQueryFragmentInstance< - db::schema::hw_baseboard_id::table, - >, - from_sw_caboose: diesel::internal::table_macro::StaticQueryFragmentInstance< - db::schema::sw_caboose::table, - >, - into_inv_caboose: - diesel::internal::table_macro::StaticQueryFragmentInstance< - db::schema::inv_caboose::table, - >, -} - -impl InvCabooseInsert { - pub fn new( - collection_id: Uuid, - baseboard: &BaseboardId, - found_caboose: &CabooseFound, - which: CabooseWhich, - ) -> InvCabooseInsert { - InvCabooseInsert { - baseboard_part_number: baseboard.part_number.clone(), - baseboard_serial_number: baseboard.serial_number.clone(), - caboose_board: found_caboose.caboose.board.clone(), - caboose_git_commit: found_caboose.caboose.git_commit.clone(), - caboose_name: found_caboose.caboose.name.clone(), - caboose_version: found_caboose.caboose.version.clone(), - collection_id, - time_collected: found_caboose.time_collected, - source: found_caboose.source.clone(), - which, - from_hw_baseboard_id: db::schema::hw_baseboard_id::table - .from_clause(), - from_sw_caboose: db::schema::sw_caboose::table.from_clause(), - // It sounds a little goofy to use "from_clause()" when this is - // really part of an INSERT. But really this just produces the - // table name as an identifier. This is the same for both "FROM" - // and "INSERT" clauses. And diesel internally does the same thing - // here (see the type of `InsertStatement::into_clause`). - into_inv_caboose: db::schema::inv_caboose::table.from_clause(), - } - } -} - -impl diesel::query_builder::QueryFragment for InvCabooseInsert { - fn walk_ast<'b>( - &'b self, - mut pass: diesel::query_builder::AstPass<'_, 'b, diesel::pg::Pg>, - ) -> diesel::QueryResult<()> { - use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id; - use db::schema::inv_caboose::dsl as dsl_inv_caboose; - use db::schema::sw_caboose::dsl as dsl_sw_caboose; - - pass.unsafe_to_cache_prepared(); - pass.push_sql("WITH my_new_row AS ("); - - pass.push_sql("SELECT "); - - // Emit the values that we're going to insert into `inv_caboose`. - // First, emit the looked-up foreign keys. - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_baseboard_id::id::NAME)?; - pass.push_sql(", "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::id::NAME)?; - pass.push_sql(", "); - // Next, emit the literal values used for the rest of the columns. - pass.push_bind_param::(&self.collection_id)?; - pass.push_sql(", "); - pass.push_bind_param::( - &self.time_collected, - )?; - pass.push_sql(", "); - pass.push_bind_param::(&self.source)?; - pass.push_sql(", "); - pass.push_bind_param::(&self.which)?; - - // Finish the SELECT by adding the list of tables and the WHERE to pick - // out only the relevant row from each tables. 
- pass.push_sql(" FROM "); - - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql(", "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - - pass.push_sql(" WHERE "); - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_baseboard_id::part_number::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::( - &self.baseboard_part_number, - )?; - pass.push_sql(" AND "); - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_baseboard_id::serial_number::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::( - &self.baseboard_serial_number, - )?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::board::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_board)?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::git_commit::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_git_commit)?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::name::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_name)?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::version::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_version)?; - - pass.push_sql(")\n"); // end of the SELECT query within the WITH - - pass.push_sql("INSERT INTO "); - self.into_inv_caboose.walk_ast(pass.reborrow())?; - - pass.push_sql("("); - pass.push_identifier(dsl_inv_caboose::hw_baseboard_id::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::sw_caboose_id::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::inv_collection_id::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::time_collected::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::source::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::which::NAME)?; - pass.push_sql(")\n"); - pass.push_sql("SELECT * FROM my_new_row"); - - // See the comment in inventory_insert_collection() where we use - // `inv_service_processor::all_columns()`. The same applies here. - // If you update the statement below because the schema for - // `inv_caboose` has changed, be sure to update the code above, too! - let ( - _hw_baseboard_id, - _sw_caboose_id, - _inv_collection_id, - _time_collected, - _source, - _which, - ) = dsl_inv_caboose::inv_caboose::all_columns(); - - Ok(()) - } -} - -// This is required to be able to call `inv_caboose_insert.execute_async()`. -impl diesel::RunQueryDsl for InvCabooseInsert {} - -// This is required to be able to call `inv_caboose_insert.execute_async()`. -impl diesel::query_builder::QueryId for InvCabooseInsert { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index d987527024..c8c3d48874 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -9,23 +9,32 @@ //! collected. 
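This patch introduces an `SpSlot` enum deriving `strum::EnumIter` (and adds the same derive to `RotSlot` in gateway-client) so the collector can simply iterate over every firmware slot. A small standalone illustration of that derive, assuming the `strum` crate with its `derive` feature; the enum here mirrors the shape of `SpSlot` but is not the real type:

    use strum::IntoEnumIterator;

    // Mirrors the shape of the SpSlot enum added below; illustrative only.
    #[derive(Debug, Clone, Copy, strum::EnumIter)]
    enum SpSlot {
        Slot0,
        Slot1,
    }

    fn main() {
        // The collector walks every slot and maps it to the numeric firmware
        // slot that MGS expects in its caboose request.
        for slot in SpSlot::iter() {
            let slot_num = match slot {
                SpSlot::Slot0 => 0,
                SpSlot::Slot1 => 1,
            };
            println!("fetch SP caboose for {slot:?} (MGS firmware slot {slot_num})");
        }
    }
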
use anyhow::anyhow; +use anyhow::Context; use chrono::DateTime; use chrono::Utc; +use gateway_client::types::RotSlot; use gateway_client::types::SpComponentCaboose; use gateway_client::types::SpState; use gateway_client::types::SpType; use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Caboose; use nexus_types::inventory::CabooseFound; -use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::Collection; use nexus_types::inventory::RotState; use nexus_types::inventory::ServiceProcessor; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::sync::Arc; +use strum::EnumIter; use uuid::Uuid; +/// Identifies one of a service processor's two firmware slots +#[derive(Debug, Clone, Copy, EnumIter)] +pub enum SpSlot { + Slot0, + Slot1, +} + /// Build an inventory [`Collection`] /// /// This interface is oriented around the interfaces used by an actual @@ -41,8 +50,6 @@ pub struct CollectionBuilder { cabooses: BTreeSet>, sps: BTreeMap, ServiceProcessor>, rots: BTreeMap, RotState>, - cabooses_found: - BTreeMap, CabooseFound>>, } impl CollectionBuilder { @@ -60,7 +67,6 @@ impl CollectionBuilder { cabooses: BTreeSet::new(), sps: BTreeMap::new(), rots: BTreeMap::new(), - cabooses_found: BTreeMap::new(), } } @@ -76,7 +82,6 @@ impl CollectionBuilder { cabooses: self.cabooses, sps: self.sps, rots: self.rots, - cabooses_found: self.cabooses_found, } } @@ -134,6 +139,9 @@ impl CollectionBuilder { baseboard_revision: sp_state.revision, hubris_archive: sp_state.hubris_archive_id, power_state: sp_state.power_state, + + slot0_caboose: None, + slot1_caboose: None, } }); @@ -157,6 +165,8 @@ impl CollectionBuilder { transient_boot_preference, slot_a_sha3_256_digest, slot_b_sha3_256_digest, + slot_a_caboose: None, + slot_b_caboose: None, } }); } @@ -175,19 +185,47 @@ impl CollectionBuilder { Some(baseboard) } - /// Returns true if we already found the caboose for `which` for baseboard - /// `baseboard` + /// Returns true if we already found the SP caboose for slot `slot` for + /// baseboard `baseboard` + /// + /// This is used to avoid requesting it multiple times (from multiple MGS + /// instances). + pub fn found_sp_caboose_already( + &self, + baseboard: &BaseboardId, + slot: SpSlot, + ) -> bool { + self.sps + .get(baseboard) + .map(|sp| { + let sp_slot = match slot { + SpSlot::Slot0 => &sp.slot0_caboose, + SpSlot::Slot1 => &sp.slot1_caboose, + }; + sp_slot.is_some() + }) + .unwrap_or(false) + } + + /// Returns true if we already found the RoT caboose for slot `slot` for + /// baseboard `baseboard` /// /// This is used to avoid requesting it multiple times (from multiple MGS /// instances). - pub fn sp_found_caboose_already( + pub fn found_rot_caboose_already( &self, baseboard: &BaseboardId, - which: CabooseWhich, + slot: RotSlot, ) -> bool { - self.cabooses_found - .get(&which) - .map(|map| map.contains_key(baseboard)) + self.rots + .get(baseboard) + .map(|rot| { + let rot_slot = match slot { + RotSlot::A => &rot.slot_a_caboose, + RotSlot::B => &rot.slot_b_caboose, + }; + rot_slot.is_some() + }) .unwrap_or(false) } @@ -201,7 +239,7 @@ impl CollectionBuilder { pub fn found_sp_caboose( &mut self, baseboard: &BaseboardId, - which: CabooseWhich, + slot: SpSlot, source: &str, caboose: SpComponentCaboose, ) -> Result<(), anyhow::Error> { @@ -210,40 +248,87 @@ impl CollectionBuilder { // new one. 
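The `enum_item` helper invoked on the next line (later renamed `normalize_item`) is a lookup-or-insert over a `BTreeSet<Arc<T>>`. A self-contained sketch of that pattern, illustrative rather than the builder's exact body:

    use std::collections::BTreeSet;
    use std::sync::Arc;

    // Lookup-or-insert: if an equivalent item is already in the set, return the
    // existing Arc; otherwise insert the new item and return it, so repeated
    // observations of the same value share one record.
    fn normalize_item<T: Ord>(items: &mut BTreeSet<Arc<T>>, item: T) -> Arc<T> {
        if let Some(existing) = items.get(&item) {
            return existing.clone();
        }
        let new = Arc::new(item);
        items.insert(new.clone());
        new
    }

    fn main() {
        let mut cabooses: BTreeSet<Arc<String>> = BTreeSet::new();
        let a = normalize_item(&mut cabooses, String::from("gimlet-d 1.0.13"));
        let b = normalize_item(&mut cabooses, String::from("gimlet-d 1.0.13"));
        assert!(Arc::ptr_eq(&a, &b)); // both callers share the same record
        assert_eq!(cabooses.len(), 1);
    }
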
let sw_caboose = Self::enum_item(&mut self.cabooses, Caboose::from(caboose)); - let (baseboard, _) = - self.sps.get_key_value(baseboard).ok_or_else(|| { - anyhow!( - "reporting caboose for unknown baseboard: {:?} ({:?})", - baseboard, - sw_caboose - ) - })?; - let by_id = - self.cabooses_found.entry(which).or_insert_with(|| BTreeMap::new()); - if let Some(previous) = by_id.insert( - baseboard.clone(), - CabooseFound { - time_collected: Utc::now(), - source: source.to_owned(), - caboose: sw_caboose.clone(), - }, - ) { - let error = if *previous.caboose == *sw_caboose { - anyhow!("reported multiple times (same value)",) - } else { - anyhow!( - "reported caboose multiple times (previously {:?}, \ + + // Find the SP. + let sp = self.sps.get_mut(baseboard).ok_or_else(|| { + anyhow!( + "reporting caboose for unknown baseboard: {:?} ({:?})", + baseboard, + sw_caboose + ) + })?; + let sp_slot = match slot { + SpSlot::Slot0 => &mut sp.slot0_caboose, + SpSlot::Slot1 => &mut sp.slot1_caboose, + }; + Self::record_caboose(sp_slot, source, sw_caboose) + .context(format!("baseboard {:?} SP caboose {:?}", baseboard, slot)) + } + + /// Record the given root of trust caboose information found for the given + /// baseboard + /// + /// The baseboard must previously have been reported using + /// `found_sp_state()`. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_rot_caboose( + &mut self, + baseboard: &BaseboardId, + slot: RotSlot, + source: &str, + caboose: SpComponentCaboose, + ) -> Result<(), anyhow::Error> { + // Normalize the caboose contents: i.e., if we've seen this exact caboose + // contents before, use the same record from before. Otherwise, make a + // new one. + let sw_caboose = + Self::enum_item(&mut self.cabooses, Caboose::from(caboose)); + + // Find the RoT state. Note that it's possible that we _do_ have + // caboose information for an RoT that we have no information about + // because the SP couldn't talk to the RoT when we asked for its state, + // but was able to do so when we got the caboose. This seems unlikely. + let rot = self.rots.get_mut(baseboard).ok_or_else(|| { + anyhow!( + "reporting caboose for unknown baseboard: {:?} ({:?})", + baseboard, + sw_caboose + ) + })?; + let rot_slot = match slot { + RotSlot::A => &mut rot.slot_a_caboose, + RotSlot::B => &mut rot.slot_b_caboose, + }; + Self::record_caboose(rot_slot, source, sw_caboose).context(format!( + "baseboard {:?} RoT caboose {:?}", + baseboard, slot + )) + } + + fn record_caboose( + slot: &mut Option>, + source: &str, + sw_caboose: Arc, + ) -> Result<(), anyhow::Error> { + let old = slot.replace(Arc::new(CabooseFound { + id: Uuid::new_v4(), + time_collected: Utc::now(), + source: source.to_owned(), + caboose: sw_caboose.clone(), + })); + match old { + None => Ok(()), + Some(previous) if *previous.caboose == *sw_caboose => { + Err(anyhow!("reported multiple times (same value)")) + } + Some(previous) => Err(anyhow!( + "reported caboose multiple times (previously {:?}, \ now {:?})", - previous, - sw_caboose - ) - }; - Err(error.context(format!( - "baseboard {:?} caboose {:?}", - baseboard, which - ))) - } else { - Ok(()) + previous, + sw_caboose + )), } } diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index b410326904..eb9a396889 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -4,9 +4,9 @@ //! 
Collection of inventory from Omicron components -use crate::builder::CollectionBuilder; +use crate::builder::{CollectionBuilder, SpSlot}; use anyhow::Context; -use nexus_types::inventory::CabooseWhich; +use gateway_client::types::RotSlot; use nexus_types::inventory::Collection; use slog::{debug, error}; use std::sync::Arc; @@ -146,32 +146,78 @@ impl Collector { // fetched already, fetch it and record it. Generally, we'd only // get here for the first MGS client. Assuming that one succeeds, // the other(s) will skip this loop. - for which in CabooseWhich::iter() { + for sp_slot in SpSlot::iter() { if self .in_progress - .sp_found_caboose_already(&baseboard_id, which) + .found_sp_caboose_already(&baseboard_id, sp_slot) { continue; } - let (component, slot) = match which { - CabooseWhich::SpSlot0 => ("sp", 0), - CabooseWhich::SpSlot1 => ("sp", 1), - CabooseWhich::RotSlotA => ("rot", 0), - CabooseWhich::RotSlotB => ("rot", 1), + let slot_num = match sp_slot { + SpSlot::Slot0 => 0, + SpSlot::Slot1 => 1, + }; + + let result = client + .sp_component_caboose_get(sp.type_, sp.slot, "sp", slot_num) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {:?}: SP caboose {:?}", + client.baseurl(), + sp, + sp_slot + ) + }); + let caboose = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + if let Err(error) = self.in_progress.found_sp_caboose( + &baseboard_id, + sp_slot, + client.baseurl(), + caboose, + ) { + error!( + &self.log, + "error reporting caboose: {:?} SP {:?} {:?}: {:#}", + baseboard_id, + sp_slot, + client.baseurl(), + error + ); + } + } + + for rot_slot in RotSlot::iter() { + if self + .in_progress + .found_rot_caboose_already(&baseboard_id, rot_slot) + { + continue; + } + + let slot_num = match rot_slot { + RotSlot::A => 0, + RotSlot::B => 1, }; let result = client .sp_component_caboose_get( - sp.type_, sp.slot, component, slot, + sp.type_, sp.slot, "rot", slot_num, ) .await .with_context(|| { format!( - "MGS {:?}: SP {:?}: caboose {:?}", + "MGS {:?}: SP {:?}: RoT caboose {:?}", client.baseurl(), sp, - which + rot_slot ) }); let caboose = match result { @@ -181,17 +227,17 @@ impl Collector { } Ok(response) => response.into_inner(), }; - if let Err(error) = self.in_progress.found_sp_caboose( + if let Err(error) = self.in_progress.found_rot_caboose( &baseboard_id, - which, + rot_slot, client.baseurl(), caboose, ) { error!( &self.log, - "error reporting caboose: {:?} {:?} {:?}: {:#}", + "error reporting caboose: {:?} RoT {:?} {:?}: {:#}", baseboard_id, - which, + rot_slot, client.baseurl(), error ); diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 53dd59aef2..370844b388 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -17,7 +17,6 @@ pub use gateway_client::types::SpType; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::sync::Arc; -use strum::EnumIter; use uuid::Uuid; /// Results of collecting hardware/software inventory from various Omicron @@ -66,13 +65,6 @@ pub struct Collection { /// /// In practice, these will be inserted into the `inv_root_of_trust` table. pub rots: BTreeMap, RotState>, - /// all caboose contents found, keyed first by the kind of caboose - /// (`CabooseWhich`), then the baseboard id of the sled where they were - /// found - /// - /// In practice, these will be inserted into the `inv_caboose` table. 
- pub cabooses_found: - BTreeMap, CabooseFound>>, } /// A unique baseboard id found during a collection @@ -126,6 +118,7 @@ impl From for Caboose { /// particular source, but these are only for debugging) #[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] pub struct CabooseFound { + pub id: Uuid, pub time_collected: DateTime, pub source: String, pub caboose: Arc, @@ -143,6 +136,9 @@ pub struct ServiceProcessor { pub baseboard_revision: u32, pub hubris_archive: String, pub power_state: PowerState, + + pub slot0_caboose: Option>, + pub slot1_caboose: Option>, } /// Describes the root of trust state found (from a service processor) during @@ -158,13 +154,7 @@ pub struct RotState { pub transient_boot_preference: Option, pub slot_a_sha3_256_digest: Option, pub slot_b_sha3_256_digest: Option, -} -/// Describes which caboose this is (which component, which slot) -#[derive(Clone, Copy, Debug, EnumIter, PartialEq, Eq, PartialOrd, Ord)] -pub enum CabooseWhich { - SpSlot0, - SpSlot1, - RotSlotA, - RotSlotB, + pub slot_a_caboose: Option>, + pub slot_b_caboose: Option>, } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 19b164b240..311a6afcf5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2528,7 +2528,13 @@ BEGIN; * records is written. * * All rows in the `inv_*` tables point back to a particular collection. They - * represent the state observed at some particular time. + * represent the state observed at some particular time. Generally, if two + * observations came from two different places, they're not put into the same + * row of the same table. For example, caboose information comes from the SP, + * but it doesn't go into the `inv_service_processor` table. It goes in a + * separate `inv_caboose` table. This is debatable but it preserves a clearer + * record of exactly what information came from where, since the separate record + * has its own "source" and "time_collected". * * Information about service processors and roots of trust are joined with * information reported by sled agents via the baseboard id. @@ -2662,6 +2668,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( hubris_archive_id TEXT NOT NULL, power_state omicron.public.hw_power_state NOT NULL, + -- Caboose information (foreign keys into `inv_caboose`). The requests to + -- fetch this information can individually fail so these fields can be NULL. + slot0_inv_caboose_id UUID, + slot1_inv_caboose_id UUID, + PRIMARY KEY (inv_collection_id, hw_baseboard_id) ); @@ -2687,35 +2698,41 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( slot_a_sha3_256 TEXT, -- nullable slot_b_sha3_256 TEXT, -- nullable - PRIMARY KEY (inv_collection_id, hw_baseboard_id) -); + -- Caboose information (foreign keys into `inv_caboose`). The requests to + -- fetch this information can individually fail so these fields can be NULL. + slot_a_inv_caboose_id UUID, + slot_b_inv_caboose_id UUID, -CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( - 'sp_slot_0', - 'sp_slot_1', - 'rot_slot_A', - 'rot_slot_B' + PRIMARY KEY (inv_collection_id, hw_baseboard_id) ); -- cabooses found +-- +-- Rows in this table reflect that a particular caboose (`sw_caboose_id`) was +-- found in this collection, having been reported by `source` at +-- `time_collected`. It may be an SP or RoT caboose, and it could be in either +-- slot. To know which, you need to look at which field in +-- `inv_service_processor` or `inv_root_of_trust` points to it. 
+-- +-- Technically, we don't need `inv_collection_id` here because it's implied by +-- whatever points to this record. CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( -- where this observation came from -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, - -- which system this SP reports it is part of - -- (foreign key into `hw_baseboard_id` table) - hw_baseboard_id UUID NOT NULL, -- when this observation was made time_collected TIMESTAMPTZ NOT NULL, -- which MGS instance reported this data source TEXT NOT NULL, - which omicron.public.caboose_which NOT NULL, - sw_caboose_id UUID NOT NULL, - - PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) + id UUID PRIMARY KEY, + sw_caboose_id UUID NOT NULL ); +-- Allow us to paginate through the cabooses that are part of a collection +CREATE INDEX IF NOT EXISTS lookup_caboose ON omicron.public.inv_caboose ( + inv_collection_id, id +); /*******************************************************************/ From eaab37f40cf16c1f4ec6397ab3fa76186bb9a117 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 19 Oct 2023 16:05:34 -0700 Subject: [PATCH 04/20] initial review feedback --- dev-tools/omdb/src/bin/omdb/mgs.rs | 2 +- gateway/src/http_entrypoints.rs | 13 ++++--------- nexus/inventory/src/builder.rs | 8 ++++---- nexus/types/src/inventory.rs | 13 +++++-------- openapi/gateway.json | 4 ++-- wicketd/src/update_tracker.rs | 22 ++++++++-------------- 6 files changed, 24 insertions(+), 38 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/bin/omdb/mgs.rs index d2938418e1..770cba9f62 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs.rs @@ -433,7 +433,7 @@ async fn show_sp_details( board: caboose.board, git_commit: caboose.git_commit, name: caboose.name, - version: caboose.version.unwrap_or_else(|| "-".to_string()), + version: caboose.version, } } } diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 6d19f4c56e..e51f7509a5 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -27,8 +27,6 @@ use dropshot::UntypedBody; use dropshot::WebsocketEndpointResult; use dropshot::WebsocketUpgrade; use futures::TryFutureExt; -use gateway_messages::SpError; -use gateway_sp_comms::error::CommunicationError; use gateway_sp_comms::HostPhase2Provider; use omicron_common::update::ArtifactHash; use schemars::JsonSchema; @@ -422,7 +420,7 @@ pub struct SpComponentCaboose { pub git_commit: String, pub board: String, pub name: String, - pub version: Option, + pub version: String, } /// Identity of a host phase2 recovery image. 
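With this change, `version` becomes a required field of the caboose payload (the openapi diffs below move it into `required`). An illustrative hand-written mirror of the JSON shape, assuming serde with its derive feature and serde_json; the real client-side type is generated by progenitor from the OpenAPI document:

    use serde::Deserialize;

    // Hand-written stand-in for the caboose JSON shape after this change.
    #[derive(Debug, Deserialize)]
    struct SpComponentCaboose {
        board: String,
        git_commit: String,
        name: String,
        version: String, // now required, no longer Option<String>
    }

    fn main() {
        let ok = r#"{"board":"gimlet-d","git_commit":"abc123","name":"gimlet","version":"1.0.13"}"#;
        let caboose: SpComponentCaboose = serde_json::from_str(ok).unwrap();
        println!("parsed caboose: {caboose:?}");

        // A payload without "version" is now a deserialization error rather
        // than mapping to None.
        let missing = r#"{"board":"gimlet-d","git_commit":"abc123","name":"gimlet"}"#;
        assert!(serde_json::from_str::<SpComponentCaboose>(missing).is_err());
    }
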
@@ -659,18 +657,15 @@ async fn sp_component_caboose_get( .read_component_caboose(component, firmware_slot, CABOOSE_KEY_NAME) .await .map_err(SpCommsError::from)?; - let version = match sp + let version = sp .read_component_caboose(component, firmware_slot, CABOOSE_KEY_VERSION) .await - { - Ok(value) => Some(from_utf8(&CABOOSE_KEY_VERSION, value)?), - Err(CommunicationError::SpError(SpError::NoSuchCabooseKey(_))) => None, - Err(err) => return Err(SpCommsError::from(err).into()), - }; + .map_err(SpCommsError::from)?; let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; let name = from_utf8(&CABOOSE_KEY_NAME, name)?; + let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; let caboose = SpComponentCaboose { git_commit, board, name, version }; diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index c8c3d48874..fa6ae21363 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -117,7 +117,7 @@ impl CollectionBuilder { // Normalize the baseboard id: i.e., if we've seen this baseboard // before, use the same baseboard id record. Otherwise, make a new one. - let baseboard = Self::enum_item( + let baseboard = Self::normalize_item( &mut self.baseboards, BaseboardId { serial_number: sp_state.serial_number, @@ -247,7 +247,7 @@ impl CollectionBuilder { // contents before, use the same record from before. Otherwise, make a // new one. let sw_caboose = - Self::enum_item(&mut self.cabooses, Caboose::from(caboose)); + Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); // Find the SP. let sp = self.sps.get_mut(baseboard).ok_or_else(|| { @@ -284,7 +284,7 @@ impl CollectionBuilder { // contents before, use the same record from before. Otherwise, make a // new one. let sw_caboose = - Self::enum_item(&mut self.cabooses, Caboose::from(caboose)); + Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); // Find the RoT state. Note that it's possible that we _do_ have // caboose information for an RoT that we have no information about @@ -337,7 +337,7 @@ impl CollectionBuilder { /// If `item` (or its equivalent) is not already in `items`, insert it. /// Either way, return the item from `items`. (This will either be `item` /// itself or whatever was already in `items`.) - fn enum_item( + fn normalize_item( items: &mut BTreeSet>, item: T, ) -> Arc { diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 370844b388..ba889133ef 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -28,10 +28,10 @@ use uuid::Uuid; /// with separate records, even though they might come from the same source /// (in this case, a single MGS request). /// -/// We make heavy use of maps, sets, and Arcs here because many of these things -/// point to each other and this approach to representing relationships ensures -/// clear ownership. (It also reflects how things will wind up in the -/// database.) +/// We make heavy use of maps, sets, and Arcs here because some of these objects +/// are pointed-to by many other objects in the same Collection. This approach +/// ensures clear ownership. It also reflects how things will wind up in the +/// database. /// /// See the documentation in the database schema for more background. #[derive(Debug)] @@ -106,10 +106,7 @@ impl From for Caboose { board: c.board, git_commit: c.git_commit, name: c.name, - // The MGS API uses an `Option` here because old SP versions did not - // supply it. 
But modern SP versions do. So we should never hit - // this `unwrap_or()`. - version: c.version.unwrap_or(String::from("")), + version: c.version, } } } diff --git a/openapi/gateway.json b/openapi/gateway.json index 847d1f746d..6a8c72c73f 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -2184,14 +2184,14 @@ "type": "string" }, "version": { - "nullable": true, "type": "string" } }, "required": [ "board", "git_commit", - "name" + "name", + "version" ] }, "SpComponentDetails": { diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index 1bbda00158..05f57935a2 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -839,25 +839,21 @@ impl UpdateDriver { let message = format!( "SP board {}, version {} (git commit {})", - caboose.board, - caboose.version.as_deref().unwrap_or("unknown"), - caboose.git_commit + caboose.board, caboose.version, caboose.git_commit ); - match caboose.version.map(|v| v.parse::()) { - Some(Ok(version)) => { + match caboose.version.parse::() { + Ok(version) => { StepSuccess::new((sp_artifact, Some(version))) .with_message(message) .into() } - Some(Err(err)) => StepWarning::new( + Err(err) => StepWarning::new( (sp_artifact, None), format!( "{message} (failed to parse SP version: {err})" ), ) .into(), - None => StepWarning::new((sp_artifact, None), message) - .into(), } }, ) @@ -1599,8 +1595,7 @@ impl UpdateContext { let message = format!( "RoT slot {active_slot_name} version {} (git commit {})", - caboose.version.as_deref().unwrap_or("unknown"), - caboose.git_commit + caboose.version, caboose.git_commit ); let make_result = |active_version| RotInterrogation { @@ -1609,16 +1604,15 @@ impl UpdateContext { active_version, }; - match caboose.version.map(|v| v.parse::()) { - Some(Ok(version)) => StepSuccess::new(make_result(Some(version))) + match caboose.version.parse::() { + Ok(version) => StepSuccess::new(make_result(Some(version))) .with_message(message) .into(), - Some(Err(err)) => StepWarning::new( + Err(err) => StepWarning::new( make_result(None), format!("{message} (failed to parse RoT version: {err})"), ) .into(), - None => StepWarning::new(make_result(None), message).into(), } } From 308070034378408b9ef4bd8fa61187be8dbec72a Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 19 Oct 2023 16:37:57 -0700 Subject: [PATCH 05/20] cont: review feedback --- openapi/wicketd.json | 4 ++-- wicket/src/state/inventory.rs | 2 +- wicket/src/ui/panes/overview.rs | 7 +------ 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/openapi/wicketd.json b/openapi/wicketd.json index d67fc79f7a..0e278c9423 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -2331,14 +2331,14 @@ "type": "string" }, "version": { - "nullable": true, "type": "string" } }, "required": [ "board", "git_commit", - "name" + "name", + "version" ] }, "SpComponentInfo": { diff --git a/wicket/src/state/inventory.rs b/wicket/src/state/inventory.rs index 02019898e8..4b439c1414 100644 --- a/wicket/src/state/inventory.rs +++ b/wicket/src/state/inventory.rs @@ -148,7 +148,7 @@ pub enum Component { } fn version_or_unknown(caboose: Option<&SpComponentCaboose>) -> String { - caboose.and_then(|c| c.version.as_deref()).unwrap_or("UNKNOWN").to_string() + caboose.map(|c| c.version.as_str()).unwrap_or("UNKNOWN").to_string() } impl Component { diff --git a/wicket/src/ui/panes/overview.rs b/wicket/src/ui/panes/overview.rs index b7a04c055d..3e0b317df9 100644 --- a/wicket/src/ui/panes/overview.rs +++ b/wicket/src/ui/panes/overview.rs @@ -885,7 +885,6 
@@ fn append_caboose( } = caboose; let label_style = style::text_label(); let ok_style = style::text_success(); - let bad_style = style::text_failure(); spans.push( vec![ @@ -905,9 +904,5 @@ fn append_caboose( ); let mut version_spans = vec![prefix.clone(), Span::styled("Version: ", label_style)]; - if let Some(v) = version.as_ref() { - version_spans.push(Span::styled(v.clone(), ok_style)); - } else { - version_spans.push(Span::styled("Unknown", bad_style)); - } + version_spans.push(Span::styled(version, ok_style)); } From 90a6ad532a073bfd05f1d216ad8a89d80e0605d7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 19 Oct 2023 16:41:45 -0700 Subject: [PATCH 06/20] omdb test is flaky because of the order of background tasks printed out --- dev-tools/omdb/src/bin/omdb/nexus.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index cbfb6f91b8..e21d8630a8 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -145,7 +145,9 @@ async fn cmd_nexus_background_tasks_show( ) -> Result<(), anyhow::Error> { let response = client.bgtask_list().await.context("listing background tasks")?; - let mut tasks = response.into_inner(); + // Convert the HashMap to a BTreeMap because we want the keys in sorted + // order. + let mut tasks = response.into_inner().into_iter().collect::>(); // We want to pick the order that we print some tasks intentionally. Then // we want to print anything else that we find. From fc91640fb3d724a16032bea527d241c3648d3394 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 24 Oct 2023 21:11:17 -0700 Subject: [PATCH 07/20] add test for inventory builder --- nexus/inventory/src/builder.rs | 714 +++++++++++++++++++++++++++++++++ 1 file changed, 714 insertions(+) diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index fa6ae21363..1f25a3f519 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -361,3 +361,717 @@ impl CollectionBuilder { self.errors.push(error); } } + +#[cfg(test)] +mod test { + use super::CollectionBuilder; + use crate::builder::SpSlot; + use chrono::Utc; + use gateway_client::types::PowerState; + use gateway_client::types::RotSlot; + use gateway_client::types::RotState; + use gateway_client::types::SpComponentCaboose; + use gateway_client::types::SpState; + use gateway_client::types::SpType; + use nexus_types::inventory::BaseboardId; + use nexus_types::inventory::Caboose; + use strum::IntoEnumIterator; + + // Verify the contents of an empty collection. + #[test] + fn test_empty() { + let time_before = Utc::now(); + let builder = CollectionBuilder::new("test_empty"); + let collection = builder.build(); + let time_after = Utc::now(); + + assert!(collection.errors.is_empty()); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "test_empty"); + assert!(collection.baseboards.is_empty()); + assert!(collection.cabooses.is_empty()); + assert!(collection.sps.is_empty()); + assert!(collection.rots.is_empty()); + } + + // Simple test of a single, fairly typical collection that contains just + // about all kinds of valid data. That includes exercising: + // + // - all three baseboard types (switch, sled, PSC) + // - various valid values for all fields (sources, slot numbers, power + // states, baseboard revisions, cabooses, etc.) 
+ // - some empty slots + // - some missing cabooses + // - some cabooses common to multiple baseboards; others not + // - serial number reused across different model numbers + // + // This test is admittedly pretty tedious and maybe not worthwhile but it's + // a useful quick check. + #[test] + fn test_basic() { + let time_before = Utc::now(); + let mut builder = CollectionBuilder::new("test_basic"); + + // an ordinary, working sled + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest1", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest1", + )), + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // another ordinary sled with different values for ordinary fields + let sled2_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Sled, + 4, + SpState { + base_mac_address: [1; 6], + hubris_archive_id: String::from("hubris2"), + model: String::from("model2"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: Some(RotSlot::A), + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest2", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest2", + )), + transient_boot_preference: Some(RotSlot::B), + }, + // same serial number, which is okay because it's a different + // model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a switch + let switch1_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Switch, + 0, + SpState { + base_mac_address: [2; 6], + hubris_archive_id: String::from("hubris3"), + model: String::from("model3"), + power_state: PowerState::A1, + revision: 2, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest3", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest3", + )), + transient_boot_preference: None, + }, + // same serial number, which is okay because it's a different + // model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a PSC + let psc_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Power, + 1, + SpState { + base_mac_address: [3; 6], + hubris_archive_id: String::from("hubris4"), + model: String::from("model4"), + power_state: PowerState::A2, + revision: 3, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest4", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest4", + )), + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // a sled with no RoT state or other optional fields + let sled3_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 5, + SpState { + base_mac_address: [4; 6], + hubris_archive_id: String::from("hubris5"), + model: String::from("model1"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::CommunicationFailed { + message: 
String::from("test suite injected error"), + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // Report some cabooses. + + // We'll use the same cabooses for most of these components, although + // that's not possible in a real system. We deliberately construct a + // new value each time to make sure the builder correctly normalizes it. + let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb]; + for bb in &common_caboose_baseboards { + for slot in SpSlot::iter() { + assert!(!builder.found_sp_caboose_already(bb, slot)); + let _ = builder + .found_sp_caboose( + bb, + slot, + "test suite", + SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }, + ) + .unwrap(); + assert!(builder.found_sp_caboose_already(bb, slot)); + } + + for slot in RotSlot::iter() { + assert!(!builder.found_rot_caboose_already(bb, slot)); + let _ = builder.found_rot_caboose( + bb, + slot, + "test suite", + SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }, + ); + assert!(builder.found_rot_caboose_already(bb, slot)); + } + } + + // For the PSC, use different cabooses for both slots of both the SP and + // RoT, just to exercise that we correctly keep track of different + // cabooses. + let _ = builder + .found_sp_caboose( + &psc_bb, + SpSlot::Slot0, + "test suite", + SpComponentCaboose { + board: String::from("psc_sp_0"), + git_commit: String::from("psc_sp_0"), + name: String::from("psc_sp_0"), + version: String::from("psc_sp_0"), + }, + ) + .unwrap(); + let _ = builder + .found_sp_caboose( + &psc_bb, + SpSlot::Slot1, + "test suite", + SpComponentCaboose { + board: String::from("psc_sp_1"), + git_commit: String::from("psc_sp_1"), + name: String::from("psc_sp_1"), + version: String::from("psc_sp_1"), + }, + ) + .unwrap(); + let _ = builder + .found_rot_caboose( + &psc_bb, + RotSlot::A, + "test suite", + SpComponentCaboose { + board: String::from("psc_rot_a"), + git_commit: String::from("psc_rot_a"), + name: String::from("psc_rot_a"), + version: String::from("psc_rot_a"), + }, + ) + .unwrap(); + let _ = builder + .found_rot_caboose( + &psc_bb, + RotSlot::B, + "test suite", + SpComponentCaboose { + board: String::from("psc_rot_b"), + git_commit: String::from("psc_rot_b"), + name: String::from("psc_rot_b"), + version: String::from("psc_rot_b"), + }, + ) + .unwrap(); + + // We deliberately provide no cabooses for sled3. + + // Finish the collection and verify the basics. + let collection = builder.build(); + let time_after = Utc::now(); + println!("{:#?}", collection); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "test_basic"); + + // Verify the one error that ought to have been produced for the SP with + // no RoT information. + assert_eq!( + collection.errors.iter().map(|e| e.to_string()).collect::>(), + ["MGS \"fake MGS 1\": reading RoT state for BaseboardId \ + { part_number: \"model1\", serial_number: \"s2\" }: test suite \ + injected error"] + ); + + // Verify the baseboard ids found. 
+ let expected_baseboards = + &[&sled1_bb, &sled2_bb, &sled3_bb, &switch1_bb, &psc_bb]; + for bb in expected_baseboards { + assert!(collection.baseboards.contains(*bb)); + } + assert_eq!(collection.baseboards.len(), expected_baseboards.len()); + + // Verify the stuff that's easy to verify for all SPs: timestamps. + assert_eq!(collection.sps.len(), collection.baseboards.len()); + for (bb, sp) in collection.sps.iter() { + assert!(collection.time_started <= sp.time_collected); + assert!(sp.time_collected <= collection.time_done); + + if let Some(rot) = collection.rots.get(bb) { + assert_eq!(rot.source, sp.source); + assert_eq!(rot.time_collected, sp.time_collected); + } + + for c in + [&sp.slot0_caboose, &sp.slot1_caboose].into_iter().flatten() + { + assert!(collection.time_started <= c.time_collected); + assert!(c.time_collected <= collection.time_done); + } + } + + // Verify the common caboose. + let common_caboose = Caboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }; + for bb in &common_caboose_baseboards { + let sp = collection.sps.get(*bb).unwrap(); + let c0 = sp.slot0_caboose.as_ref().unwrap(); + let c1 = sp.slot0_caboose.as_ref().unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + + let rot = collection.rots.get(*bb).unwrap(); + let c0 = rot.slot_a_caboose.as_ref().unwrap(); + let c1 = rot.slot_b_caboose.as_ref().unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + } + assert!(collection.cabooses.contains(&common_caboose)); + + // Verify the specific, different data for the healthy SPs and RoTs that + // we reported. 
+ // sled1 + let sp = collection.sps.get(&sled1_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 3); + assert_eq!(sp.baseboard_revision, 0); + assert_eq!(sp.hubris_archive, "hubris1"); + assert_eq!(sp.power_state, PowerState::A0); + let rot = collection.rots.get(&sled1_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::A); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest1" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest1" + ); + assert_eq!(rot.transient_boot_preference, None); + + // sled2 + let sp = collection.sps.get(&sled2_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 4); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris2"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&sled2_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, Some(RotSlot::A)); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest2" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest2" + ); + assert_eq!(rot.transient_boot_preference, Some(RotSlot::B)); + + // switch + let sp = collection.sps.get(&switch1_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Switch); + assert_eq!(sp.sp_slot, 0); + assert_eq!(sp.baseboard_revision, 2); + assert_eq!(sp.hubris_archive, "hubris3"); + assert_eq!(sp.power_state, PowerState::A1); + let rot = collection.rots.get(&switch1_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest3" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest3" + ); + assert_eq!(rot.transient_boot_preference, None); + + // PSC + let sp = collection.sps.get(&psc_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Power); + assert_eq!(sp.sp_slot, 1); + assert_eq!(sp.baseboard_revision, 3); + assert_eq!(sp.hubris_archive, "hubris4"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&psc_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest4" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest4" + ); + assert_eq!(rot.transient_boot_preference, None); + + // The PSC has four different cabooses! 
+ let c = &sp.slot0_caboose.as_ref().unwrap().caboose; + assert_eq!(c.board, "psc_sp_0"); + assert!(collection.cabooses.contains(c)); + let c = &sp.slot1_caboose.as_ref().unwrap().caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "psc_sp_1"); + let c = &rot.slot_a_caboose.as_ref().unwrap().caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "psc_rot_a"); + let c = &rot.slot_b_caboose.as_ref().unwrap().caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "psc_rot_b"); + + // Verify the reported SP state for sled3, which did not have a healthy + // RoT, nor any cabooses. + let sp = collection.sps.get(&sled3_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 5); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris5"); + assert_eq!(sp.power_state, PowerState::A2); + assert_eq!(sp.slot0_caboose, None); + assert_eq!(sp.slot1_caboose, None); + assert!(!collection.rots.contains_key(&sled3_bb)); + + // There shouldn't be any other RoTs. + assert_eq!(collection.sps.len(), collection.rots.len() + 1); + + // There should be five cabooses: the four used for the PSC (see above), + // plus the common one. + assert_eq!(collection.cabooses.len(), 5); + } + + // Exercises all the failure cases that shouldn't happen in real systems. + // Despite all of these failures, we should get a valid collection at the + // end. + #[test] + fn test_problems() { + let mut builder = CollectionBuilder::new("test_problems"); + + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // report the same SP again with the same contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report the same SP again with different contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report an SP with an impossible slot number + let sled2_sp = builder.found_sp_state( + "fake MGS 1", + SpType::Sled, + u32::from(u16::MAX) + 
1, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ); + assert_eq!(sled2_sp, None); + + // report SP caboose for an unknown baseboard + let bogus_baseboard = BaseboardId { + part_number: String::from("p1"), + serial_number: String::from("bogus"), + }; + let caboose1 = SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }; + assert!( + !builder.found_sp_caboose_already(&bogus_baseboard, SpSlot::Slot0) + ); + let error = builder + .found_sp_caboose( + &bogus_baseboard, + SpSlot::Slot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + error.to_string(), + "reporting caboose for unknown baseboard: \ + BaseboardId { part_number: \"p1\", serial_number: \"bogus\" } \ + (Caboose { board: \"board1\", git_commit: \"git_commit1\", \ + name: \"name1\", version: \"version1\" })" + ); + assert!( + !builder.found_sp_caboose_already(&bogus_baseboard, SpSlot::Slot0) + ); + + // report RoT caboose for an unknown baseboard + let error2 = builder + .found_rot_caboose( + &bogus_baseboard, + RotSlot::A, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!(error.to_string(), error2.to_string(),); + + // report the same caboose twice with the same contents + let _ = builder + .found_sp_caboose( + &sled1_bb, + SpSlot::Slot0, + "dummy", + caboose1.clone(), + ) + .unwrap(); + let error = builder + .found_sp_caboose( + &sled1_bb, + SpSlot::Slot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + format!("{:#}", error), + "baseboard BaseboardId { part_number: \"model1\", \ + serial_number: \"s1\" } SP caboose Slot0: reported multiple \ + times (same value)" + ); + // report the same caboose again with different contents + let error = builder + .found_sp_caboose( + &sled1_bb, + SpSlot::Slot0, + "dummy", + SpComponentCaboose { + board: String::from("board2"), + git_commit: String::from("git_commit2"), + name: String::from("name2"), + version: String::from("version2"), + }, + ) + .unwrap_err(); + let message = format!("{:#}", error); + println!("found error: {}", message); + assert!(message.contains( + "SP caboose Slot0: reported caboose multiple times (previously" + )); + assert!(message.contains(", now ")); + + // We should still get a valid collection. + let collection = builder.build(); + println!("{:#?}", collection); + assert_eq!(collection.collector, "test_problems"); + + // We should still have the one sled and its SP slot0 caboose. + assert!(collection.baseboards.contains(&sled1_bb)); + let sp = collection.sps.get(&sled1_bb).unwrap(); + let caboose = sp.slot0_caboose.as_ref().unwrap(); + assert_eq!(caboose.caboose.board, "board2"); + assert!(collection.cabooses.contains(&caboose.caboose)); + assert_eq!(sp.slot1_caboose, None); + let rot = collection.rots.get(&sled1_bb).unwrap(); + assert_eq!(rot.slot_a_caboose, None); + assert_eq!(rot.slot_b_caboose, None); + + // We should see an error. 
+ assert_eq!( + collection + .errors + .iter() + .map(|e| format!("{:#}", e)) + .collect::>(), + vec![ + "MGS \"fake MGS 1\": SP Sled slot 65536: \ + slot number did not fit into u16" + ] + ); + } +} From 3e79d6948b6ee28a1d6fdc5d54b551f0a02bd075 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 24 Oct 2023 21:11:50 -0700 Subject: [PATCH 08/20] rustfmt --- dev-tools/omdb/src/bin/omdb/nexus.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index e21d8630a8..128d4315f2 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -147,7 +147,8 @@ async fn cmd_nexus_background_tasks_show( client.bgtask_list().await.context("listing background tasks")?; // Convert the HashMap to a BTreeMap because we want the keys in sorted // order. - let mut tasks = response.into_inner().into_iter().collect::>(); + let mut tasks = + response.into_inner().into_iter().collect::>(); // We want to pick the order that we print some tasks intentionally. Then // we want to print anything else that we find. From 515fba1d29a6fd4d772615fb557fd885b8d9bed8 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 25 Oct 2023 10:38:58 -0700 Subject: [PATCH 09/20] add basic test for collector --- Cargo.lock | 3 + nexus/inventory/Cargo.toml | 5 + nexus/inventory/src/collector.rs | 207 ++++++++++++++++++ .../tests/output/collector_basic.txt | 25 +++ .../tests/output/collector_errors.txt | 26 +++ 5 files changed, 266 insertions(+) create mode 100644 nexus/inventory/tests/output/collector_basic.txt create mode 100644 nexus/inventory/tests/output/collector_errors.txt diff --git a/Cargo.lock b/Cargo.lock index 8f53bfac2d..aa324ed0de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4470,12 +4470,15 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "expectorate", "futures", "gateway-client", "gateway-messages", + "gateway-test-utils", "nexus-types", "slog", "strum", + "tokio", "uuid", ] diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml index 12208a3467..7fc3602596 100644 --- a/nexus/inventory/Cargo.toml +++ b/nexus/inventory/Cargo.toml @@ -14,3 +14,8 @@ nexus-types.workspace = true slog.workspace = true strum.workspace = true uuid.workspace = true + +[dev-dependencies] +expectorate.workspace = true +gateway-test-utils.workspace = true +tokio.workspace = true diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index eb9a396889..8781fdd655 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -246,3 +246,210 @@ impl Collector { } } } + +#[cfg(test)] +mod test { + use super::Collector; + use gateway_messages::SpPort; + use nexus_types::inventory::Collection; + use std::fmt::Write; + use std::sync::Arc; + + fn dump_collection(collection: &Collection) -> String { + // Construct a stable, human-readable summary of the Collection + // contents. We could use a `Debug` impl for this, but that's not quite + // right: when debugging, for example, we want fields like the ids, but + // these change each time and we don't want to include them here. + // `Serialize` has the same problem -- the set of fields to include + // depends on what the serialization is for. It's easy enough to just + // print what we want here. 
+ let mut s = String::new(); + write!(&mut s, "baseboards:\n").unwrap(); + for b in &collection.baseboards { + write!( + &mut s, + " part {:?} serial {:?}\n", + b.part_number, b.serial_number + ) + .unwrap(); + } + + write!(&mut s, "\ncabooses:\n").unwrap(); + for c in &collection.cabooses { + write!( + &mut s, + " board {:?} name {:?} version {:?} git_commit {:?}\n", + c.board, c.name, c.version, c.git_commit, + ) + .unwrap(); + } + + // All we really need to check here is that we're reporting the right + // SPs, RoTs, and cabooses. The actual SP data, RoT data, and caboose + // data comes straight from MGS. And proper handling of that data is + // tested in the builder. + write!(&mut s, "\nSPs:\n").unwrap(); + for (bb, sp) in &collection.sps { + write!( + &mut s, + " baseboard part {:?} serial {:?} slot0 \ + caboose {} slot1 caboose {}\n", + bb.part_number, + bb.serial_number, + sp.slot0_caboose + .as_ref() + .map(|c| format!( + "{:?}/{:?}", + c.caboose.board, c.caboose.git_commit + )) + .as_deref() + .unwrap_or("(none)"), + sp.slot1_caboose + .as_ref() + .map(|c| format!( + "{:?}/{:?}", + c.caboose.board, c.caboose.git_commit + )) + .as_deref() + .unwrap_or("(none)"), + ) + .unwrap(); + } + + write!(&mut s, "\nRoTs:\n").unwrap(); + for (bb, rot) in &collection.rots { + write!( + &mut s, + " baseboard part {:?} serial {:?} slot A \ + caboose {} slot B caboose {}\n", + bb.part_number, + bb.serial_number, + rot.slot_a_caboose + .as_ref() + .map(|c| format!( + "{:?}/{:?}", + c.caboose.board, c.caboose.git_commit + )) + .as_deref() + .unwrap_or("(none)"), + rot.slot_b_caboose + .as_ref() + .map(|c| format!( + "{:?}/{:?}", + c.caboose.board, c.caboose.git_commit + )) + .as_deref() + .unwrap_or("(none)"), + ) + .unwrap(); + } + + write!(&mut s, "\nerrors:\n").unwrap(); + for e in &collection.errors { + write!(&mut s, "error: {:#}\n", e).unwrap(); + } + + s + } + + #[tokio::test] + async fn test_basic() { + // Set up the stock MGS test setup which includes a couple of fake SPs. + // Then run a collection against it. + let gwtestctx = + gateway_test_utils::setup::test_setup("test_basic", SpPort::One) + .await; + let log = &gwtestctx.logctx.log; + let mgs_url = format!("http://{}/", gwtestctx.client.bind_address); + let mgs_client = + Arc::new(gateway_client::Client::new(&mgs_url, log.clone())); + let collector = + Collector::new("test-suite", &[mgs_client], log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert!(collection.errors.is_empty()); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_basic.txt", &s); + + gwtestctx.teardown().await; + } + + #[tokio::test] + async fn test_multi_mgs() { + // This is the same as the basic test, but we set up two different MGS + // instances and point the collector at both. We should get the same + // result. 
+ let gwtestctx1 = gateway_test_utils::setup::test_setup( + "test_multi_mgs_1", + SpPort::One, + ) + .await; + let gwtestctx2 = gateway_test_utils::setup::test_setup( + "test_multi_mgs_2", + SpPort::Two, + ) + .await; + let log = &gwtestctx1.logctx.log; + let mgs_clients = [&gwtestctx1, &gwtestctx2] + .into_iter() + .map(|g| { + let url = format!("http://{}/", g.client.bind_address); + let client = gateway_client::Client::new(&url, log.clone()); + Arc::new(client) + }) + .collect::>(); + let collector = Collector::new("test-suite", &mgs_clients, log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert!(collection.errors.is_empty()); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_basic.txt", &s); + + gwtestctx1.teardown().await; + gwtestctx2.teardown().await; + } + + #[tokio::test] + async fn test_multi_mgs_failure() { + // This is similar to the multi-MGS test, but we don't actually set up + // the second MGS. To the collector, it should look offline or + // otherwise non-functional. + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_multi_mgs_2", + SpPort::Two, + ) + .await; + let log = &gwtestctx.logctx.log; + let real_client = { + let url = format!("http://{}/", gwtestctx.client.bind_address); + let client = gateway_client::Client::new(&url, log.clone()); + Arc::new(client) + }; + let bad_client = { + // This IP range is guaranteed by RFC 6666 to discard traffic. + let url = format!("http://[100::1]:12345"); + let client = gateway_client::Client::new(&url, log.clone()); + Arc::new(client) + }; + let mgs_clients = &[bad_client, real_client]; + let collector = Collector::new("test-suite", mgs_clients, log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_errors.txt", &s); + + gwtestctx.teardown().await; + } +} diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt new file mode 100644 index 0000000000..630712a4e8 --- /dev/null +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -0,0 +1,25 @@ +baseboards: + part "FAKE_SIM_GIMLET" serial "SimGimlet00" + part "FAKE_SIM_GIMLET" serial "SimGimlet01" + part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses: + board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" + board "SimSidecarRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" + board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" + +SPs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" + +RoTs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot A caboose 
"SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" + +errors: diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt new file mode 100644 index 0000000000..e89fce075f --- /dev/null +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -0,0 +1,26 @@ +baseboards: + part "FAKE_SIM_GIMLET" serial "SimGimlet00" + part "FAKE_SIM_GIMLET" serial "SimGimlet01" + part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses: + board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" + board "SimSidecarRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" + board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" + +SPs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" + +RoTs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" + +errors: +error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error 128): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error 128): error trying to connect: tcp connect error: Network is unreachable (os error 128): tcp connect error: Network is unreachable (os error 128): Network is unreachable (os error 128) From f355bd3cdc4ad7d38f6f70623756be529c6b0550 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 25 Oct 2023 11:11:13 -0700 Subject: [PATCH 10/20] typo --- nexus/db-queries/src/db/datastore/inventory.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 664a06cd8e..1bfca3d41d 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -684,7 +684,7 @@ impl DataStore { .await?; }; - // Remove rows for service 
processors. + // Remove rows for roots of trust. let nrots = { use db::schema::inv_root_of_trust::dsl; diesel::delete( From 03abf62bf3fb531574b1a39fa0fd275cae7a90de Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 25 Oct 2023 11:26:27 -0700 Subject: [PATCH 11/20] Revert "replace `CabooseWhich` with optional fields in SP, RoT tables" This reverts commit 58c010f5f238f6ae5f17b93c16e045d0a4843aba. --- Cargo.lock | 1 - clients/gateway-client/Cargo.toml | 1 - clients/gateway-client/src/lib.rs | 2 +- dev-tools/omdb/src/bin/omdb/db.rs | 141 ++- nexus/db-model/src/inventory.rs | 45 +- nexus/db-model/src/schema.rs | 11 +- .../db-queries/src/db/datastore/inventory.rs | 371 ++++++-- nexus/inventory/src/builder.rs | 891 +----------------- nexus/inventory/src/collector.rs | 138 +-- .../tests/output/collector_basic.txt | 34 +- .../tests/output/collector_errors.txt | 34 +- nexus/types/src/inventory.rs | 22 +- schema/crdb/dbinit.sql | 39 +- 13 files changed, 543 insertions(+), 1187 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa324ed0de..43c551d9c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2783,7 +2783,6 @@ dependencies = [ "serde", "serde_json", "slog", - "strum", "uuid", ] diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index 7458453660..fc33174107 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -14,6 +14,5 @@ serde.workspace = true serde_json.workspace = true schemars.workspace = true slog.workspace = true -strum.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index 27a45ba5ab..b071d34975 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -54,7 +54,7 @@ progenitor::generate_api!( SpState = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, RotState = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, RotImageDetails = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, - RotSlot = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, strum::EnumIter ] }, + RotSlot = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, ImageVersion = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, HostPhase2RecoveryImageId = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize] }, }, diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 471ec01cb3..4546a6e543 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -12,7 +12,7 @@ //! would be the only consumer -- and in that case it's okay to query the //! database directly. -// NOTE: emanates from Tabled macros +// NOTE: eminates from Tabled macros #![allow(clippy::useless_vec)] use crate::Omdb; @@ -30,6 +30,7 @@ use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; use diesel::JoinOnDsl; use diesel::NullableExpressionMethods; +use nexus_db_model::CabooseWhich; use nexus_db_model::Dataset; use nexus_db_model::Disk; use nexus_db_model::DnsGroup; @@ -1790,7 +1791,7 @@ async fn inv_collection_print_devices( rots.into_iter().map(|s| (s.hw_baseboard_id, s)).collect() }; - // Load cabooses found, grouped by id. + // Load cabooses found, grouped by baseboard id. 
let inv_cabooses = { use db::schema::inv_caboose::dsl; let cabooses_found = dsl::inv_caboose @@ -1801,7 +1802,15 @@ async fn inv_collection_print_devices( .await .context("loading cabooses found")?; check_limit(&cabooses_found, limit, || "loading cabooses found"); - cabooses_found.into_iter().map(|c| (c.id, c)).collect() + + let mut cabooses: BTreeMap> = BTreeMap::new(); + for ic in cabooses_found { + cabooses + .entry(ic.hw_baseboard_id) + .or_insert_with(Vec::new) + .push(ic); + } + cabooses }; // Assemble a list of baseboard ids, sorted first by device type (sled, @@ -1851,11 +1860,61 @@ async fn inv_collection_print_devices( println!(""); println!(" found at: {} from {}", sp.time_collected, sp.source); - let sp_cabooses = &[ - ("SP slot 0", sp.slot0_inv_caboose_id), - ("SP slot 1", sp.slot1_inv_caboose_id), - ]; - inv_collection_print_cabooses(sp_cabooses, &inv_cabooses, &sw_cabooses); + println!(" cabooses:"); + if let Some(my_inv_cabooses) = inv_cabooses.get(baseboard_id) { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CabooseRow<'a> { + slot: &'static str, + board: &'a str, + name: &'a str, + version: &'a str, + git_commit: &'a str, + } + let mut nbugs = 0; + let rows = my_inv_cabooses.iter().map(|ic| { + let slot = match ic.which { + CabooseWhich::SpSlot0 => " SP slot 0", + CabooseWhich::SpSlot1 => " SP slot 1", + CabooseWhich::RotSlotA => "RoT slot A", + CabooseWhich::RotSlotB => "RoT slot B", + }; + + let (board, name, version, git_commit) = + match sw_cabooses.get(&ic.sw_caboose_id) { + None => { + nbugs += 1; + ("-", "-", "-", "-") + } + Some(c) => ( + c.board.as_str(), + c.name.as_str(), + c.version.as_str(), + c.git_commit.as_str(), + ), + }; + + CabooseRow { slot, board, name, version, git_commit } + }); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", textwrap::indent(&table.to_string(), " ")); + + if nbugs > 0 { + // Similar to above, if we don't have the sw_caboose for some + // inv_caboose, then it's a bug in either this tool (if we + // failed to fetch it) or the inventory system (if it failed to + // insert it). 
+ println!( + "error: at least one caboose above was missing data \ + -- this is a bug" + ); + } + } if let Some(rot) = rot { println!(" RoT: active slot: slot {:?}", rot.slot_active); @@ -1889,16 +1948,6 @@ async fn inv_collection_print_devices( .clone() .unwrap_or_else(|| String::from("-")) ); - - let rot_cabooses = &[ - ("RoT slot 0", rot.slot_a_inv_caboose_id), - ("RoT slot 1", rot.slot_b_inv_caboose_id), - ]; - inv_collection_print_cabooses( - rot_cabooses, - &inv_cabooses, - &sw_cabooses, - ); } else { println!(" RoT: no information found"); } @@ -1953,59 +2002,3 @@ async fn inv_collection_print_devices( Ok(()) } - -fn inv_collection_print_cabooses( - component_cabooses: &[(&'static str, Option)], - inv_cabooses: &BTreeMap, - sw_cabooses: &BTreeMap, -) { - println!(" cabooses:"); - #[derive(Tabled)] - #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct CabooseRow<'a> { - slot: &'static str, - board: &'a str, - name: &'a str, - version: &'a str, - git_commit: &'a str, - } - let mut nbugs = 0; - - let rows = component_cabooses.iter().map(|(slot, inv_caboose_id)| { - let sw_caboose = inv_caboose_id - .and_then(|inv_caboose_id| inv_cabooses.get(&inv_caboose_id)) - .and_then(|inv_caboose| { - sw_cabooses.get(&inv_caboose.sw_caboose_id) - }); - let (board, name, version, git_commit) = match sw_caboose { - None => { - nbugs += 1; - ("-", "-", "-", "-") - } - Some(c) => ( - c.board.as_str(), - c.name.as_str(), - c.version.as_str(), - c.git_commit.as_str(), - ), - }; - CabooseRow { slot, board, name, version, git_commit } - }); - - let table = tabled::Table::new(rows) - .with(tabled::settings::Style::empty()) - .with(tabled::settings::Padding::new(0, 1, 0, 0)) - .to_string(); - - println!("{}", textwrap::indent(&table.to_string(), " ")); - - if nbugs > 0 { - // If we don't have the sw_caboose for some inv_caboose, then - // it's a bug in either this tool (if we failed to fetch it) or - // the inventory system (if it failed to insert it). - println!( - "error: at least one caboose above was missing data \ - -- this is a bug" - ); - } -} diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 1fbdce570b..e9c1ee1f98 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -72,6 +72,42 @@ impl From for HwRotSlot { } } +// See [`nexus_types::inventory::CabooseWhich`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "caboose_which"))] + pub struct CabooseWhichEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = CabooseWhichEnum)] + pub enum CabooseWhich; + + // Enum values + SpSlot0 => b"sp_slot_0" + SpSlot1 => b"sp_slot_1" + RotSlotA => b"rot_slot_A" + RotSlotB => b"rot_slot_B" +); + +impl From for CabooseWhich { + fn from(c: nexus_types::inventory::CabooseWhich) -> Self { + match c { + nexus_types::inventory::CabooseWhich::SpSlot0 => { + CabooseWhich::SpSlot0 + } + nexus_types::inventory::CabooseWhich::SpSlot1 => { + CabooseWhich::SpSlot1 + } + nexus_types::inventory::CabooseWhich::RotSlotA => { + CabooseWhich::RotSlotA + } + nexus_types::inventory::CabooseWhich::RotSlotB => { + CabooseWhich::RotSlotB + } + } + } +} + // See [`nexus_types::inventory::SpType`]. 
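Stepping back to the `CabooseWhich` conversion restored just above: that `From` impl is how the in-memory enum from `nexus_types` gets translated into the database representation before insertion; the datastore change later in this patch uses it as `nexus_db_model::CabooseWhich::from(*which)`. A trivial sketch of the mapping, with an arbitrarily chosen variant:

    // Convert the nexus-types variant into its database-model counterpart.
    let which = nexus_types::inventory::CabooseWhich::SpSlot0;
    let db_which = nexus_db_model::CabooseWhich::from(which);
    assert_eq!(db_which, nexus_db_model::CabooseWhich::SpSlot0);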
impl_enum_type!( #[derive(SqlType, Debug, QueryId)] @@ -215,9 +251,6 @@ pub struct InvServiceProcessor { pub baseboard_revision: BaseboardRevision, pub hubris_archive_id: String, pub power_state: HwPowerState, - - pub slot0_inv_caboose_id: Option, - pub slot1_inv_caboose_id: Option, } /// Newtype wrapping the MGS-reported slot number for an SP @@ -305,9 +338,6 @@ pub struct InvRootOfTrust { pub slot_boot_pref_persistent_pending: Option, pub slot_a_sha3_256: Option, pub slot_b_sha3_256: Option, - - pub slot_a_inv_caboose_id: Option, - pub slot_b_inv_caboose_id: Option, } /// See [`nexus_types::inventory::CabooseFound`]. @@ -315,9 +345,10 @@ pub struct InvRootOfTrust { #[diesel(table_name = inv_caboose)] pub struct InvCaboose { pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, pub time_collected: DateTime, pub source: String, - pub id: Uuid, + pub which: CabooseWhich, pub sw_caboose_id: Uuid, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 13e104251b..0b41733e6d 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1177,9 +1177,6 @@ table! { baseboard_revision -> Int8, hubris_archive_id -> Text, power_state -> crate::HwPowerStateEnum, - - slot0_inv_caboose_id -> Nullable, - slot1_inv_caboose_id -> Nullable, } } @@ -1196,19 +1193,17 @@ table! { slot_boot_pref_persistent_pending -> Nullable, slot_a_sha3_256 -> Nullable, slot_b_sha3_256 -> Nullable, - - slot_a_inv_caboose_id -> Nullable, - slot_b_inv_caboose_id -> Nullable, } } table! { - inv_caboose (id) { - id -> Uuid, + inv_caboose (inv_collection_id, hw_baseboard_id, which) { inv_collection_id -> Uuid, + hw_baseboard_id -> Uuid, time_collected -> Timestamptz, source -> Text, + which -> crate::CabooseWhichEnum, sw_caboose_id -> Uuid, } } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 1bfca3d41d..3b0df6b519 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -11,12 +11,19 @@ use crate::db::error::ErrorHandler; use crate::db::TransactionError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::DateTime; +use chrono::Utc; +use diesel::sql_types; use diesel::sql_types::Nullable; +use diesel::Column; use diesel::ExpressionMethods; use diesel::IntoSql; use diesel::NullableExpressionMethods; use diesel::QueryDsl; +use diesel::QuerySource; use diesel::Table; +use nexus_db_model::CabooseWhich; +use nexus_db_model::CabooseWhichEnum; use nexus_db_model::HwBaseboardId; use nexus_db_model::HwPowerState; use nexus_db_model::HwPowerStateEnum; @@ -27,6 +34,8 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::SpType; use nexus_db_model::SpTypeEnum; use nexus_db_model::SwCaboose; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::CabooseFound; use nexus_types::inventory::Collection; use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; @@ -59,15 +68,6 @@ impl DataStore { .iter() .map(|s| SwCaboose::from(s.as_ref())) .collect::>(); - let cabooses_found: Vec<_> = collection - .sps - .iter() - .flat_map(|(_, sp)| [&sp.slot0_caboose, &sp.slot1_caboose]) - .chain(collection.rots.iter().flat_map(|(_, rot)| { - [&rot.slot_a_caboose, &rot.slot_b_caboose] - })) - .flatten() - .collect(); let error_values = collection .errors .iter() @@ -178,18 +178,6 @@ impl DataStore { .into_sql::(), HwPowerState::from(sp.power_state) .into_sql::(), - sp.slot0_caboose - 
.as_ref() - .map(|c| c.id) - .into_sql::>(), - sp.slot1_caboose - .as_ref() - .map(|c| c.id) - .into_sql::>(), )) .filter( baseboard_dsl::part_number @@ -214,8 +202,6 @@ impl DataStore { sp_dsl::baseboard_revision, sp_dsl::hubris_archive_id, sp_dsl::power_state, - sp_dsl::slot0_inv_caboose_id, - sp_dsl::slot1_inv_caboose_id, )) .execute_async(&conn) .await?; @@ -245,8 +231,6 @@ impl DataStore { _baseboard_revision, _hubris_archive_id, _power_state, - _slot0_inv_caboose_id, - _slot1_inv_caboose_id, ) = sp_dsl::inv_service_processor::all_columns(); } } @@ -285,18 +269,6 @@ impl DataStore { .clone() .into_sql::>( ), - rot.slot_a_caboose - .as_ref() - .map(|c| c.id) - .into_sql::>(), - rot.slot_b_caboose - .as_ref() - .map(|c| c.id) - .into_sql::>(), )) .filter( baseboard_dsl::part_number @@ -322,8 +294,6 @@ impl DataStore { rot_dsl::slot_boot_pref_transient, rot_dsl::slot_a_sha3_256, rot_dsl::slot_b_sha3_256, - rot_dsl::slot_a_inv_caboose_id, - rot_dsl::slot_b_inv_caboose_id, )) .execute_async(&conn) .await?; @@ -342,71 +312,26 @@ impl DataStore { _slot_boot_pref_transient, _slot_a_sha3_256, _slot_b_sha3_256, - _slot_a_inv_caboose_id, - _slot_b_inv_caboose_id, ) = rot_dsl::inv_root_of_trust::all_columns(); } } - // Insert records for cabooses found. Like the others, we do this - // using INSERT INTO ... SELECT because we need ids from the - // `sw_caboose` table that we may not have. - { - use db::schema::inv_caboose::dsl as inv_dsl; - use db::schema::sw_caboose::dsl as sw_dsl; - - for caboose_found in &cabooses_found { - let selection = db::schema::sw_caboose::table - .select(( - caboose_found - .id - .into_sql::(), - collection_id.into_sql::(), - caboose_found - .time_collected - .into_sql::(), - caboose_found - .source - .clone() - .into_sql::(), - sw_dsl::id, - )) - .filter( - sw_dsl::board - .eq(caboose_found.caboose.board.clone()), - ) - .filter( - sw_dsl::git_commit - .eq(caboose_found.caboose.git_commit.clone()), - ) - .filter( - sw_dsl::name.eq(caboose_found.caboose.name.clone()), - ) - .filter( - sw_dsl::version - .eq(caboose_found.caboose.version.clone()), - ); - - let _ = diesel::insert_into(db::schema::inv_caboose::table) - .values(selection) - .into_columns(( - inv_dsl::id, - inv_dsl::inv_collection_id, - inv_dsl::time_collected, - inv_dsl::source, - inv_dsl::sw_caboose_id, - )) - .execute_async(&conn) - .await?; + // Insert rows for the cabooses that we found. Like service + // processors and roots of trust, we do this using INSERT INTO ... + // SELECT. But because there are two foreign keys, we need a more + // complicated `SELECT`, which requires using a CTE. + for (which, tree) in &collection.cabooses_found { + let db_which = nexus_db_model::CabooseWhich::from(*which); + for (baseboard_id, found_caboose) in tree { + InvCabooseInsert::new( + collection_id, + baseboard_id, + found_caboose, + db_which, + ) + .execute_async(&conn) + .await?; } - - let ( - _id, - _inv_collection_id, - _time_collected, - _source, - _sw_caboose_id, - ) = inv_dsl::inv_caboose::all_columns(); } // Finally, insert the list of errors. 
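The code that actually inserts the error rows mentioned in the trailing comment above sits outside this hunk, so it is not shown in the diff. As a rough, assumed sketch only (the `inv_collection_error` table path and the exact shape of `error_values` are not visible here), it would follow the same insert pattern used for the other tables in this function:

    // Hypothetical sketch: batch-insert the accumulated error records for
    // this collection using the same async Diesel helpers as above.
    // The table path `inv_collection_error` is an assumption, not taken
    // from this diff.
    let _ = diesel::insert_into(db::schema::inv_collection_error::table)
        .values(error_values)
        .execute_async(&conn)
        .await?;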
@@ -739,3 +664,251 @@ impl DataStore { Ok(()) } } + +/// A SQL common table expression (CTE) used to insert into `inv_caboose` +/// +/// Concretely, we have these three tables: +/// +/// - `hw_baseboard` with an "id" primary key and lookup columns "part_number" +/// and "serial_number" +/// - `sw_caboose` with an "id" primary key and lookup columns "board", +/// "git_commit", "name", and "version" +/// - `inv_caboose` with foreign keys "hw_baseboard_id", "sw_caboose_id", and +/// various other columns +/// +/// We want to INSERT INTO `inv_caboose` a row with: +/// +/// - hw_baseboard_id (foreign key) the result of looking up an hw_baseboard row +/// by part number and serial number provided by the caller +/// +/// - sw_caboose_id (foreign key) the result of looking up a sw_caboose row by +/// board, git_commit, name, and version provided by the caller +/// +/// - the other columns being literals provided by the caller +/// +/// To achieve this, we're going to generate something like: +/// +/// WITH +/// my_new_row +/// AS ( +/// SELECT +/// hw_baseboard.id, /* `hw_baseboard` foreign key */ +/// sw_caboose.id, /* `sw_caboose` foreign key */ +/// ... /* caller-provided literal values for the rest */ +/// /* of the new inv_caboose row */ +/// FROM +/// hw_baseboard, +/// sw_caboose +/// WHERE +/// hw_baseboard.part_number = ... /* caller-provided part number */ +/// hw_baseboard.serial_number = ... /* caller-provided serial number */ +/// sw_caboose.board = ... /* caller-provided board */ +/// sw_caboose.git_commit = ... /* caller-provided git_commit */ +/// sw_caboose.name = ... /* caller-provided name */ +/// sw_caboose.version = ... /* caller-provided version */ +/// ) INSERT INTO +/// inv_caboose (... /* inv_caboose columns */) +/// SELECT * from my_new_row; +/// +/// The whole point is to avoid back-and-forth between the client and the +/// database. Those back-and-forth interactions can significantly increase +/// latency and the probability of transaction conflicts. See RFD 192 for +/// details. +#[must_use = "Queries must be executed"] +struct InvCabooseInsert { + // fields used to look up baseboard id + baseboard_part_number: String, + baseboard_serial_number: String, + + // fields used to look up caboose id + caboose_board: String, + caboose_git_commit: String, + caboose_name: String, + caboose_version: String, + + // literal values for the rest of the inv_caboose columns + collection_id: Uuid, + time_collected: DateTime, + source: String, + which: CabooseWhich, + + // These are Diesel structures representing table names in the "from" or + // "into" parts of queries (e.g., "SELECT FROM tablename" or "INSERT INTO + // tablename"). We need this in `walk_ast()` below, but they must outlive + // `walk_ast()`, so they need to be created ahead of time. + // + // TODO-cleanup These Diesel-internal types are nasty. It's not clear how + // else to do this. 
+ from_hw_baseboard_id: + diesel::internal::table_macro::StaticQueryFragmentInstance< + db::schema::hw_baseboard_id::table, + >, + from_sw_caboose: diesel::internal::table_macro::StaticQueryFragmentInstance< + db::schema::sw_caboose::table, + >, + into_inv_caboose: + diesel::internal::table_macro::StaticQueryFragmentInstance< + db::schema::inv_caboose::table, + >, +} + +impl InvCabooseInsert { + pub fn new( + collection_id: Uuid, + baseboard: &BaseboardId, + found_caboose: &CabooseFound, + which: CabooseWhich, + ) -> InvCabooseInsert { + InvCabooseInsert { + baseboard_part_number: baseboard.part_number.clone(), + baseboard_serial_number: baseboard.serial_number.clone(), + caboose_board: found_caboose.caboose.board.clone(), + caboose_git_commit: found_caboose.caboose.git_commit.clone(), + caboose_name: found_caboose.caboose.name.clone(), + caboose_version: found_caboose.caboose.version.clone(), + collection_id, + time_collected: found_caboose.time_collected, + source: found_caboose.source.clone(), + which, + from_hw_baseboard_id: db::schema::hw_baseboard_id::table + .from_clause(), + from_sw_caboose: db::schema::sw_caboose::table.from_clause(), + // It sounds a little goofy to use "from_clause()" when this is + // really part of an INSERT. But really this just produces the + // table name as an identifier. This is the same for both "FROM" + // and "INSERT" clauses. And diesel internally does the same thing + // here (see the type of `InsertStatement::into_clause`). + into_inv_caboose: db::schema::inv_caboose::table.from_clause(), + } + } +} + +impl diesel::query_builder::QueryFragment for InvCabooseInsert { + fn walk_ast<'b>( + &'b self, + mut pass: diesel::query_builder::AstPass<'_, 'b, diesel::pg::Pg>, + ) -> diesel::QueryResult<()> { + use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id; + use db::schema::inv_caboose::dsl as dsl_inv_caboose; + use db::schema::sw_caboose::dsl as dsl_sw_caboose; + + pass.unsafe_to_cache_prepared(); + pass.push_sql("WITH my_new_row AS ("); + + pass.push_sql("SELECT "); + + // Emit the values that we're going to insert into `inv_caboose`. + // First, emit the looked-up foreign keys. + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_baseboard_id::id::NAME)?; + pass.push_sql(", "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::id::NAME)?; + pass.push_sql(", "); + // Next, emit the literal values used for the rest of the columns. + pass.push_bind_param::(&self.collection_id)?; + pass.push_sql(", "); + pass.push_bind_param::( + &self.time_collected, + )?; + pass.push_sql(", "); + pass.push_bind_param::(&self.source)?; + pass.push_sql(", "); + pass.push_bind_param::(&self.which)?; + + // Finish the SELECT by adding the list of tables and the WHERE to pick + // out only the relevant row from each tables. 
+ pass.push_sql(" FROM "); + + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql(", "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + + pass.push_sql(" WHERE "); + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_baseboard_id::part_number::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::( + &self.baseboard_part_number, + )?; + pass.push_sql(" AND "); + self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_baseboard_id::serial_number::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::( + &self.baseboard_serial_number, + )?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::board::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_board)?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::git_commit::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_git_commit)?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::name::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_name)?; + pass.push_sql(" AND "); + self.from_sw_caboose.walk_ast(pass.reborrow())?; + pass.push_sql("."); + pass.push_identifier(dsl_sw_caboose::version::NAME)?; + pass.push_sql(" = "); + pass.push_bind_param::(&self.caboose_version)?; + + pass.push_sql(")\n"); // end of the SELECT query within the WITH + + pass.push_sql("INSERT INTO "); + self.into_inv_caboose.walk_ast(pass.reborrow())?; + + pass.push_sql("("); + pass.push_identifier(dsl_inv_caboose::hw_baseboard_id::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::sw_caboose_id::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::inv_collection_id::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::time_collected::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::source::NAME)?; + pass.push_sql(", "); + pass.push_identifier(dsl_inv_caboose::which::NAME)?; + pass.push_sql(")\n"); + pass.push_sql("SELECT * FROM my_new_row"); + + // See the comment in inventory_insert_collection() where we use + // `inv_service_processor::all_columns()`. The same applies here. + // If you update the statement below because the schema for + // `inv_caboose` has changed, be sure to update the code above, too! + let ( + _hw_baseboard_id, + _sw_caboose_id, + _inv_collection_id, + _time_collected, + _source, + _which, + ) = dsl_inv_caboose::inv_caboose::all_columns(); + + Ok(()) + } +} + +// This is required to be able to call `inv_caboose_insert.execute_async()`. +impl diesel::RunQueryDsl for InvCabooseInsert {} + +// This is required to be able to call `inv_caboose_insert.execute_async()`. +impl diesel::query_builder::QueryId for InvCabooseInsert { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 1f25a3f519..85c1122c29 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -9,32 +9,23 @@ //! collected. 
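Before the detailed builder diff, a minimal sketch of the bookkeeping the reworked builder relies on: the four per-slot `Option<CabooseFound>` fields are replaced by one two-level map keyed by `CabooseWhich` and then by baseboard. The free function below is illustrative only, assuming the `nexus_types::inventory` types introduced in this patch:

    use std::collections::BTreeMap;
    use std::sync::Arc;

    use nexus_types::inventory::{BaseboardId, CabooseFound, CabooseWhich};

    // Sketch: with a single two-level map, "did we already record this
    // caboose?" is the same lookup for SP and RoT cabooses alike.
    fn caboose_already_found(
        cabooses_found: &BTreeMap<
            CabooseWhich,
            BTreeMap<Arc<BaseboardId>, CabooseFound>,
        >,
        which: CabooseWhich,
        baseboard: &BaseboardId,
    ) -> bool {
        cabooses_found
            .get(&which)
            .map(|by_baseboard| by_baseboard.contains_key(baseboard))
            .unwrap_or(false)
    }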
use anyhow::anyhow; -use anyhow::Context; use chrono::DateTime; use chrono::Utc; -use gateway_client::types::RotSlot; use gateway_client::types::SpComponentCaboose; use gateway_client::types::SpState; use gateway_client::types::SpType; use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Caboose; use nexus_types::inventory::CabooseFound; +use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::Collection; use nexus_types::inventory::RotState; use nexus_types::inventory::ServiceProcessor; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::sync::Arc; -use strum::EnumIter; use uuid::Uuid; -/// Identifies one of a service processor's two firmware slots -#[derive(Debug, Clone, Copy, EnumIter)] -pub enum SpSlot { - Slot0, - Slot1, -} - /// Build an inventory [`Collection`] /// /// This interface is oriented around the interfaces used by an actual @@ -50,6 +41,8 @@ pub struct CollectionBuilder { cabooses: BTreeSet>, sps: BTreeMap, ServiceProcessor>, rots: BTreeMap, RotState>, + cabooses_found: + BTreeMap, CabooseFound>>, } impl CollectionBuilder { @@ -67,6 +60,7 @@ impl CollectionBuilder { cabooses: BTreeSet::new(), sps: BTreeMap::new(), rots: BTreeMap::new(), + cabooses_found: BTreeMap::new(), } } @@ -82,6 +76,7 @@ impl CollectionBuilder { cabooses: self.cabooses, sps: self.sps, rots: self.rots, + cabooses_found: self.cabooses_found, } } @@ -139,9 +134,6 @@ impl CollectionBuilder { baseboard_revision: sp_state.revision, hubris_archive: sp_state.hubris_archive_id, power_state: sp_state.power_state, - - slot0_caboose: None, - slot1_caboose: None, } }); @@ -165,8 +157,6 @@ impl CollectionBuilder { transient_boot_preference, slot_a_sha3_256_digest, slot_b_sha3_256_digest, - slot_a_caboose: None, - slot_b_caboose: None, } }); } @@ -185,47 +175,19 @@ impl CollectionBuilder { Some(baseboard) } - /// Returns true if we already found the SP caboose for slot `slot` for - /// baseboard `baseboard` + /// Returns true if we already found the caboose for `which` for baseboard + /// `baseboard` /// /// This is used to avoid requesting it multiple times (from multiple MGS /// instances). - pub fn found_sp_caboose_already( + pub fn sp_found_caboose_already( &self, baseboard: &BaseboardId, - slot: SpSlot, + which: CabooseWhich, ) -> bool { - self.sps - .get(baseboard) - .map(|sp| { - let sp_slot = match slot { - SpSlot::Slot0 => &sp.slot0_caboose, - SpSlot::Slot1 => &sp.slot1_caboose, - }; - sp_slot.is_some() - }) - .unwrap_or(false) - } - - /// Returns true if we already found the RoT caboose for slot `slot` for - /// baseboard `baseboard` - /// - /// This is used to avoid requesting it multiple times (from multiple MGS - /// instances). - pub fn found_rot_caboose_already( - &self, - baseboard: &BaseboardId, - slot: RotSlot, - ) -> bool { - self.rots - .get(baseboard) - .map(|rot| { - let rot_slot = match slot { - RotSlot::A => &rot.slot_a_caboose, - RotSlot::B => &rot.slot_b_caboose, - }; - rot_slot.is_some() - }) + self.cabooses_found + .get(&which) + .map(|map| map.contains_key(baseboard)) .unwrap_or(false) } @@ -239,44 +201,7 @@ impl CollectionBuilder { pub fn found_sp_caboose( &mut self, baseboard: &BaseboardId, - slot: SpSlot, - source: &str, - caboose: SpComponentCaboose, - ) -> Result<(), anyhow::Error> { - // Normalize the caboose contents: i.e., if we've seen this exact caboose - // contents before, use the same record from before. Otherwise, make a - // new one. 
- let sw_caboose = - Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); - - // Find the SP. - let sp = self.sps.get_mut(baseboard).ok_or_else(|| { - anyhow!( - "reporting caboose for unknown baseboard: {:?} ({:?})", - baseboard, - sw_caboose - ) - })?; - let sp_slot = match slot { - SpSlot::Slot0 => &mut sp.slot0_caboose, - SpSlot::Slot1 => &mut sp.slot1_caboose, - }; - Self::record_caboose(sp_slot, source, sw_caboose) - .context(format!("baseboard {:?} SP caboose {:?}", baseboard, slot)) - } - - /// Record the given root of trust caboose information found for the given - /// baseboard - /// - /// The baseboard must previously have been reported using - /// `found_sp_state()`. - /// - /// `source` is an arbitrary string for debugging that describes the MGS - /// that reported this data (generally a URL string). - pub fn found_rot_caboose( - &mut self, - baseboard: &BaseboardId, - slot: RotSlot, + which: CabooseWhich, source: &str, caboose: SpComponentCaboose, ) -> Result<(), anyhow::Error> { @@ -285,50 +210,40 @@ impl CollectionBuilder { // new one. let sw_caboose = Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); - - // Find the RoT state. Note that it's possible that we _do_ have - // caboose information for an RoT that we have no information about - // because the SP couldn't talk to the RoT when we asked for its state, - // but was able to do so when we got the caboose. This seems unlikely. - let rot = self.rots.get_mut(baseboard).ok_or_else(|| { - anyhow!( - "reporting caboose for unknown baseboard: {:?} ({:?})", - baseboard, - sw_caboose - ) - })?; - let rot_slot = match slot { - RotSlot::A => &mut rot.slot_a_caboose, - RotSlot::B => &mut rot.slot_b_caboose, - }; - Self::record_caboose(rot_slot, source, sw_caboose).context(format!( - "baseboard {:?} RoT caboose {:?}", - baseboard, slot - )) - } - - fn record_caboose( - slot: &mut Option>, - source: &str, - sw_caboose: Arc, - ) -> Result<(), anyhow::Error> { - let old = slot.replace(Arc::new(CabooseFound { - id: Uuid::new_v4(), - time_collected: Utc::now(), - source: source.to_owned(), - caboose: sw_caboose.clone(), - })); - match old { - None => Ok(()), - Some(previous) if *previous.caboose == *sw_caboose => { - Err(anyhow!("reported multiple times (same value)")) - } - Some(previous) => Err(anyhow!( - "reported caboose multiple times (previously {:?}, \ + let (baseboard, _) = + self.sps.get_key_value(baseboard).ok_or_else(|| { + anyhow!( + "reporting caboose for unknown baseboard: {:?} ({:?})", + baseboard, + sw_caboose + ) + })?; + let by_id = + self.cabooses_found.entry(which).or_insert_with(|| BTreeMap::new()); + if let Some(previous) = by_id.insert( + baseboard.clone(), + CabooseFound { + time_collected: Utc::now(), + source: source.to_owned(), + caboose: sw_caboose.clone(), + }, + ) { + let error = if *previous.caboose == *sw_caboose { + anyhow!("reported multiple times (same value)",) + } else { + anyhow!( + "reported caboose multiple times (previously {:?}, \ now {:?})", - previous, - sw_caboose - )), + previous, + sw_caboose + ) + }; + Err(error.context(format!( + "baseboard {:?} caboose {:?}", + baseboard, which + ))) + } else { + Ok(()) } } @@ -361,717 +276,3 @@ impl CollectionBuilder { self.errors.push(error); } } - -#[cfg(test)] -mod test { - use super::CollectionBuilder; - use crate::builder::SpSlot; - use chrono::Utc; - use gateway_client::types::PowerState; - use gateway_client::types::RotSlot; - use gateway_client::types::RotState; - use gateway_client::types::SpComponentCaboose; - use 
gateway_client::types::SpState; - use gateway_client::types::SpType; - use nexus_types::inventory::BaseboardId; - use nexus_types::inventory::Caboose; - use strum::IntoEnumIterator; - - // Verify the contents of an empty collection. - #[test] - fn test_empty() { - let time_before = Utc::now(); - let builder = CollectionBuilder::new("test_empty"); - let collection = builder.build(); - let time_after = Utc::now(); - - assert!(collection.errors.is_empty()); - assert!(time_before <= collection.time_started); - assert!(collection.time_started <= collection.time_done); - assert!(collection.time_done <= time_after); - assert_eq!(collection.collector, "test_empty"); - assert!(collection.baseboards.is_empty()); - assert!(collection.cabooses.is_empty()); - assert!(collection.sps.is_empty()); - assert!(collection.rots.is_empty()); - } - - // Simple test of a single, fairly typical collection that contains just - // about all kinds of valid data. That includes exercising: - // - // - all three baseboard types (switch, sled, PSC) - // - various valid values for all fields (sources, slot numbers, power - // states, baseboard revisions, cabooses, etc.) - // - some empty slots - // - some missing cabooses - // - some cabooses common to multiple baseboards; others not - // - serial number reused across different model numbers - // - // This test is admittedly pretty tedious and maybe not worthwhile but it's - // a useful quick check. - #[test] - fn test_basic() { - let time_before = Utc::now(); - let mut builder = CollectionBuilder::new("test_basic"); - - // an ordinary, working sled - let sled1_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 3, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 0, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest1", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest1", - )), - transient_boot_preference: None, - }, - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // another ordinary sled with different values for ordinary fields - let sled2_bb = builder - .found_sp_state( - "fake MGS 2", - SpType::Sled, - 4, - SpState { - base_mac_address: [1; 6], - hubris_archive_id: String::from("hubris2"), - model: String::from("model2"), - power_state: PowerState::A2, - revision: 1, - rot: RotState::Enabled { - active: RotSlot::B, - pending_persistent_boot_preference: Some(RotSlot::A), - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest2", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest2", - )), - transient_boot_preference: Some(RotSlot::B), - }, - // same serial number, which is okay because it's a different - // model number - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // a switch - let switch1_bb = builder - .found_sp_state( - "fake MGS 2", - SpType::Switch, - 0, - SpState { - base_mac_address: [2; 6], - hubris_archive_id: String::from("hubris3"), - model: String::from("model3"), - power_state: PowerState::A1, - revision: 2, - rot: RotState::Enabled { - active: RotSlot::B, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest3", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest3", - )), - 
transient_boot_preference: None, - }, - // same serial number, which is okay because it's a different - // model number - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // a PSC - let psc_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Power, - 1, - SpState { - base_mac_address: [3; 6], - hubris_archive_id: String::from("hubris4"), - model: String::from("model4"), - power_state: PowerState::A2, - revision: 3, - rot: RotState::Enabled { - active: RotSlot::B, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest4", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest4", - )), - transient_boot_preference: None, - }, - serial_number: String::from("s2"), - }, - ) - .unwrap(); - - // a sled with no RoT state or other optional fields - let sled3_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 5, - SpState { - base_mac_address: [4; 6], - hubris_archive_id: String::from("hubris5"), - model: String::from("model1"), - power_state: PowerState::A2, - revision: 1, - rot: RotState::CommunicationFailed { - message: String::from("test suite injected error"), - }, - serial_number: String::from("s2"), - }, - ) - .unwrap(); - - // Report some cabooses. - - // We'll use the same cabooses for most of these components, although - // that's not possible in a real system. We deliberately construct a - // new value each time to make sure the builder correctly normalizes it. - let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb]; - for bb in &common_caboose_baseboards { - for slot in SpSlot::iter() { - assert!(!builder.found_sp_caboose_already(bb, slot)); - let _ = builder - .found_sp_caboose( - bb, - slot, - "test suite", - SpComponentCaboose { - board: String::from("board1"), - git_commit: String::from("git_commit1"), - name: String::from("name1"), - version: String::from("version1"), - }, - ) - .unwrap(); - assert!(builder.found_sp_caboose_already(bb, slot)); - } - - for slot in RotSlot::iter() { - assert!(!builder.found_rot_caboose_already(bb, slot)); - let _ = builder.found_rot_caboose( - bb, - slot, - "test suite", - SpComponentCaboose { - board: String::from("board1"), - git_commit: String::from("git_commit1"), - name: String::from("name1"), - version: String::from("version1"), - }, - ); - assert!(builder.found_rot_caboose_already(bb, slot)); - } - } - - // For the PSC, use different cabooses for both slots of both the SP and - // RoT, just to exercise that we correctly keep track of different - // cabooses. 
- let _ = builder - .found_sp_caboose( - &psc_bb, - SpSlot::Slot0, - "test suite", - SpComponentCaboose { - board: String::from("psc_sp_0"), - git_commit: String::from("psc_sp_0"), - name: String::from("psc_sp_0"), - version: String::from("psc_sp_0"), - }, - ) - .unwrap(); - let _ = builder - .found_sp_caboose( - &psc_bb, - SpSlot::Slot1, - "test suite", - SpComponentCaboose { - board: String::from("psc_sp_1"), - git_commit: String::from("psc_sp_1"), - name: String::from("psc_sp_1"), - version: String::from("psc_sp_1"), - }, - ) - .unwrap(); - let _ = builder - .found_rot_caboose( - &psc_bb, - RotSlot::A, - "test suite", - SpComponentCaboose { - board: String::from("psc_rot_a"), - git_commit: String::from("psc_rot_a"), - name: String::from("psc_rot_a"), - version: String::from("psc_rot_a"), - }, - ) - .unwrap(); - let _ = builder - .found_rot_caboose( - &psc_bb, - RotSlot::B, - "test suite", - SpComponentCaboose { - board: String::from("psc_rot_b"), - git_commit: String::from("psc_rot_b"), - name: String::from("psc_rot_b"), - version: String::from("psc_rot_b"), - }, - ) - .unwrap(); - - // We deliberately provide no cabooses for sled3. - - // Finish the collection and verify the basics. - let collection = builder.build(); - let time_after = Utc::now(); - println!("{:#?}", collection); - assert!(time_before <= collection.time_started); - assert!(collection.time_started <= collection.time_done); - assert!(collection.time_done <= time_after); - assert_eq!(collection.collector, "test_basic"); - - // Verify the one error that ought to have been produced for the SP with - // no RoT information. - assert_eq!( - collection.errors.iter().map(|e| e.to_string()).collect::>(), - ["MGS \"fake MGS 1\": reading RoT state for BaseboardId \ - { part_number: \"model1\", serial_number: \"s2\" }: test suite \ - injected error"] - ); - - // Verify the baseboard ids found. - let expected_baseboards = - &[&sled1_bb, &sled2_bb, &sled3_bb, &switch1_bb, &psc_bb]; - for bb in expected_baseboards { - assert!(collection.baseboards.contains(*bb)); - } - assert_eq!(collection.baseboards.len(), expected_baseboards.len()); - - // Verify the stuff that's easy to verify for all SPs: timestamps. - assert_eq!(collection.sps.len(), collection.baseboards.len()); - for (bb, sp) in collection.sps.iter() { - assert!(collection.time_started <= sp.time_collected); - assert!(sp.time_collected <= collection.time_done); - - if let Some(rot) = collection.rots.get(bb) { - assert_eq!(rot.source, sp.source); - assert_eq!(rot.time_collected, sp.time_collected); - } - - for c in - [&sp.slot0_caboose, &sp.slot1_caboose].into_iter().flatten() - { - assert!(collection.time_started <= c.time_collected); - assert!(c.time_collected <= collection.time_done); - } - } - - // Verify the common caboose. 
- let common_caboose = Caboose { - board: String::from("board1"), - git_commit: String::from("git_commit1"), - name: String::from("name1"), - version: String::from("version1"), - }; - for bb in &common_caboose_baseboards { - let sp = collection.sps.get(*bb).unwrap(); - let c0 = sp.slot0_caboose.as_ref().unwrap(); - let c1 = sp.slot0_caboose.as_ref().unwrap(); - assert_eq!(c0.source, "test suite"); - assert_eq!(*c0.caboose, common_caboose); - assert_eq!(c1.source, "test suite"); - assert_eq!(*c1.caboose, common_caboose); - - let rot = collection.rots.get(*bb).unwrap(); - let c0 = rot.slot_a_caboose.as_ref().unwrap(); - let c1 = rot.slot_b_caboose.as_ref().unwrap(); - assert_eq!(c0.source, "test suite"); - assert_eq!(*c0.caboose, common_caboose); - assert_eq!(c1.source, "test suite"); - assert_eq!(*c1.caboose, common_caboose); - } - assert!(collection.cabooses.contains(&common_caboose)); - - // Verify the specific, different data for the healthy SPs and RoTs that - // we reported. - // sled1 - let sp = collection.sps.get(&sled1_bb).unwrap(); - assert_eq!(sp.source, "fake MGS 1"); - assert_eq!(sp.sp_type, SpType::Sled); - assert_eq!(sp.sp_slot, 3); - assert_eq!(sp.baseboard_revision, 0); - assert_eq!(sp.hubris_archive, "hubris1"); - assert_eq!(sp.power_state, PowerState::A0); - let rot = collection.rots.get(&sled1_bb).unwrap(); - assert_eq!(rot.active_slot, RotSlot::A); - assert_eq!(rot.pending_persistent_boot_preference, None); - assert_eq!(rot.persistent_boot_preference, RotSlot::A); - assert_eq!( - rot.slot_a_sha3_256_digest.as_ref().unwrap(), - "slotAdigest1" - ); - assert_eq!( - rot.slot_b_sha3_256_digest.as_ref().unwrap(), - "slotBdigest1" - ); - assert_eq!(rot.transient_boot_preference, None); - - // sled2 - let sp = collection.sps.get(&sled2_bb).unwrap(); - assert_eq!(sp.source, "fake MGS 2"); - assert_eq!(sp.sp_type, SpType::Sled); - assert_eq!(sp.sp_slot, 4); - assert_eq!(sp.baseboard_revision, 1); - assert_eq!(sp.hubris_archive, "hubris2"); - assert_eq!(sp.power_state, PowerState::A2); - let rot = collection.rots.get(&sled2_bb).unwrap(); - assert_eq!(rot.active_slot, RotSlot::B); - assert_eq!(rot.pending_persistent_boot_preference, Some(RotSlot::A)); - assert_eq!(rot.persistent_boot_preference, RotSlot::A); - assert_eq!( - rot.slot_a_sha3_256_digest.as_ref().unwrap(), - "slotAdigest2" - ); - assert_eq!( - rot.slot_b_sha3_256_digest.as_ref().unwrap(), - "slotBdigest2" - ); - assert_eq!(rot.transient_boot_preference, Some(RotSlot::B)); - - // switch - let sp = collection.sps.get(&switch1_bb).unwrap(); - assert_eq!(sp.source, "fake MGS 2"); - assert_eq!(sp.sp_type, SpType::Switch); - assert_eq!(sp.sp_slot, 0); - assert_eq!(sp.baseboard_revision, 2); - assert_eq!(sp.hubris_archive, "hubris3"); - assert_eq!(sp.power_state, PowerState::A1); - let rot = collection.rots.get(&switch1_bb).unwrap(); - assert_eq!(rot.active_slot, RotSlot::B); - assert_eq!(rot.pending_persistent_boot_preference, None); - assert_eq!(rot.persistent_boot_preference, RotSlot::A); - assert_eq!( - rot.slot_a_sha3_256_digest.as_ref().unwrap(), - "slotAdigest3" - ); - assert_eq!( - rot.slot_b_sha3_256_digest.as_ref().unwrap(), - "slotBdigest3" - ); - assert_eq!(rot.transient_boot_preference, None); - - // PSC - let sp = collection.sps.get(&psc_bb).unwrap(); - assert_eq!(sp.source, "fake MGS 1"); - assert_eq!(sp.sp_type, SpType::Power); - assert_eq!(sp.sp_slot, 1); - assert_eq!(sp.baseboard_revision, 3); - assert_eq!(sp.hubris_archive, "hubris4"); - assert_eq!(sp.power_state, PowerState::A2); - let rot = 
collection.rots.get(&psc_bb).unwrap(); - assert_eq!(rot.active_slot, RotSlot::B); - assert_eq!(rot.pending_persistent_boot_preference, None); - assert_eq!(rot.persistent_boot_preference, RotSlot::A); - assert_eq!( - rot.slot_a_sha3_256_digest.as_ref().unwrap(), - "slotAdigest4" - ); - assert_eq!( - rot.slot_b_sha3_256_digest.as_ref().unwrap(), - "slotBdigest4" - ); - assert_eq!(rot.transient_boot_preference, None); - - // The PSC has four different cabooses! - let c = &sp.slot0_caboose.as_ref().unwrap().caboose; - assert_eq!(c.board, "psc_sp_0"); - assert!(collection.cabooses.contains(c)); - let c = &sp.slot1_caboose.as_ref().unwrap().caboose; - assert!(collection.cabooses.contains(c)); - assert_eq!(c.board, "psc_sp_1"); - let c = &rot.slot_a_caboose.as_ref().unwrap().caboose; - assert!(collection.cabooses.contains(c)); - assert_eq!(c.board, "psc_rot_a"); - let c = &rot.slot_b_caboose.as_ref().unwrap().caboose; - assert!(collection.cabooses.contains(c)); - assert_eq!(c.board, "psc_rot_b"); - - // Verify the reported SP state for sled3, which did not have a healthy - // RoT, nor any cabooses. - let sp = collection.sps.get(&sled3_bb).unwrap(); - assert_eq!(sp.source, "fake MGS 1"); - assert_eq!(sp.sp_type, SpType::Sled); - assert_eq!(sp.sp_slot, 5); - assert_eq!(sp.baseboard_revision, 1); - assert_eq!(sp.hubris_archive, "hubris5"); - assert_eq!(sp.power_state, PowerState::A2); - assert_eq!(sp.slot0_caboose, None); - assert_eq!(sp.slot1_caboose, None); - assert!(!collection.rots.contains_key(&sled3_bb)); - - // There shouldn't be any other RoTs. - assert_eq!(collection.sps.len(), collection.rots.len() + 1); - - // There should be five cabooses: the four used for the PSC (see above), - // plus the common one. - assert_eq!(collection.cabooses.len(), 5); - } - - // Exercises all the failure cases that shouldn't happen in real systems. - // Despite all of these failures, we should get a valid collection at the - // end. 
- #[test] - fn test_problems() { - let mut builder = CollectionBuilder::new("test_problems"); - - let sled1_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 3, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 0, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: None, - slot_b_sha3_256_digest: None, - transient_boot_preference: None, - }, - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // report the same SP again with the same contents - let sled1_bb_dup = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 3, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 0, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: None, - slot_b_sha3_256_digest: None, - transient_boot_preference: None, - }, - serial_number: String::from("s1"), - }, - ) - .unwrap(); - assert_eq!(sled1_bb, sled1_bb_dup); - - // report the same SP again with different contents - let sled1_bb_dup = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 3, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 1, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: None, - slot_b_sha3_256_digest: None, - transient_boot_preference: None, - }, - serial_number: String::from("s1"), - }, - ) - .unwrap(); - assert_eq!(sled1_bb, sled1_bb_dup); - - // report an SP with an impossible slot number - let sled2_sp = builder.found_sp_state( - "fake MGS 1", - SpType::Sled, - u32::from(u16::MAX) + 1, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 1, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: None, - slot_b_sha3_256_digest: None, - transient_boot_preference: None, - }, - serial_number: String::from("s2"), - }, - ); - assert_eq!(sled2_sp, None); - - // report SP caboose for an unknown baseboard - let bogus_baseboard = BaseboardId { - part_number: String::from("p1"), - serial_number: String::from("bogus"), - }; - let caboose1 = SpComponentCaboose { - board: String::from("board1"), - git_commit: String::from("git_commit1"), - name: String::from("name1"), - version: String::from("version1"), - }; - assert!( - !builder.found_sp_caboose_already(&bogus_baseboard, SpSlot::Slot0) - ); - let error = builder - .found_sp_caboose( - &bogus_baseboard, - SpSlot::Slot0, - "dummy", - caboose1.clone(), - ) - .unwrap_err(); - assert_eq!( - error.to_string(), - "reporting caboose for unknown baseboard: \ - BaseboardId { part_number: \"p1\", serial_number: \"bogus\" } \ - (Caboose { board: \"board1\", git_commit: \"git_commit1\", \ - name: \"name1\", version: \"version1\" })" - ); - assert!( - !builder.found_sp_caboose_already(&bogus_baseboard, SpSlot::Slot0) - ); - - // report RoT caboose for an unknown baseboard - let error2 = builder - 
.found_rot_caboose( - &bogus_baseboard, - RotSlot::A, - "dummy", - caboose1.clone(), - ) - .unwrap_err(); - assert_eq!(error.to_string(), error2.to_string(),); - - // report the same caboose twice with the same contents - let _ = builder - .found_sp_caboose( - &sled1_bb, - SpSlot::Slot0, - "dummy", - caboose1.clone(), - ) - .unwrap(); - let error = builder - .found_sp_caboose( - &sled1_bb, - SpSlot::Slot0, - "dummy", - caboose1.clone(), - ) - .unwrap_err(); - assert_eq!( - format!("{:#}", error), - "baseboard BaseboardId { part_number: \"model1\", \ - serial_number: \"s1\" } SP caboose Slot0: reported multiple \ - times (same value)" - ); - // report the same caboose again with different contents - let error = builder - .found_sp_caboose( - &sled1_bb, - SpSlot::Slot0, - "dummy", - SpComponentCaboose { - board: String::from("board2"), - git_commit: String::from("git_commit2"), - name: String::from("name2"), - version: String::from("version2"), - }, - ) - .unwrap_err(); - let message = format!("{:#}", error); - println!("found error: {}", message); - assert!(message.contains( - "SP caboose Slot0: reported caboose multiple times (previously" - )); - assert!(message.contains(", now ")); - - // We should still get a valid collection. - let collection = builder.build(); - println!("{:#?}", collection); - assert_eq!(collection.collector, "test_problems"); - - // We should still have the one sled and its SP slot0 caboose. - assert!(collection.baseboards.contains(&sled1_bb)); - let sp = collection.sps.get(&sled1_bb).unwrap(); - let caboose = sp.slot0_caboose.as_ref().unwrap(); - assert_eq!(caboose.caboose.board, "board2"); - assert!(collection.cabooses.contains(&caboose.caboose)); - assert_eq!(sp.slot1_caboose, None); - let rot = collection.rots.get(&sled1_bb).unwrap(); - assert_eq!(rot.slot_a_caboose, None); - assert_eq!(rot.slot_b_caboose, None); - - // We should see an error. - assert_eq!( - collection - .errors - .iter() - .map(|e| format!("{:#}", e)) - .collect::>(), - vec![ - "MGS \"fake MGS 1\": SP Sled slot 65536: \ - slot number did not fit into u16" - ] - ); - } -} diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 8781fdd655..6ba8171f74 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -4,9 +4,9 @@ //! Collection of inventory from Omicron components -use crate::builder::{CollectionBuilder, SpSlot}; +use crate::builder::CollectionBuilder; use anyhow::Context; -use gateway_client::types::RotSlot; +use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::Collection; use slog::{debug, error}; use std::sync::Arc; @@ -146,78 +146,32 @@ impl Collector { // fetched already, fetch it and record it. Generally, we'd only // get here for the first MGS client. Assuming that one succeeds, // the other(s) will skip this loop. 
- for sp_slot in SpSlot::iter() { + for which in CabooseWhich::iter() { if self .in_progress - .found_sp_caboose_already(&baseboard_id, sp_slot) + .sp_found_caboose_already(&baseboard_id, which) { continue; } - let slot_num = match sp_slot { - SpSlot::Slot0 => 0, - SpSlot::Slot1 => 1, - }; - - let result = client - .sp_component_caboose_get(sp.type_, sp.slot, "sp", slot_num) - .await - .with_context(|| { - format!( - "MGS {:?}: SP {:?}: SP caboose {:?}", - client.baseurl(), - sp, - sp_slot - ) - }); - let caboose = match result { - Err(error) => { - self.in_progress.found_error(error); - continue; - } - Ok(response) => response.into_inner(), - }; - if let Err(error) = self.in_progress.found_sp_caboose( - &baseboard_id, - sp_slot, - client.baseurl(), - caboose, - ) { - error!( - &self.log, - "error reporting caboose: {:?} SP {:?} {:?}: {:#}", - baseboard_id, - sp_slot, - client.baseurl(), - error - ); - } - } - - for rot_slot in RotSlot::iter() { - if self - .in_progress - .found_rot_caboose_already(&baseboard_id, rot_slot) - { - continue; - } - - let slot_num = match rot_slot { - RotSlot::A => 0, - RotSlot::B => 1, + let (component, slot) = match which { + CabooseWhich::SpSlot0 => ("sp", 0), + CabooseWhich::SpSlot1 => ("sp", 1), + CabooseWhich::RotSlotA => ("rot", 0), + CabooseWhich::RotSlotB => ("rot", 1), }; let result = client .sp_component_caboose_get( - sp.type_, sp.slot, "rot", slot_num, + sp.type_, sp.slot, component, slot, ) .await .with_context(|| { format!( - "MGS {:?}: SP {:?}: RoT caboose {:?}", + "MGS {:?}: SP {:?}: caboose {:?}", client.baseurl(), sp, - rot_slot + which ) }); let caboose = match result { @@ -227,17 +181,17 @@ impl Collector { } Ok(response) => response.into_inner(), }; - if let Err(error) = self.in_progress.found_rot_caboose( + if let Err(error) = self.in_progress.found_sp_caboose( &baseboard_id, - rot_slot, + which, client.baseurl(), caboose, ) { error!( &self.log, - "error reporting caboose: {:?} RoT {:?} {:?}: {:#}", + "error reporting caboose: {:?} {:?} {:?}: {:#}", baseboard_id, - rot_slot, + which, client.baseurl(), error ); @@ -289,61 +243,37 @@ mod test { // data comes straight from MGS. And proper handling of that data is // tested in the builder. 
write!(&mut s, "\nSPs:\n").unwrap(); - for (bb, sp) in &collection.sps { + for (bb, _) in &collection.sps { write!( &mut s, - " baseboard part {:?} serial {:?} slot0 \ - caboose {} slot1 caboose {}\n", - bb.part_number, - bb.serial_number, - sp.slot0_caboose - .as_ref() - .map(|c| format!( - "{:?}/{:?}", - c.caboose.board, c.caboose.git_commit - )) - .as_deref() - .unwrap_or("(none)"), - sp.slot1_caboose - .as_ref() - .map(|c| format!( - "{:?}/{:?}", - c.caboose.board, c.caboose.git_commit - )) - .as_deref() - .unwrap_or("(none)"), + " baseboard part {:?} serial {:?}\n", + bb.part_number, bb.serial_number, ) .unwrap(); } write!(&mut s, "\nRoTs:\n").unwrap(); - for (bb, rot) in &collection.rots { + for (bb, _) in &collection.rots { write!( &mut s, - " baseboard part {:?} serial {:?} slot A \ - caboose {} slot B caboose {}\n", - bb.part_number, - bb.serial_number, - rot.slot_a_caboose - .as_ref() - .map(|c| format!( - "{:?}/{:?}", - c.caboose.board, c.caboose.git_commit - )) - .as_deref() - .unwrap_or("(none)"), - rot.slot_b_caboose - .as_ref() - .map(|c| format!( - "{:?}/{:?}", - c.caboose.board, c.caboose.git_commit - )) - .as_deref() - .unwrap_or("(none)"), + " baseboard part {:?} serial {:?}\n", + bb.part_number, bb.serial_number, ) .unwrap(); } + write!(&mut s, "\ncabooses found:\n").unwrap(); + for (kind, bb_to_found) in &collection.cabooses_found { + for (bb, found) in bb_to_found { + write!( + &mut s, + " {:?} baseboard part {:?} serial {:?}: board {:?}\n", + kind, bb.part_number, bb.serial_number, found.caboose.board, + ) + .unwrap(); + } + } + write!(&mut s, "\nerrors:\n").unwrap(); for e in &collection.errors { write!(&mut s, "error: {:#}\n", e).unwrap(); diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt index 630712a4e8..4a3bf62d63 100644 --- a/nexus/inventory/tests/output/collector_basic.txt +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -11,15 +11,33 @@ cabooses: board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" SPs: - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" RoTs: - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial 
"SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses found: + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" errors: diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt index e89fce075f..08806d0fe8 100644 --- a/nexus/inventory/tests/output/collector_errors.txt +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -11,16 +11,34 @@ cabooses: board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" SPs: - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot0 caboose "SimGimletSp"/"ffffffff" slot1 caboose "SimGimletSp"/"ffffffff" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot0 caboose "SimSidecarSp"/"ffffffff" slot1 caboose "SimSidecarSp"/"ffffffff" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" RoTs: - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" - baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" slot A caboose "SimGimletRot"/"eeeeeeee" slot B caboose "SimGimletRot"/"eeeeeeee" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" - baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" slot A caboose "SimSidecarRot"/"eeeeeeee" slot B caboose "SimSidecarRot"/"eeeeeeee" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses found: + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot0 baseboard part 
"FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" errors: error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error 128): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error 128): error trying to connect: tcp connect error: Network is unreachable (os error 128): tcp connect error: Network is unreachable (os error 128): Network is unreachable (os error 128) diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index ba889133ef..c4f2b665fe 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -17,6 +17,7 @@ pub use gateway_client::types::SpType; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::sync::Arc; +use strum::EnumIter; use uuid::Uuid; /// Results of collecting hardware/software inventory from various Omicron @@ -65,6 +66,13 @@ pub struct Collection { /// /// In practice, these will be inserted into the `inv_root_of_trust` table. pub rots: BTreeMap, RotState>, + /// all caboose contents found, keyed first by the kind of caboose + /// (`CabooseWhich`), then the baseboard id of the sled where they were + /// found + /// + /// In practice, these will be inserted into the `inv_caboose` table. 
+ pub cabooses_found: + BTreeMap, CabooseFound>>, } /// A unique baseboard id found during a collection @@ -115,7 +123,6 @@ impl From for Caboose { /// particular source, but these are only for debugging) #[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] pub struct CabooseFound { - pub id: Uuid, pub time_collected: DateTime, pub source: String, pub caboose: Arc, @@ -133,9 +140,6 @@ pub struct ServiceProcessor { pub baseboard_revision: u32, pub hubris_archive: String, pub power_state: PowerState, - - pub slot0_caboose: Option>, - pub slot1_caboose: Option>, } /// Describes the root of trust state found (from a service processor) during @@ -151,7 +155,13 @@ pub struct RotState { pub transient_boot_preference: Option, pub slot_a_sha3_256_digest: Option, pub slot_b_sha3_256_digest: Option, +} - pub slot_a_caboose: Option>, - pub slot_b_caboose: Option>, +/// Describes which caboose this is (which component, which slot) +#[derive(Clone, Copy, Debug, EnumIter, PartialEq, Eq, PartialOrd, Ord)] +pub enum CabooseWhich { + SpSlot0, + SpSlot1, + RotSlotA, + RotSlotB, } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 311a6afcf5..07971c19ce 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2668,11 +2668,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( hubris_archive_id TEXT NOT NULL, power_state omicron.public.hw_power_state NOT NULL, - -- Caboose information (foreign keys into `inv_caboose`). The requests to - -- fetch this information can individually fail so these fields can be NULL. - slot0_inv_caboose_id UUID, - slot1_inv_caboose_id UUID, - PRIMARY KEY (inv_collection_id, hw_baseboard_id) ); @@ -2698,42 +2693,36 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( slot_a_sha3_256 TEXT, -- nullable slot_b_sha3_256 TEXT, -- nullable - -- Caboose information (foreign keys into `inv_caboose`). The requests to - -- fetch this information can individually fail so these fields can be NULL. - slot_a_inv_caboose_id UUID, - slot_b_inv_caboose_id UUID, - PRIMARY KEY (inv_collection_id, hw_baseboard_id) ); +CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( + 'sp_slot_0', + 'sp_slot_1', + 'rot_slot_A', + 'rot_slot_B' +); + -- cabooses found --- --- Rows in this table reflect that a particular caboose (`sw_caboose_id`) was --- found in this collection, having been reported by `source` at --- `time_collected`. It may be an SP or RoT caboose, and it could be in either --- slot. To know which, you need to look at which field in --- `inv_service_processor` or `inv_root_of_trust` points to it. --- --- Technically, we don't need `inv_collection_id` here because it's implied by --- whatever points to this record. 
CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( -- where this observation came from -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, -- when this observation was made time_collected TIMESTAMPTZ NOT NULL, -- which MGS instance reported this data source TEXT NOT NULL, - id UUID PRIMARY KEY, - sw_caboose_id UUID NOT NULL -); + which omicron.public.caboose_which NOT NULL, + sw_caboose_id UUID NOT NULL, --- Allow us to paginate through the cabooses that are part of a collection -CREATE INDEX IF NOT EXISTS lookup_caboose ON omicron.public.inv_caboose ( - inv_collection_id, id + PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) ); + /*******************************************************************/ /* From 298400060aa641f864dab6860eb92ffdb3a6b8e7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 25 Oct 2023 17:13:15 -0700 Subject: [PATCH 12/20] add new database test, restore and fix previous builder test --- Cargo.lock | 1 + dev-tools/omdb/src/bin/omdb/db.rs | 2 +- nexus/db-model/src/inventory.rs | 113 ++- nexus/db-queries/Cargo.toml | 1 + .../db-queries/src/db/datastore/inventory.rs | 310 ++++++- nexus/inventory/src/builder.rs | 758 +++++++++++++++++- nexus/inventory/src/collector.rs | 12 +- nexus/inventory/src/lib.rs | 3 + nexus/types/src/inventory.rs | 16 +- 9 files changed, 1181 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 43c551d9c2..b0b8175268 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4405,6 +4405,7 @@ dependencies = [ "newtype_derive", "nexus-db-model", "nexus-defaults", + "nexus-inventory", "nexus-test-utils", "nexus-types", "omicron-common 0.1.0", diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 4546a6e543..6972b60608 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -12,7 +12,7 @@ //! would be the only consumer -- and in that case it's okay to query the //! database directly. -// NOTE: eminates from Tabled macros +// NOTE: emanates from Tabled macros #![allow(clippy::useless_vec)] use crate::Omdb; diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index e9c1ee1f98..fd8c0952b5 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -48,6 +48,16 @@ impl From for HwPowerState { } } +impl From for PowerState { + fn from(value: HwPowerState) -> Self { + match value { + HwPowerState::A0 => PowerState::A0, + HwPowerState::A1 => PowerState::A1, + HwPowerState::A2 => PowerState::A2, + } + } +} + // See [`nexus_types::inventory::RotSlot`]. impl_enum_type!( #[derive(SqlType, Debug, QueryId)] @@ -72,6 +82,15 @@ impl From for HwRotSlot { } } +impl From for RotSlot { + fn from(value: HwRotSlot) -> RotSlot { + match value { + HwRotSlot::A => RotSlot::A, + HwRotSlot::B => RotSlot::B, + } + } +} + // See [`nexus_types::inventory::CabooseWhich`]. 
impl_enum_type!( #[derive(SqlType, Debug, QueryId)] @@ -91,19 +110,24 @@ impl_enum_type!( impl From for CabooseWhich { fn from(c: nexus_types::inventory::CabooseWhich) -> Self { + use nexus_types::inventory as nexus_inventory; match c { - nexus_types::inventory::CabooseWhich::SpSlot0 => { - CabooseWhich::SpSlot0 - } - nexus_types::inventory::CabooseWhich::SpSlot1 => { - CabooseWhich::SpSlot1 - } - nexus_types::inventory::CabooseWhich::RotSlotA => { - CabooseWhich::RotSlotA - } - nexus_types::inventory::CabooseWhich::RotSlotB => { - CabooseWhich::RotSlotB - } + nexus_inventory::CabooseWhich::SpSlot0 => CabooseWhich::SpSlot0, + nexus_inventory::CabooseWhich::SpSlot1 => CabooseWhich::SpSlot1, + nexus_inventory::CabooseWhich::RotSlotA => CabooseWhich::RotSlotA, + nexus_inventory::CabooseWhich::RotSlotB => CabooseWhich::RotSlotB, + } + } +} + +impl From for nexus_types::inventory::CabooseWhich { + fn from(row: CabooseWhich) -> Self { + use nexus_types::inventory as nexus_inventory; + match row { + CabooseWhich::SpSlot0 => nexus_inventory::CabooseWhich::SpSlot0, + CabooseWhich::SpSlot1 => nexus_inventory::CabooseWhich::SpSlot1, + CabooseWhich::RotSlotA => nexus_inventory::CabooseWhich::RotSlotA, + CabooseWhich::RotSlotB => nexus_inventory::CabooseWhich::RotSlotB, } } } @@ -144,6 +168,16 @@ impl From for SpType { } } +impl From for nexus_types::inventory::SpType { + fn from(value: SpType) -> Self { + match value { + SpType::Sled => nexus_types::inventory::SpType::Sled, + SpType::Switch => nexus_types::inventory::SpType::Switch, + SpType::Power => nexus_types::inventory::SpType::Power, + } + } +} + /// See [`nexus_types::inventory::Collection`]. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = inv_collection)] @@ -184,6 +218,15 @@ impl<'a> From<&'a BaseboardId> for HwBaseboardId { } } +impl From for BaseboardId { + fn from(row: HwBaseboardId) -> Self { + BaseboardId { + part_number: row.part_number, + serial_number: row.serial_number, + } + } +} + /// See [`nexus_types::inventory::Caboose`]. #[derive( Queryable, @@ -217,6 +260,17 @@ impl<'a> From<&'a Caboose> for SwCaboose { } } +impl From for Caboose { + fn from(row: SwCaboose) -> Self { + Self { + board: row.board, + git_commit: row.git_commit, + name: row.name, + version: row.version, + } + } +} + /// See [`nexus_types::inventory::Collection`]. #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = inv_collection_error)] @@ -253,6 +307,20 @@ pub struct InvServiceProcessor { pub power_state: HwPowerState, } +impl From for nexus_types::inventory::ServiceProcessor { + fn from(row: InvServiceProcessor) -> Self { + Self { + time_collected: row.time_collected, + source: row.source, + sp_type: nexus_types::inventory::SpType::from(row.sp_type), + sp_slot: **row.sp_slot, + baseboard_revision: **row.baseboard_revision, + hubris_archive: row.hubris_archive_id, + power_state: PowerState::from(row.power_state), + } + } +} + /// Newtype wrapping the MGS-reported slot number for an SP /// /// Current racks only have 32 slots for any given SP type. 
MGS represents the @@ -340,6 +408,27 @@ pub struct InvRootOfTrust { pub slot_b_sha3_256: Option, } +impl From for nexus_types::inventory::RotState { + fn from(row: InvRootOfTrust) -> Self { + Self { + time_collected: row.time_collected, + source: row.source, + active_slot: RotSlot::from(row.slot_active), + persistent_boot_preference: RotSlot::from( + row.slot_boot_pref_persistent, + ), + pending_persistent_boot_preference: row + .slot_boot_pref_persistent_pending + .map(RotSlot::from), + transient_boot_preference: row + .slot_boot_pref_transient + .map(RotSlot::from), + slot_a_sha3_256_digest: row.slot_a_sha3_256, + slot_b_sha3_256_digest: row.slot_b_sha3_256, + } + } +} + /// See [`nexus_types::inventory::CabooseFound`]. #[derive(Queryable, Clone, Debug, Selectable)] #[diesel(table_name = inv_caboose)] diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index eaf3dc1295..c30342f003 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -71,6 +71,7 @@ expectorate.workspace = true hyper-rustls.workspace = true internal-dns.workspace = true itertools.workspace = true +nexus-inventory.workspace = true nexus-test-utils.workspace = true omicron-sled-agent.workspace = true omicron-test-utils.workspace = true diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 3b0df6b519..77196cf7a0 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -72,7 +72,7 @@ impl DataStore { .errors .iter() .enumerate() - .map(|(i, error)| { + .map(|(i, message)| { let index = u16::try_from(i).map_err(|e| { Error::internal_error(&format!( "failed to convert error index to u16 (too \ @@ -80,8 +80,11 @@ impl DataStore { e )) })?; - let message = format!("{:#}", error); - Ok(InvCollectionError::new(collection_id, index, message)) + Ok(InvCollectionError::new( + collection_id, + index, + message.clone(), + )) }) .collect::, Error>>()?; @@ -912,3 +915,304 @@ impl diesel::query_builder::QueryId for InvCabooseInsert { type QueryId = (); const HAS_STATIC_QUERY_ID: bool = false; } + +#[cfg(test)] +mod test { + use crate::db::datastore::datastore_test; + use crate::db::datastore::DataStoreConnection; + use crate::db::schema; + use anyhow::anyhow; + use anyhow::bail; + use anyhow::Context; + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::expression::SelectableHelper; + use diesel::ExpressionMethods; + use diesel::QueryDsl; + use nexus_db_model::HwBaseboardId; + use nexus_db_model::InvCaboose; + use nexus_db_model::InvCollection; + use nexus_db_model::InvCollectionError; + use nexus_db_model::InvRootOfTrust; + use nexus_db_model::InvServiceProcessor; + use nexus_db_model::SwCaboose; + use nexus_test_utils::db::test_setup_database; + use nexus_types::inventory::BaseboardId; + use nexus_types::inventory::Caboose; + use nexus_types::inventory::CabooseFound; + use nexus_types::inventory::CabooseWhich; + use nexus_types::inventory::Collection; + use nexus_types::inventory::RotState; + use nexus_types::inventory::ServiceProcessor; + use omicron_test_utils::dev; + use std::collections::BTreeMap; + use std::collections::BTreeSet; + use std::num::NonZeroU32; + use std::sync::Arc; + use uuid::Uuid; + + // This function could move into the datastore if it proves helpful. We'd + // need to work out how to report the usual type of Error. For now we don't + // need it so we limit its scope to the test suite. 
+ async fn read_collection_best_effort( + conn: &DataStoreConnection<'_>, + id: Uuid, + limit: NonZeroU32, + ) -> anyhow::Result<(Collection, bool)> { + let sql_limit = i64::from(u32::from(limit)); + let usize_limit = usize::try_from(u32::from(limit)).unwrap(); + let mut limit_reached = false; + let (time_started, time_done, collector) = { + use schema::inv_collection::dsl; + + let collections = dsl::inv_collection + .filter(dsl::id.eq(id)) + .limit(2) + .select(InvCollection::as_select()) + .load_async(&**conn) + .await + .context("loading collection")?; + anyhow::ensure!(collections.len() == 1); + let collection = collections.into_iter().next().unwrap(); + ( + collection.time_started, + collection.time_done, + collection.collector, + ) + }; + + let errors: Vec = { + use schema::inv_collection_error::dsl; + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(id)) + .order_by(dsl::idx) + .limit(sql_limit) + .select(InvCollectionError::as_select()) + .load_async(&**conn) + .await + .context("loading collection errors")? + .into_iter() + .map(|e| e.message) + .collect() + }; + limit_reached = limit_reached || errors.len() == usize_limit; + + let sps: BTreeMap<_, _> = { + use schema::inv_service_processor::dsl; + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvServiceProcessor::as_select()) + .load_async(&**conn) + .await + .context("loading service processors")? + .into_iter() + .map(|sp_row| { + let baseboard_id = sp_row.hw_baseboard_id; + (baseboard_id, ServiceProcessor::from(sp_row)) + }) + .collect() + }; + limit_reached = limit_reached || sps.len() == usize_limit; + + let rots: BTreeMap<_, _> = { + use schema::inv_root_of_trust::dsl; + dsl::inv_root_of_trust + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvRootOfTrust::as_select()) + .load_async(&**conn) + .await + .context("loading roots of trust")? + .into_iter() + .map(|rot_row| { + let baseboard_id = rot_row.hw_baseboard_id; + (baseboard_id, RotState::from(rot_row)) + }) + .collect() + }; + limit_reached = limit_reached || rots.len() == usize_limit; + + // Collect the unique baseboard ids referenced by SPs and RoTs. + let baseboard_id_ids: BTreeSet<_> = + sps.keys().chain(rots.keys()).cloned().collect(); + // Fetch the corresponding baseboard records. + let baseboards_by_id: BTreeMap<_, _> = { + use schema::hw_baseboard_id::dsl; + dsl::hw_baseboard_id + .filter(dsl::id.eq_any(baseboard_id_ids)) + .limit(sql_limit) + .select(HwBaseboardId::as_select()) + .load_async(&**conn) + .await + .context("loading baseboards")? + .into_iter() + .map(|bb| (bb.id, Arc::new(BaseboardId::from(bb)))) + .collect() + }; + limit_reached = limit_reached || baseboards_by_id.len() == usize_limit; + + // Having those, we can replace the keys in the maps above with + // references to the actual baseboard rather than the uuid. + let sps = sps + .into_iter() + .map(|(id, sp)| { + baseboards_by_id.get(&id).map(|bb| (bb.clone(), sp)).ok_or_else( + || anyhow!("missing baseboard that we should have fetched"), + ) + }) + .collect::, _>>()?; + let rots = rots + .into_iter() + .map(|(id, rot)| { + baseboards_by_id + .get(&id) + .map(|bb| (bb.clone(), rot)) + .ok_or_else(|| { + anyhow!("missing baseboard that we should have fetched") + }) + }) + .collect::, _>>()?; + + // Fetch records of cabooses found. 
+ let inv_caboose_rows = { + use schema::inv_caboose::dsl; + dsl::inv_caboose + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading inv_cabooses")? + }; + limit_reached = limit_reached || inv_caboose_rows.len() == usize_limit; + + // Collect the unique sw_caboose_ids for those cabooses. + let sw_caboose_ids: BTreeSet<_> = inv_caboose_rows + .iter() + .map(|inv_caboose| inv_caboose.sw_caboose_id) + .collect(); + // Fetch the corresponing records. + let cabooses_by_id: BTreeMap<_, _> = { + use schema::sw_caboose::dsl; + dsl::sw_caboose + .filter(dsl::id.eq_any(sw_caboose_ids)) + .limit(sql_limit) + .select(SwCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading sw_cabooses")? + .into_iter() + .map(|sw_caboose_row| { + (sw_caboose_row.id, Arc::new(Caboose::from(sw_caboose_row))) + }) + .collect() + }; + limit_reached = limit_reached || cabooses_by_id.len() == usize_limit; + + // Assemble the lists of cabooses found. + let mut cabooses_found = BTreeMap::new(); + for c in inv_caboose_rows { + let by_baseboard = cabooses_found + .entry(CabooseWhich::from(c.which)) + .or_insert_with(BTreeMap::new); + let Some(bb) = baseboards_by_id.get(&c.hw_baseboard_id) else { + bail!( + "unknown baseboard found in inv_caboose: {}", + c.hw_baseboard_id + ); + }; + let Some(sw_caboose) = cabooses_by_id.get(&c.sw_caboose_id) else { + bail!( + "unknown caboose found in inv_caboose: {}", + c.sw_caboose_id + ); + }; + + let previous = by_baseboard.insert( + bb.clone(), + CabooseFound { + time_collected: c.time_collected, + source: c.source, + caboose: sw_caboose.clone(), + }, + ); + anyhow::ensure!( + previous.is_none(), + "duplicate caboose found: {:?} baseboard {:?}", + c.which, + c.hw_baseboard_id + ); + } + + Ok(( + Collection { + id, + errors, + time_started, + time_done, + collector, + baseboards: baseboards_by_id.values().cloned().collect(), + cabooses: cabooses_by_id.values().cloned().collect(), + sps, + rots, + cabooses_found, + }, + limit_reached, + )) + } + + async fn read_collection_all_or_nothing( + conn: &DataStoreConnection<'_>, + id: Uuid, + limit: NonZeroU32, + ) -> anyhow::Result { + let (collection, limit_reached) = + read_collection_best_effort(conn, id, limit).await?; + anyhow::ensure!( + !limit_reached, + "hit limit of {} records while loading collection", + limit + ); + Ok(collection) + } + + async fn read_collection( + conn: &DataStoreConnection<'_>, + id: Uuid, + ) -> anyhow::Result { + let limit = NonZeroU32::new(1000).unwrap(); + read_collection_all_or_nothing(conn, id, limit).await + } + + #[tokio::test] + async fn test_insert_empty() { + // Setup + let logctx = dev::test_setup_log("inventory_insert_empty"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create an empty collection and write it to the database. + let builder = nexus_inventory::CollectionBuilder::new("test"); + let collection = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection) + .await + .expect("failed to insert collection"); + + // Read it back. + let conn = datastore.pool_connection_for_tests().await.unwrap(); + let collection_read = read_collection(&conn, collection.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection, collection_read); + + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // XXX-dap TODO-coverage + // - sequence of collections with overlapping baseboards, new baseboards + // - find pruneable collection + // - delete collection +} diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 85c1122c29..156f328d72 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -54,7 +54,7 @@ impl CollectionBuilder { pub fn new(collector: &str) -> Self { CollectionBuilder { errors: vec![], - time_started: Utc::now(), + time_started: now(), collector: collector.to_owned(), baseboards: BTreeSet::new(), cabooses: BTreeSet::new(), @@ -68,9 +68,13 @@ impl CollectionBuilder { pub fn build(self) -> Collection { Collection { id: Uuid::new_v4(), - errors: self.errors, + errors: self + .errors + .into_iter() + .map(|e| format!("{:#}", e)) + .collect(), time_started: self.time_started, - time_done: Utc::now(), + time_done: now(), collector: self.collector, baseboards: self.baseboards, cabooses: self.cabooses, @@ -122,7 +126,7 @@ impl CollectionBuilder { // Separate the SP state into the SP-specific state and the RoT state, // if any. - let now = Utc::now(); + let now = now(); let _ = self.sps.entry(baseboard.clone()).or_insert_with(|| { ServiceProcessor { time_collected: now, @@ -180,7 +184,7 @@ impl CollectionBuilder { /// /// This is used to avoid requesting it multiple times (from multiple MGS /// instances). - pub fn sp_found_caboose_already( + pub fn found_caboose_already( &self, baseboard: &BaseboardId, which: CabooseWhich, @@ -198,16 +202,16 @@ impl CollectionBuilder { /// /// `source` is an arbitrary string for debugging that describes the MGS /// that reported this data (generally a URL string). - pub fn found_sp_caboose( + pub fn found_caboose( &mut self, baseboard: &BaseboardId, which: CabooseWhich, source: &str, caboose: SpComponentCaboose, ) -> Result<(), anyhow::Error> { - // Normalize the caboose contents: i.e., if we've seen this exact caboose - // contents before, use the same record from before. Otherwise, make a - // new one. + // Normalize the caboose contents: i.e., if we've seen this exact + // caboose contents before, use the same record from before. Otherwise, + // make a new one. let sw_caboose = Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); let (baseboard, _) = @@ -223,7 +227,7 @@ impl CollectionBuilder { if let Some(previous) = by_id.insert( baseboard.clone(), CabooseFound { - time_collected: Utc::now(), + time_collected: now(), source: source.to_owned(), caboose: sw_caboose.clone(), }, @@ -276,3 +280,737 @@ impl CollectionBuilder { self.errors.push(error); } } + +/// Returns the current time, truncated to the previous microsecond. +/// +/// This exists because the database doesn't store nanosecond-precision, so if +/// we store nanosecond-precision timestamps, then DateTime conversion is lossy +/// when round-tripping through the database. That's rather inconvenient. 
+fn now() -> DateTime { + let ts = Utc::now(); + ts - std::time::Duration::from_nanos(u64::from(ts.timestamp_subsec_nanos())) +} + +#[cfg(test)] +mod test { + use super::now; + use super::CollectionBuilder; + use gateway_client::types::PowerState; + use gateway_client::types::RotSlot; + use gateway_client::types::RotState; + use gateway_client::types::SpComponentCaboose; + use gateway_client::types::SpState; + use gateway_client::types::SpType; + use nexus_types::inventory::BaseboardId; + use nexus_types::inventory::Caboose; + use nexus_types::inventory::CabooseWhich; + use strum::IntoEnumIterator; + + // Verify the contents of an empty collection. + #[test] + fn test_empty() { + let time_before = now(); + let builder = CollectionBuilder::new("test_empty"); + let collection = builder.build(); + let time_after = now(); + + assert!(collection.errors.is_empty()); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "test_empty"); + assert!(collection.baseboards.is_empty()); + assert!(collection.cabooses.is_empty()); + assert!(collection.sps.is_empty()); + assert!(collection.rots.is_empty()); + assert!(collection.cabooses_found.is_empty()); + } + + // Simple test of a single, fairly typical collection that contains just + // about all kinds of valid data. That includes exercising: + // + // - all three baseboard types (switch, sled, PSC) + // - various valid values for all fields (sources, slot numbers, power + // states, baseboard revisions, cabooses, etc.) + // - some empty slots + // - some missing cabooses + // - some cabooses common to multiple baseboards; others not + // - serial number reused across different model numbers + // + // This test is admittedly pretty tedious and maybe not worthwhile but it's + // a useful quick check. 
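// [Editorial aside, not part of the patch] The `now()` helper above exists so
// that builder timestamps survive a round trip through the database and can
// then be compared with `==`. A standalone sketch of the underlying idea --
// truncating away precision the database cannot store -- using a hypothetical
// helper name and truncation to whole microseconds:
#[cfg(test)]
mod truncation_sketch {
    use chrono::{DateTime, Duration, Utc};

    /// Drop the sub-microsecond part of a timestamp.
    fn truncate_to_micros(ts: DateTime<Utc>) -> DateTime<Utc> {
        let extra_nanos = ts.timestamp_subsec_nanos() % 1_000;
        ts - Duration::nanoseconds(i64::from(extra_nanos))
    }

    #[test]
    fn truncation_is_idempotent() {
        let t = truncate_to_micros(Utc::now());
        // Truncating an already-truncated value changes nothing -- the
        // property that makes post-round-trip equality checks reliable.
        assert_eq!(t, truncate_to_micros(t));
        assert_eq!(t.timestamp_subsec_nanos() % 1_000, 0);
    }
}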
+ #[test] + fn test_basic() { + let time_before = now(); + let mut builder = CollectionBuilder::new("test_basic"); + + // an ordinary, working sled + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest1", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest1", + )), + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // another ordinary sled with different values for ordinary fields + let sled2_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Sled, + 4, + SpState { + base_mac_address: [1; 6], + hubris_archive_id: String::from("hubris2"), + model: String::from("model2"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: Some(RotSlot::A), + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest2", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest2", + )), + transient_boot_preference: Some(RotSlot::B), + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a switch + let switch1_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Switch, + 0, + SpState { + base_mac_address: [2; 6], + hubris_archive_id: String::from("hubris3"), + model: String::from("model3"), + power_state: PowerState::A1, + revision: 2, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest3", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest3", + )), + transient_boot_preference: None, + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a PSC + let psc_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Power, + 1, + SpState { + base_mac_address: [3; 6], + hubris_archive_id: String::from("hubris4"), + model: String::from("model4"), + power_state: PowerState::A2, + revision: 3, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from( + "slotAdigest4", + )), + slot_b_sha3_256_digest: Some(String::from( + "slotBdigest4", + )), + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // a sled with no RoT state or other optional fields + let sled3_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 5, + SpState { + base_mac_address: [4; 6], + hubris_archive_id: String::from("hubris5"), + model: String::from("model1"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::CommunicationFailed { + message: String::from("test suite injected error"), + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // Report some cabooses. + + // We'll use the same cabooses for most of these components, although + // that's not possible in a real system. 
We deliberately construct a + // new value each time to make sure the builder correctly normalizes it. + let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb]; + for bb in &common_caboose_baseboards { + for which in CabooseWhich::iter() { + assert!(!builder.found_caboose_already(bb, which)); + let _ = builder + .found_caboose( + bb, + which, + "test suite", + SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }, + ) + .unwrap(); + assert!(builder.found_caboose_already(bb, which)); + } + } + + // For the PSC, use different cabooses for both slots of both the SP and + // RoT, just to exercise that we correctly keep track of different + // cabooses. + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::SpSlot0, + "test suite", + SpComponentCaboose { + board: String::from("psc_sp_0"), + git_commit: String::from("psc_sp_0"), + name: String::from("psc_sp_0"), + version: String::from("psc_sp_0"), + }, + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::SpSlot1, + "test suite", + SpComponentCaboose { + board: String::from("psc_sp_1"), + git_commit: String::from("psc_sp_1"), + name: String::from("psc_sp_1"), + version: String::from("psc_sp_1"), + }, + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::RotSlotA, + "test suite", + SpComponentCaboose { + board: String::from("psc_rot_a"), + git_commit: String::from("psc_rot_a"), + name: String::from("psc_rot_a"), + version: String::from("psc_rot_a"), + }, + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::RotSlotB, + "test suite", + SpComponentCaboose { + board: String::from("psc_rot_b"), + git_commit: String::from("psc_rot_b"), + name: String::from("psc_rot_b"), + version: String::from("psc_rot_b"), + }, + ) + .unwrap(); + + // We deliberately provide no cabooses for sled3. + + // Finish the collection and verify the basics. + let collection = builder.build(); + let time_after = now(); + println!("{:#?}", collection); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "test_basic"); + + // Verify the one error that ought to have been produced for the SP with + // no RoT information. + assert_eq!( + collection.errors.iter().map(|e| e.to_string()).collect::>(), + ["MGS \"fake MGS 1\": reading RoT state for BaseboardId \ + { part_number: \"model1\", serial_number: \"s2\" }: test suite \ + injected error"] + ); + + // Verify the baseboard ids found. + let expected_baseboards = + &[&sled1_bb, &sled2_bb, &sled3_bb, &switch1_bb, &psc_bb]; + for bb in expected_baseboards { + assert!(collection.baseboards.contains(*bb)); + } + assert_eq!(collection.baseboards.len(), expected_baseboards.len()); + + // Verify the stuff that's easy to verify for all SPs: timestamps. 
+ assert_eq!(collection.sps.len(), collection.baseboards.len()); + for (bb, sp) in collection.sps.iter() { + assert!(collection.time_started <= sp.time_collected); + assert!(sp.time_collected <= collection.time_done); + + if let Some(rot) = collection.rots.get(bb) { + assert_eq!(rot.source, sp.source); + assert_eq!(rot.time_collected, sp.time_collected); + } + + for which in [CabooseWhich::SpSlot0, CabooseWhich::SpSlot1] { + let caboose = collection.caboose_for(which, bb); + if let Some(c) = caboose { + assert!(collection.time_started <= c.time_collected); + assert!(c.time_collected <= collection.time_done); + assert!(collection.cabooses.contains(&c.caboose)); + } + } + } + + // Verify the common caboose. + let common_caboose = Caboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }; + for bb in &common_caboose_baseboards { + let _ = collection.sps.get(*bb).unwrap(); + let c0 = collection.caboose_for(CabooseWhich::SpSlot0, bb).unwrap(); + let c1 = collection.caboose_for(CabooseWhich::SpSlot1, bb).unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + + let _ = collection.rots.get(*bb).unwrap(); + let c0 = + collection.caboose_for(CabooseWhich::RotSlotA, bb).unwrap(); + let c1 = + collection.caboose_for(CabooseWhich::RotSlotB, bb).unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + } + assert!(collection.cabooses.contains(&common_caboose)); + + // Verify the specific, different data for the healthy SPs and RoTs that + // we reported. 
+ // sled1 + let sp = collection.sps.get(&sled1_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 3); + assert_eq!(sp.baseboard_revision, 0); + assert_eq!(sp.hubris_archive, "hubris1"); + assert_eq!(sp.power_state, PowerState::A0); + let rot = collection.rots.get(&sled1_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::A); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest1" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest1" + ); + assert_eq!(rot.transient_boot_preference, None); + + // sled2 + let sp = collection.sps.get(&sled2_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 4); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris2"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&sled2_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, Some(RotSlot::A)); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest2" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest2" + ); + assert_eq!(rot.transient_boot_preference, Some(RotSlot::B)); + + // switch + let sp = collection.sps.get(&switch1_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Switch); + assert_eq!(sp.sp_slot, 0); + assert_eq!(sp.baseboard_revision, 2); + assert_eq!(sp.hubris_archive, "hubris3"); + assert_eq!(sp.power_state, PowerState::A1); + let rot = collection.rots.get(&switch1_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest3" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest3" + ); + assert_eq!(rot.transient_boot_preference, None); + + // PSC + let sp = collection.sps.get(&psc_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Power); + assert_eq!(sp.sp_slot, 1); + assert_eq!(sp.baseboard_revision, 3); + assert_eq!(sp.hubris_archive, "hubris4"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&psc_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest4" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest4" + ); + assert_eq!(rot.transient_boot_preference, None); + + // The PSC has four different cabooses! 
+ let c = &collection + .caboose_for(CabooseWhich::SpSlot0, &psc_bb) + .unwrap() + .caboose; + assert_eq!(c.board, "psc_sp_0"); + assert!(collection.cabooses.contains(c)); + let c = &collection + .caboose_for(CabooseWhich::SpSlot1, &psc_bb) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "psc_sp_1"); + let c = &collection + .caboose_for(CabooseWhich::RotSlotA, &psc_bb) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "psc_rot_a"); + let c = &collection + .caboose_for(CabooseWhich::RotSlotB, &psc_bb) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "psc_rot_b"); + + // Verify the reported SP state for sled3, which did not have a healthy + // RoT, nor any cabooses. + let sp = collection.sps.get(&sled3_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 5); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris5"); + assert_eq!(sp.power_state, PowerState::A2); + assert!(collection + .caboose_for(CabooseWhich::SpSlot0, &sled3_bb) + .is_none()); + assert!(collection + .caboose_for(CabooseWhich::SpSlot1, &sled3_bb) + .is_none()); + assert!(!collection.rots.contains_key(&sled3_bb)); + + // There shouldn't be any other RoTs. + assert_eq!(collection.sps.len(), collection.rots.len() + 1); + + // There should be five cabooses: the four used for the PSC (see above), + // plus the common one. + assert_eq!(collection.cabooses.len(), 5); + } + + // Exercises all the failure cases that shouldn't happen in real systems. + // Despite all of these failures, we should get a valid collection at the + // end. + #[test] + fn test_problems() { + let mut builder = CollectionBuilder::new("test_problems"); + + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // report the same SP again with the same contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report the same SP again with different contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, 
+ serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report an SP with an impossible slot number + let sled2_sp = builder.found_sp_state( + "fake MGS 1", + SpType::Sled, + u32::from(u16::MAX) + 1, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ); + assert_eq!(sled2_sp, None); + + // report SP caboose for an unknown baseboard + let bogus_baseboard = BaseboardId { + part_number: String::from("p1"), + serial_number: String::from("bogus"), + }; + let caboose1 = SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }; + assert!(!builder + .found_caboose_already(&bogus_baseboard, CabooseWhich::SpSlot0)); + let error = builder + .found_caboose( + &bogus_baseboard, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + error.to_string(), + "reporting caboose for unknown baseboard: \ + BaseboardId { part_number: \"p1\", serial_number: \"bogus\" } \ + (Caboose { board: \"board1\", git_commit: \"git_commit1\", \ + name: \"name1\", version: \"version1\" })" + ); + assert!(!builder + .found_caboose_already(&bogus_baseboard, CabooseWhich::SpSlot0)); + + // report RoT caboose for an unknown baseboard + let error2 = builder + .found_caboose( + &bogus_baseboard, + CabooseWhich::RotSlotA, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!(error.to_string(), error2.to_string(),); + + // report the same caboose twice with the same contents + let _ = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap(); + let error = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + format!("{:#}", error), + "baseboard BaseboardId { part_number: \"model1\", \ + serial_number: \"s1\" } caboose SpSlot0: reported multiple \ + times (same value)" + ); + // report the same caboose again with different contents + let error = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + SpComponentCaboose { + board: String::from("board2"), + git_commit: String::from("git_commit2"), + name: String::from("name2"), + version: String::from("version2"), + }, + ) + .unwrap_err(); + let message = format!("{:#}", error); + println!("found error: {}", message); + assert!(message.contains( + "caboose SpSlot0: reported caboose multiple times (previously" + )); + assert!(message.contains(", now ")); + + // We should still get a valid collection. + let collection = builder.build(); + println!("{:#?}", collection); + assert_eq!(collection.collector, "test_problems"); + + // We should still have the one sled and its SP slot0 caboose. 
+ assert!(collection.baseboards.contains(&sled1_bb)); + let _ = collection.sps.get(&sled1_bb).unwrap(); + let caboose = + collection.caboose_for(CabooseWhich::SpSlot0, &sled1_bb).unwrap(); + assert_eq!(caboose.caboose.board, "board2"); + assert!(collection.cabooses.contains(&caboose.caboose)); + assert!(collection + .caboose_for(CabooseWhich::SpSlot1, &sled1_bb) + .is_none()); + let _ = collection.rots.get(&sled1_bb).unwrap(); + assert!(collection + .caboose_for(CabooseWhich::RotSlotA, &sled1_bb) + .is_none()); + assert!(collection + .caboose_for(CabooseWhich::RotSlotB, &sled1_bb) + .is_none()); + + // We should see an error. + assert_eq!( + collection + .errors + .iter() + .map(|e| format!("{:#}", e)) + .collect::>(), + vec![ + "MGS \"fake MGS 1\": SP Sled slot 65536: \ + slot number did not fit into u16" + ] + ); + } +} diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 6ba8171f74..82e513ec30 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -147,9 +147,7 @@ impl Collector { // get here for the first MGS client. Assuming that one succeeds, // the other(s) will skip this loop. for which in CabooseWhich::iter() { - if self - .in_progress - .sp_found_caboose_already(&baseboard_id, which) + if self.in_progress.found_caboose_already(&baseboard_id, which) { continue; } @@ -181,7 +179,7 @@ impl Collector { } Ok(response) => response.into_inner(), }; - if let Err(error) = self.in_progress.found_sp_caboose( + if let Err(error) = self.in_progress.found_caboose( &baseboard_id, which, client.baseurl(), @@ -276,7 +274,7 @@ mod test { write!(&mut s, "\nerrors:\n").unwrap(); for e in &collection.errors { - write!(&mut s, "error: {:#}\n", e).unwrap(); + write!(&mut s, "error: {}\n", e).unwrap(); } s @@ -365,8 +363,8 @@ mod test { }; let bad_client = { // This IP range is guaranteed by RFC 6666 to discard traffic. - let url = format!("http://[100::1]:12345"); - let client = gateway_client::Client::new(&url, log.clone()); + let url = "http://[100::1]:12345"; + let client = gateway_client::Client::new(url, log.clone()); Arc::new(client) }; let mgs_clients = &[bad_client, real_client]; diff --git a/nexus/inventory/src/lib.rs b/nexus/inventory/src/lib.rs index c30c25369a..b0a823a69e 100644 --- a/nexus/inventory/src/lib.rs +++ b/nexus/inventory/src/lib.rs @@ -20,4 +20,7 @@ mod builder; mod collector; +// only exposed for test code to construct collections +pub use builder::CollectionBuilder; + pub use collector::Collector; diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index c4f2b665fe..112eec3a65 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -35,12 +35,12 @@ use uuid::Uuid; /// database. /// /// See the documentation in the database schema for more background. 
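// [Editorial aside, not part of the patch] The derive change just below (from
// `Debug` alone to `Debug, Eq, PartialEq`), together with the switch of
// `errors` from `Vec<anyhow::Error>` to `Vec<String>`, is what lets the new
// datastore test compare a collection read back from the database against the
// original with `assert_eq!`. A tiny illustration of why the error type had
// to change (`anyhow::Error` implements neither `Eq` nor `PartialEq`, so a
// struct holding it could not derive them):
#[derive(Debug, PartialEq, Eq)]
struct ComparableErrors {
    // Storing rendered messages keeps the containing type comparable.
    errors: Vec<String>,
}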
-#[derive(Debug)]
+#[derive(Debug, Eq, PartialEq)]
 pub struct Collection {
     /// unique identifier for this collection
     pub id: Uuid,
     /// errors encountered during collection
-    pub errors: Vec<anyhow::Error>,
+    pub errors: Vec<String>,
     /// time the collection started
     pub time_started: DateTime<Utc>,
     /// time the collection eneded
@@ -75,6 +75,18 @@ pub struct Collection {
         BTreeMap<CabooseWhich, BTreeMap<Arc<BaseboardId>, CabooseFound>>,
 }
 
+impl Collection {
+    pub fn caboose_for(
+        &self,
+        which: CabooseWhich,
+        baseboard_id: &BaseboardId,
+    ) -> Option<&CabooseFound> {
+        self.cabooses_found
+            .get(&which)
+            .and_then(|by_bb| by_bb.get(baseboard_id))
+    }
+}
+
 /// A unique baseboard id found during a collection
 ///
 /// Baseboard ids are the keys used to link up information from disparate

From 64ed0539fa1dd80b4a20ff0d99820ee51a92573d Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Fri, 27 Oct 2023 11:58:01 -0700
Subject: [PATCH 13/20] add more tests and update omdb to use functions I added for tests

---
 Cargo.lock                                    |   1 +
 dev-tools/omdb/src/bin/omdb/db.rs             | 328 ++----
 nexus/db-queries/Cargo.toml                   |   1 +
 .../db-queries/src/db/datastore/inventory.rs  | 931 +++++++++++++-----
 nexus/db-queries/src/db/datastore/mod.rs      |   1 +
 nexus/inventory/src/builder.rs                | 296 +-----
 nexus/inventory/src/examples.rs               | 254 +++++
 nexus/inventory/src/lib.rs                    |   1 +
 .../app/background/inventory_collection.rs    |  80 ++
 9 files changed, 1144 insertions(+), 749 deletions(-)
 create mode 100644 nexus/inventory/src/examples.rs

diff --git a/Cargo.lock b/Cargo.lock
index b0b8175268..e24eec38a5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4392,6 +4392,7 @@ dependencies = [
  "dropshot",
  "expectorate",
  "futures",
+ "gateway-client",
  "headers",
  "hex",
  "http",
diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs
index 6972b60608..54e344a04d 100644
--- a/dev-tools/omdb/src/bin/omdb/db.rs
+++ b/dev-tools/omdb/src/bin/omdb/db.rs
@@ -30,7 +30,7 @@ use diesel::BoolExpressionMethods;
 use diesel::ExpressionMethods;
 use diesel::JoinOnDsl;
 use diesel::NullableExpressionMethods;
-use nexus_db_model::CabooseWhich;
+use gateway_client::types::SpType;
 use nexus_db_model::Dataset;
 use nexus_db_model::Disk;
 use nexus_db_model::DnsGroup;
@@ -40,21 +40,17 @@ use nexus_db_model::DnsZone;
 use nexus_db_model::ExternalIp;
 use nexus_db_model::HwBaseboardId;
 use nexus_db_model::Instance;
-use nexus_db_model::InvCaboose;
 use nexus_db_model::InvCollection;
-use nexus_db_model::InvCollectionError;
-use nexus_db_model::InvRootOfTrust;
-use nexus_db_model::InvServiceProcessor;
 use nexus_db_model::Project;
 use nexus_db_model::Region;
 use nexus_db_model::Sled;
-use nexus_db_model::SpType;
 use nexus_db_model::SwCaboose;
 use nexus_db_model::Vmm;
 use nexus_db_model::Zpool;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db;
 use nexus_db_queries::db::datastore::DataStoreConnection;
+use nexus_db_queries::db::datastore::DataStoreInventoryTest;
 use nexus_db_queries::db::datastore::InstanceAndActiveVmm;
 use nexus_db_queries::db::identity::Asset;
 use nexus_db_queries::db::lookup::LookupPath;
@@ -63,6 +59,8 @@ use nexus_db_queries::db::DataStore;
 use nexus_types::identity::Resource;
 use nexus_types::internal_api::params::DnsRecord;
 use nexus_types::internal_api::params::Srv;
+use nexus_types::inventory::CabooseWhich;
+use nexus_types::inventory::Collection;
 use omicron_common::api::external::DataPageParams;
 use omicron_common::api::external::Generation;
 use omicron_common::postgres_config::PostgresConfigWithUrl;
@@ -450,15 +448,23 @@ where
     D: Display,
 {
     if items.len() ==
usize::try_from(limit.get()).unwrap() { - eprintln!( - "WARN: {}: found {} items (the limit). There may be more items \ - that were ignored. Consider overriding with --fetch-limit.", - context(), - items.len(), - ); + limit_error(limit, context); } } +fn limit_error(limit: NonZeroU32, context: F) +where + F: FnOnce() -> D, + D: Display, +{ + eprintln!( + "WARN: {}: found {} items (the limit). There may be more items \ + that were ignored. Consider overriding with --fetch-limit.", + context(), + limit, + ); +} + /// Returns pagination parameters to fetch the first page of results for a /// paginated endpoint fn first_page<'a, T>(limit: NonZeroU32) -> DataPageParams<'a, T> { @@ -1472,7 +1478,7 @@ async fn cmd_db_inventory( }) => cmd_db_inventory_collections_list(&conn, limit).await, InventoryCommands::Collections(CollectionsArgs { command: CollectionsCommands::Show(CollectionsShowArgs { id }), - }) => cmd_db_inventory_collections_show(&conn, id, limit).await, + }) => cmd_db_inventory_collections_show(datastore, id, limit).await, } } @@ -1632,50 +1638,21 @@ async fn cmd_db_inventory_collections_list( } async fn cmd_db_inventory_collections_show( - conn: &DataStoreConnection<'_>, + datastore: &DataStore, id: Uuid, limit: NonZeroU32, ) -> Result<(), anyhow::Error> { - inv_collection_print(conn, id).await?; - let nerrors = inv_collection_print_errors(conn, id, limit).await?; - - // Load all the baseboards. We could select only the baseboards referenced - // by this collection. But it's simpler to fetch everything. And it's - // uncommon enough at this point to have unreferenced baseboards that it's - // worth calling them out. - let baseboard_ids = { - use db::schema::hw_baseboard_id::dsl; - let baseboard_ids = dsl::hw_baseboard_id - .limit(i64::from(u32::from(limit))) - .select(HwBaseboardId::as_select()) - .load_async(&**conn) - .await - .context("loading baseboard ids")?; - check_limit(&baseboard_ids, limit, || "loading baseboard ids"); - baseboard_ids.into_iter().map(|b| (b.id, b)).collect::>() - }; - - // Similarly, load cabooses that are referenced by this collection. 
- let cabooses = { - use db::schema::inv_caboose::dsl as inv_dsl; - use db::schema::sw_caboose::dsl as sw_dsl; - let unique_cabooses = inv_dsl::inv_caboose - .filter(inv_dsl::inv_collection_id.eq(id)) - .select(inv_dsl::sw_caboose_id) - .distinct(); - let cabooses = sw_dsl::sw_caboose - .filter(sw_dsl::id.eq_any(unique_cabooses)) - .limit(i64::from(u32::from(limit))) - .select(SwCaboose::as_select()) - .load_async(&**conn) - .await - .context("loading cabooses")?; - check_limit(&cabooses, limit, || "loading cabooses"); - cabooses.into_iter().map(|c| (c.id, c)).collect::>() - }; + let (collection, incomplete) = datastore + .inventory_collection_read_best_effort(id, limit) + .await + .context("reading collection")?; + if incomplete { + limit_error(limit, || "loading collection"); + } - inv_collection_print_devices(conn, id, limit, &baseboard_ids, &cabooses) - .await?; + inv_collection_print(&collection).await?; + let nerrors = inv_collection_print_errors(&collection).await?; + inv_collection_print_devices(&collection).await?; if nerrors > 0 { eprintln!( @@ -1690,29 +1667,13 @@ async fn cmd_db_inventory_collections_show( } async fn inv_collection_print( - conn: &DataStoreConnection<'_>, - id: Uuid, + collection: &Collection, ) -> Result<(), anyhow::Error> { - use db::schema::inv_collection::dsl; - let collections = dsl::inv_collection - .filter(dsl::id.eq(id)) - .limit(2) - .select(InvCollection::as_select()) - .load_async(&**conn) - .await - .context("loading collection")?; - anyhow::ensure!( - collections.len() == 1, - "expected exactly one collection with id {}, found {}", - id, - collections.len() - ); - let c = collections.into_iter().next().unwrap(); - println!("collection: {}", c.id); + println!("collection: {}", collection.id); println!( "collector: {}{}", - c.collector, - if c.collector.parse::().is_ok() { + collection.collector, + if collection.collector.parse::().is_ok() { " (likely a Nexus instance)" } else { "" @@ -1720,106 +1681,42 @@ async fn inv_collection_print( ); println!( "started: {}", - humantime::format_rfc3339_millis(c.time_started.into()) + humantime::format_rfc3339_millis(collection.time_started.into()) ); println!( "done: {}", - humantime::format_rfc3339_millis(c.time_done.into()) + humantime::format_rfc3339_millis(collection.time_done.into()) ); Ok(()) } async fn inv_collection_print_errors( - conn: &DataStoreConnection<'_>, - id: Uuid, - limit: NonZeroU32, + collection: &Collection, ) -> Result { - use db::schema::inv_collection_error::dsl; - let errors = dsl::inv_collection_error - .filter(dsl::inv_collection_id.eq(id)) - .limit(i64::from(u32::from(limit))) - .select(InvCollectionError::as_select()) - .load_async(&**conn) - .await - .context("loading collection errors")?; - check_limit(&errors, limit, || "loading collection errors"); - - println!("errors: {}", errors.len()); - for e in &errors { - println!(" error {}: {}", e.idx, e.message); + println!("errors: {}", collection.errors.len()); + for (index, message) in collection.errors.iter().enumerate() { + println!(" error {}: {}", index, message); } - Ok(errors + Ok(collection + .errors .len() .try_into() .expect("could not convert error count into u32 (yikes)")) } async fn inv_collection_print_devices( - conn: &DataStoreConnection<'_>, - id: Uuid, - limit: NonZeroU32, - baseboard_ids: &BTreeMap, - sw_cabooses: &BTreeMap, + collection: &Collection, ) -> Result<(), anyhow::Error> { - // Load the service processors, grouped by baseboard id. 
- let sps: BTreeMap = { - use db::schema::inv_service_processor::dsl; - let sps = dsl::inv_service_processor - .filter(dsl::inv_collection_id.eq(id)) - .limit(i64::from(u32::from(limit))) - .select(InvServiceProcessor::as_select()) - .load_async(&**conn) - .await - .context("loading service processors")?; - check_limit(&sps, limit, || "loading service processors"); - sps.into_iter().map(|s| (s.hw_baseboard_id, s)).collect() - }; - - // Load the roots of trust, grouped by baseboard id. - let rots: BTreeMap = { - use db::schema::inv_root_of_trust::dsl; - let rots = dsl::inv_root_of_trust - .filter(dsl::inv_collection_id.eq(id)) - .limit(i64::from(u32::from(limit))) - .select(InvRootOfTrust::as_select()) - .load_async(&**conn) - .await - .context("loading roots of trust")?; - check_limit(&rots, limit, || "loading roots of trust"); - rots.into_iter().map(|s| (s.hw_baseboard_id, s)).collect() - }; - - // Load cabooses found, grouped by baseboard id. - let inv_cabooses = { - use db::schema::inv_caboose::dsl; - let cabooses_found = dsl::inv_caboose - .filter(dsl::inv_collection_id.eq(id)) - .limit(i64::from(u32::from(limit))) - .select(InvCaboose::as_select()) - .load_async(&**conn) - .await - .context("loading cabooses found")?; - check_limit(&cabooses_found, limit, || "loading cabooses found"); - - let mut cabooses: BTreeMap> = BTreeMap::new(); - for ic in cabooses_found { - cabooses - .entry(ic.hw_baseboard_id) - .or_insert_with(Vec::new) - .push(ic); - } - cabooses - }; - // Assemble a list of baseboard ids, sorted first by device type (sled, // switch, power), then by slot number. This is the order in which we will // print everything out. - let mut sorted_baseboard_ids: Vec<_> = sps.keys().cloned().collect(); + let mut sorted_baseboard_ids: Vec<_> = + collection.sps.keys().cloned().collect(); sorted_baseboard_ids.sort_by(|s1, s2| { - let sp1 = sps.get(s1).unwrap(); - let sp2 = sps.get(s2).unwrap(); + let sp1 = collection.sps.get(s1).unwrap(); + let sp2 = collection.sps.get(s2).unwrap(); sp1.sp_type.cmp(&sp2.sp_type).then(sp1.sp_slot.cmp(&sp2.sp_slot)) }); @@ -1827,9 +1724,9 @@ async fn inv_collection_print_devices( for baseboard_id in &sorted_baseboard_ids { // This unwrap should not fail because the collection we're iterating // over came from the one we're looking into now. 
- let sp = sps.get(baseboard_id).unwrap(); - let baseboard = baseboard_ids.get(baseboard_id); - let rot = rots.get(baseboard_id); + let sp = collection.sps.get(baseboard_id).unwrap(); + let baseboard = collection.baseboards.get(baseboard_id); + let rot = collection.rots.get(baseboard_id); println!(""); match baseboard { @@ -1860,91 +1757,64 @@ async fn inv_collection_print_devices( println!(""); println!(" found at: {} from {}", sp.time_collected, sp.source); - println!(" cabooses:"); - if let Some(my_inv_cabooses) = inv_cabooses.get(baseboard_id) { - #[derive(Tabled)] - #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct CabooseRow<'a> { - slot: &'static str, - board: &'a str, - name: &'a str, - version: &'a str, - git_commit: &'a str, - } - let mut nbugs = 0; - let rows = my_inv_cabooses.iter().map(|ic| { - let slot = match ic.which { - CabooseWhich::SpSlot0 => " SP slot 0", - CabooseWhich::SpSlot1 => " SP slot 1", - CabooseWhich::RotSlotA => "RoT slot A", - CabooseWhich::RotSlotB => "RoT slot B", - }; - - let (board, name, version, git_commit) = - match sw_cabooses.get(&ic.sw_caboose_id) { - None => { - nbugs += 1; - ("-", "-", "-", "-") - } - Some(c) => ( - c.board.as_str(), - c.name.as_str(), - c.version.as_str(), - c.git_commit.as_str(), - ), - }; - - CabooseRow { slot, board, name, version, git_commit } - }); - - let table = tabled::Table::new(rows) - .with(tabled::settings::Style::empty()) - .with(tabled::settings::Padding::new(0, 1, 0, 0)) - .to_string(); - - println!("{}", textwrap::indent(&table.to_string(), " ")); - - if nbugs > 0 { - // Similar to above, if we don't have the sw_caboose for some - // inv_caboose, then it's a bug in either this tool (if we - // failed to fetch it) or the inventory system (if it failed to - // insert it). 
- println!( - "error: at least one caboose above was missing data \ - -- this is a bug" - ); - } + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CabooseRow<'a> { + slot: String, + board: &'a str, + name: &'a str, + version: &'a str, + git_commit: &'a str, } + println!(" cabooses:"); + let caboose_rows: Vec<_> = CabooseWhich::iter() + .filter_map(|c| { + collection.caboose_for(c, baseboard_id).map(|d| (c, d)) + }) + .map(|(c, found_caboose)| CabooseRow { + slot: format!("{:?}", c), + board: &found_caboose.caboose.board, + name: &found_caboose.caboose.name, + version: &found_caboose.caboose.version, + git_commit: &found_caboose.caboose.git_commit, + }) + .collect(); + let table = tabled::Table::new(caboose_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", textwrap::indent(&table.to_string(), " ")); + if let Some(rot) = rot { - println!(" RoT: active slot: slot {:?}", rot.slot_active); + println!(" RoT: active slot: slot {:?}", rot.active_slot); println!( " RoT: persistent boot preference: slot {:?}", - rot.slot_active + rot.persistent_boot_preference, ); println!( " RoT: pending persistent boot preference: {}", - rot.slot_boot_pref_persistent_pending + rot.pending_persistent_boot_preference .map(|s| format!("slot {:?}", s)) .unwrap_or_else(|| String::from("-")) ); println!( " RoT: transient boot preference: {}", - rot.slot_boot_pref_transient + rot.transient_boot_preference .map(|s| format!("slot {:?}", s)) .unwrap_or_else(|| String::from("-")) ); println!( " RoT: slot A SHA3-256: {}", - rot.slot_a_sha3_256 + rot.slot_a_sha3_256_digest .clone() .unwrap_or_else(|| String::from("-")) ); println!( " RoT: slot B SHA3-256: {}", - rot.slot_b_sha3_256 + rot.slot_b_sha3_256_digest .clone() .unwrap_or_else(|| String::from("-")) ); @@ -1954,49 +1824,35 @@ async fn inv_collection_print_devices( } println!(""); - for unused_baseboard in baseboard_ids - .keys() - .collect::>() - .difference(&sps.keys().collect::>()) - { - // It's not a bug in either omdb or the inventory system to find a - // baseboard not referenced in the collection. It might just mean a - // sled was removed from the system. But at this point it's uncommon - // enough to call out. - let b = baseboard_ids.get(unused_baseboard).unwrap(); - eprintln!( - "note: baseboard previously found, but not in this \ - collection: part {} serial {}", - b.part_number, b.serial_number - ); - } - for sp_missing_rot in sps + for sp_missing_rot in collection + .sps .keys() .collect::>() - .difference(&rots.keys().collect::>()) + .difference(&collection.rots.keys().collect::>()) { // It's not a bug in either omdb or the inventory system to find an SP // with no RoT. It just means that when we collected inventory from the // SP, it couldn't communicate with its RoT. - let sp = sps.get(sp_missing_rot).unwrap(); + let sp = collection.sps.get(*sp_missing_rot).unwrap(); println!( "warning: found SP with no RoT: {:?} slot {}", sp.sp_type, sp.sp_slot ); } - for rot_missing_sp in rots + + for rot_missing_sp in collection + .rots .keys() .collect::>() - .difference(&sps.keys().collect::>()) + .difference(&collection.sps.keys().collect::>()) { // It *is* a bug in the inventory system (or omdb) to find an RoT with // no SP, since we get the RoT information from the SP in the first // place. 
- let rot = rots.get(rot_missing_sp).unwrap(); println!( "error: found RoT with no SP: \ hw_baseboard_id {:?} -- this is a bug", - rot.hw_baseboard_id + rot_missing_sp ); } diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index c30342f003..62adbe5bd2 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -69,6 +69,7 @@ omicron-workspace-hack.workspace = true assert_matches.workspace = true expectorate.workspace = true hyper-rustls.workspace = true +gateway-client.workspace = true internal-dns.workspace = true itertools.workspace = true nexus-inventory.workspace = true diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 77196cf7a0..500af8e39b 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -8,11 +8,17 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; +use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use crate::db::TransactionError; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::AsyncSimpleConnection; use chrono::DateTime; use chrono::Utc; +use diesel::expression::SelectableHelper; use diesel::sql_types; use diesel::sql_types::Nullable; use diesel::Column; @@ -22,6 +28,8 @@ use diesel::NullableExpressionMethods; use diesel::QueryDsl; use diesel::QuerySource; use diesel::Table; +use futures::future::BoxFuture; +use futures::FutureExt; use nexus_db_model::CabooseWhich; use nexus_db_model::CabooseWhichEnum; use nexus_db_model::HwBaseboardId; @@ -29,8 +37,11 @@ use nexus_db_model::HwPowerState; use nexus_db_model::HwPowerStateEnum; use nexus_db_model::HwRotSlot; use nexus_db_model::HwRotSlotEnum; +use nexus_db_model::InvCaboose; use nexus_db_model::InvCollection; use nexus_db_model::InvCollectionError; +use nexus_db_model::InvRootOfTrust; +use nexus_db_model::InvServiceProcessor; use nexus_db_model::SpType; use nexus_db_model::SpTypeEnum; use nexus_db_model::SwCaboose; @@ -39,6 +50,10 @@ use nexus_types::inventory::CabooseFound; use nexus_types::inventory::Collection; use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::num::NonZeroU32; +use std::sync::Arc; use uuid::Uuid; impl DataStore { @@ -916,303 +931,719 @@ impl diesel::query_builder::QueryId for InvCabooseInsert { const HAS_STATIC_QUERY_ID: bool = false; } -#[cfg(test)] -mod test { - use crate::db::datastore::datastore_test; - use crate::db::datastore::DataStoreConnection; - use crate::db::schema; - use anyhow::anyhow; - use anyhow::bail; - use anyhow::Context; - use async_bb8_diesel::AsyncRunQueryDsl; - use diesel::expression::SelectableHelper; - use diesel::ExpressionMethods; - use diesel::QueryDsl; - use nexus_db_model::HwBaseboardId; - use nexus_db_model::InvCaboose; - use nexus_db_model::InvCollection; - use nexus_db_model::InvCollectionError; - use nexus_db_model::InvRootOfTrust; - use nexus_db_model::InvServiceProcessor; - use nexus_db_model::SwCaboose; - use nexus_test_utils::db::test_setup_database; - use nexus_types::inventory::BaseboardId; - use nexus_types::inventory::Caboose; - use nexus_types::inventory::CabooseFound; - use nexus_types::inventory::CabooseWhich; - use nexus_types::inventory::Collection; - use 
nexus_types::inventory::RotState; - use nexus_types::inventory::ServiceProcessor; - use omicron_test_utils::dev; - use std::collections::BTreeMap; - use std::collections::BTreeSet; - use std::num::NonZeroU32; - use std::sync::Arc; - use uuid::Uuid; +/// Extra interfaces that are not intended (and potentially unsafe) for use in +/// Nexus, but useful for testing and `omdb` +pub trait DataStoreInventoryTest: Send + Sync { + /// List all collections + /// + /// This does not paginate. + fn inventory_collections(&self) -> BoxFuture>>; + + /// Make a best effort to read the given collection while limiting queries + /// to `limit` results. Returns as much as it was able to get. The + /// returned bool indicates whether the returned collection might be + /// incomplete because the limit was reached. + fn inventory_collection_read_best_effort( + &self, + id: Uuid, + limit: NonZeroU32, + ) -> BoxFuture>; + + /// Attempt to read the given collection while limiting queries to `limit` + /// records + fn inventory_collection_read_all_or_nothing( + &self, + id: Uuid, + limit: NonZeroU32, + ) -> BoxFuture> { + async move { + let (collection, limit_reached) = + self.inventory_collection_read_best_effort(id, limit).await?; + anyhow::ensure!( + !limit_reached, + "hit limit of {} records while loading collection", + limit + ); + Ok(collection) + } + .boxed() + } +} + +impl DataStoreInventoryTest for DataStore { + fn inventory_collections(&self) -> BoxFuture>> { + async { + let conn = self + .pool_connection_for_tests() + .await + .context("getting connectoin")?; + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) + .await + .context("failed to allow table scan")?; + + use db::schema::inv_collection::dsl; + dsl::inv_collection + .select(dsl::id) + .order_by(dsl::time_started) + .load_async(&conn) + .await + .context("failed to list collections") + }) + .await + } + .boxed() + } // This function could move into the datastore if it proves helpful. We'd // need to work out how to report the usual type of Error. For now we don't // need it so we limit its scope to the test suite. - async fn read_collection_best_effort( - conn: &DataStoreConnection<'_>, + fn inventory_collection_read_best_effort( + &self, id: Uuid, limit: NonZeroU32, - ) -> anyhow::Result<(Collection, bool)> { - let sql_limit = i64::from(u32::from(limit)); - let usize_limit = usize::try_from(u32::from(limit)).unwrap(); - let mut limit_reached = false; - let (time_started, time_done, collector) = { - use schema::inv_collection::dsl; - - let collections = dsl::inv_collection - .filter(dsl::id.eq(id)) - .limit(2) - .select(InvCollection::as_select()) - .load_async(&**conn) + ) -> BoxFuture> { + async move { + let conn = &self + .pool_connection_for_tests() .await - .context("loading collection")?; - anyhow::ensure!(collections.len() == 1); - let collection = collections.into_iter().next().unwrap(); - ( - collection.time_started, - collection.time_done, - collection.collector, - ) - }; - - let errors: Vec = { - use schema::inv_collection_error::dsl; - dsl::inv_collection_error - .filter(dsl::inv_collection_id.eq(id)) - .order_by(dsl::idx) - .limit(sql_limit) - .select(InvCollectionError::as_select()) - .load_async(&**conn) - .await - .context("loading collection errors")? 
- .into_iter() - .map(|e| e.message) - .collect() - }; - limit_reached = limit_reached || errors.len() == usize_limit; - - let sps: BTreeMap<_, _> = { - use schema::inv_service_processor::dsl; - dsl::inv_service_processor - .filter(dsl::inv_collection_id.eq(id)) - .limit(sql_limit) - .select(InvServiceProcessor::as_select()) - .load_async(&**conn) - .await - .context("loading service processors")? - .into_iter() - .map(|sp_row| { - let baseboard_id = sp_row.hw_baseboard_id; - (baseboard_id, ServiceProcessor::from(sp_row)) - }) - .collect() - }; - limit_reached = limit_reached || sps.len() == usize_limit; - - let rots: BTreeMap<_, _> = { - use schema::inv_root_of_trust::dsl; - dsl::inv_root_of_trust - .filter(dsl::inv_collection_id.eq(id)) - .limit(sql_limit) - .select(InvRootOfTrust::as_select()) - .load_async(&**conn) - .await - .context("loading roots of trust")? + .context("getting connection")?; + let sql_limit = i64::from(u32::from(limit)); + let usize_limit = usize::try_from(u32::from(limit)).unwrap(); + let mut limit_reached = false; + let (time_started, time_done, collector) = { + use db::schema::inv_collection::dsl; + + let collections = dsl::inv_collection + .filter(dsl::id.eq(id)) + .limit(2) + .select(InvCollection::as_select()) + .load_async(&**conn) + .await + .context("loading collection")?; + anyhow::ensure!(collections.len() == 1); + let collection = collections.into_iter().next().unwrap(); + ( + collection.time_started, + collection.time_done, + collection.collector, + ) + }; + + let errors: Vec = { + use db::schema::inv_collection_error::dsl; + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(id)) + .order_by(dsl::idx) + .limit(sql_limit) + .select(InvCollectionError::as_select()) + .load_async(&**conn) + .await + .context("loading collection errors")? + .into_iter() + .map(|e| e.message) + .collect() + }; + limit_reached = limit_reached || errors.len() == usize_limit; + + let sps: BTreeMap<_, _> = { + use db::schema::inv_service_processor::dsl; + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvServiceProcessor::as_select()) + .load_async(&**conn) + .await + .context("loading service processors")? + .into_iter() + .map(|sp_row| { + let baseboard_id = sp_row.hw_baseboard_id; + ( + baseboard_id, + nexus_types::inventory::ServiceProcessor::from( + sp_row, + ), + ) + }) + .collect() + }; + limit_reached = limit_reached || sps.len() == usize_limit; + + let rots: BTreeMap<_, _> = { + use db::schema::inv_root_of_trust::dsl; + dsl::inv_root_of_trust + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvRootOfTrust::as_select()) + .load_async(&**conn) + .await + .context("loading roots of trust")? + .into_iter() + .map(|rot_row| { + let baseboard_id = rot_row.hw_baseboard_id; + ( + baseboard_id, + nexus_types::inventory::RotState::from(rot_row), + ) + }) + .collect() + }; + limit_reached = limit_reached || rots.len() == usize_limit; + + // Collect the unique baseboard ids referenced by SPs and RoTs. + let baseboard_id_ids: BTreeSet<_> = + sps.keys().chain(rots.keys()).cloned().collect(); + // Fetch the corresponding baseboard records. + let baseboards_by_id: BTreeMap<_, _> = { + use db::schema::hw_baseboard_id::dsl; + dsl::hw_baseboard_id + .filter(dsl::id.eq_any(baseboard_id_ids)) + .limit(sql_limit) + .select(HwBaseboardId::as_select()) + .load_async(&**conn) + .await + .context("loading baseboards")? 
+ .into_iter() + .map(|bb| { + ( + bb.id, + Arc::new( + nexus_types::inventory::BaseboardId::from(bb), + ), + ) + }) + .collect() + }; + limit_reached = + limit_reached || baseboards_by_id.len() == usize_limit; + + // Having those, we can replace the keys in the maps above with + // references to the actual baseboard rather than the uuid. + let sps = sps .into_iter() - .map(|rot_row| { - let baseboard_id = rot_row.hw_baseboard_id; - (baseboard_id, RotState::from(rot_row)) + .map(|(id, sp)| { + baseboards_by_id + .get(&id) + .map(|bb| (bb.clone(), sp)) + .ok_or_else(|| { + anyhow!( + "missing baseboard that we should have fetched" + ) + }) }) - .collect() - }; - limit_reached = limit_reached || rots.len() == usize_limit; - - // Collect the unique baseboard ids referenced by SPs and RoTs. - let baseboard_id_ids: BTreeSet<_> = - sps.keys().chain(rots.keys()).cloned().collect(); - // Fetch the corresponding baseboard records. - let baseboards_by_id: BTreeMap<_, _> = { - use schema::hw_baseboard_id::dsl; - dsl::hw_baseboard_id - .filter(dsl::id.eq_any(baseboard_id_ids)) - .limit(sql_limit) - .select(HwBaseboardId::as_select()) - .load_async(&**conn) - .await - .context("loading baseboards")? - .into_iter() - .map(|bb| (bb.id, Arc::new(BaseboardId::from(bb)))) - .collect() - }; - limit_reached = limit_reached || baseboards_by_id.len() == usize_limit; - - // Having those, we can replace the keys in the maps above with - // references to the actual baseboard rather than the uuid. - let sps = sps - .into_iter() - .map(|(id, sp)| { - baseboards_by_id.get(&id).map(|bb| (bb.clone(), sp)).ok_or_else( - || anyhow!("missing baseboard that we should have fetched"), - ) - }) - .collect::, _>>()?; - let rots = rots - .into_iter() - .map(|(id, rot)| { - baseboards_by_id + .collect::, _>>()?; + let rots = + rots.into_iter() + .map(|(id, rot)| { + baseboards_by_id .get(&id) .map(|bb| (bb.clone(), rot)) .ok_or_else(|| { anyhow!("missing baseboard that we should have fetched") }) - }) - .collect::, _>>()?; - - // Fetch records of cabooses found. - let inv_caboose_rows = { - use schema::inv_caboose::dsl; - dsl::inv_caboose - .filter(dsl::inv_collection_id.eq(id)) - .limit(sql_limit) - .select(InvCaboose::as_select()) - .load_async(&**conn) - .await - .context("loading inv_cabooses")? - }; - limit_reached = limit_reached || inv_caboose_rows.len() == usize_limit; + }) + .collect::, _>>()?; + + // Fetch records of cabooses found. + let inv_caboose_rows = { + use db::schema::inv_caboose::dsl; + dsl::inv_caboose + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading inv_cabooses")? + }; + limit_reached = + limit_reached || inv_caboose_rows.len() == usize_limit; + + // Collect the unique sw_caboose_ids for those cabooses. + let sw_caboose_ids: BTreeSet<_> = inv_caboose_rows + .iter() + .map(|inv_caboose| inv_caboose.sw_caboose_id) + .collect(); + // Fetch the corresponing records. + let cabooses_by_id: BTreeMap<_, _> = { + use db::schema::sw_caboose::dsl; + dsl::sw_caboose + .filter(dsl::id.eq_any(sw_caboose_ids)) + .limit(sql_limit) + .select(SwCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading sw_cabooses")? + .into_iter() + .map(|sw_caboose_row| { + ( + sw_caboose_row.id, + Arc::new(nexus_types::inventory::Caboose::from( + sw_caboose_row, + )), + ) + }) + .collect() + }; + limit_reached = + limit_reached || cabooses_by_id.len() == usize_limit; + + // Assemble the lists of cabooses found. 
+ let mut cabooses_found = BTreeMap::new(); + for c in inv_caboose_rows { + let by_baseboard = cabooses_found + .entry(nexus_types::inventory::CabooseWhich::from(c.which)) + .or_insert_with(BTreeMap::new); + let Some(bb) = baseboards_by_id.get(&c.hw_baseboard_id) else { + bail!( + "unknown baseboard found in inv_caboose: {}", + c.hw_baseboard_id + ); + }; + let Some(sw_caboose) = cabooses_by_id.get(&c.sw_caboose_id) + else { + bail!( + "unknown caboose found in inv_caboose: {}", + c.sw_caboose_id + ); + }; - // Collect the unique sw_caboose_ids for those cabooses. - let sw_caboose_ids: BTreeSet<_> = inv_caboose_rows - .iter() - .map(|inv_caboose| inv_caboose.sw_caboose_id) - .collect(); - // Fetch the corresponing records. - let cabooses_by_id: BTreeMap<_, _> = { - use schema::sw_caboose::dsl; - dsl::sw_caboose - .filter(dsl::id.eq_any(sw_caboose_ids)) - .limit(sql_limit) - .select(SwCaboose::as_select()) - .load_async(&**conn) - .await - .context("loading sw_cabooses")? - .into_iter() - .map(|sw_caboose_row| { - (sw_caboose_row.id, Arc::new(Caboose::from(sw_caboose_row))) - }) - .collect() - }; - limit_reached = limit_reached || cabooses_by_id.len() == usize_limit; - - // Assemble the lists of cabooses found. - let mut cabooses_found = BTreeMap::new(); - for c in inv_caboose_rows { - let by_baseboard = cabooses_found - .entry(CabooseWhich::from(c.which)) - .or_insert_with(BTreeMap::new); - let Some(bb) = baseboards_by_id.get(&c.hw_baseboard_id) else { - bail!( - "unknown baseboard found in inv_caboose: {}", - c.hw_baseboard_id + let previous = by_baseboard.insert( + bb.clone(), + nexus_types::inventory::CabooseFound { + time_collected: c.time_collected, + source: c.source, + caboose: sw_caboose.clone(), + }, ); - }; - let Some(sw_caboose) = cabooses_by_id.get(&c.sw_caboose_id) else { - bail!( - "unknown caboose found in inv_caboose: {}", - c.sw_caboose_id + anyhow::ensure!( + previous.is_none(), + "duplicate caboose found: {:?} baseboard {:?}", + c.which, + c.hw_baseboard_id ); - }; + } - let previous = by_baseboard.insert( - bb.clone(), - CabooseFound { - time_collected: c.time_collected, - source: c.source, - caboose: sw_caboose.clone(), + Ok(( + Collection { + id, + errors, + time_started, + time_done, + collector, + baseboards: baseboards_by_id.values().cloned().collect(), + cabooses: cabooses_by_id.values().cloned().collect(), + sps, + rots, + cabooses_found, }, - ); - anyhow::ensure!( - previous.is_none(), - "duplicate caboose found: {:?} baseboard {:?}", - c.which, - c.hw_baseboard_id - ); + limit_reached, + )) } - - Ok(( - Collection { - id, - errors, - time_started, - time_done, - collector, - baseboards: baseboards_by_id.values().cloned().collect(), - cabooses: cabooses_by_id.values().cloned().collect(), - sps, - rots, - cabooses_found, - }, - limit_reached, - )) + .boxed() } +} - async fn read_collection_all_or_nothing( - conn: &DataStoreConnection<'_>, - id: Uuid, - limit: NonZeroU32, - ) -> anyhow::Result { - let (collection, limit_reached) = - read_collection_best_effort(conn, id, limit).await?; - anyhow::ensure!( - !limit_reached, - "hit limit of {} records while loading collection", - limit - ); - Ok(collection) - } +#[cfg(test)] +mod test { + use crate::db::datastore::datastore_test; + use crate::db::datastore::inventory::DataStoreInventoryTest; + use crate::db::datastore::DataStore; + use crate::db::datastore::DataStoreConnection; + use crate::db::schema; + use anyhow::Context; + use async_bb8_diesel::AsyncConnection; + use async_bb8_diesel::AsyncRunQueryDsl; + use 
async_bb8_diesel::AsyncSimpleConnection; + use diesel::QueryDsl; + use gateway_client::types::SpType; + use nexus_inventory::examples::representative; + use nexus_inventory::examples::Representative; + use nexus_test_utils::db::test_setup_database; + use nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL; + use nexus_types::inventory::CabooseWhich; + use nexus_types::inventory::Collection; + use omicron_test_utils::dev; + use std::num::NonZeroU32; + use uuid::Uuid; async fn read_collection( - conn: &DataStoreConnection<'_>, + datastore: &DataStore, id: Uuid, ) -> anyhow::Result { let limit = NonZeroU32::new(1000).unwrap(); - read_collection_all_or_nothing(conn, id, limit).await + datastore.inventory_collection_read_all_or_nothing(id, limit).await + } + + async fn count_baseboards_cabooses( + conn: &DataStoreConnection<'_>, + ) -> anyhow::Result<(usize, usize)> { + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + let bb_count = schema::hw_baseboard_id::dsl::hw_baseboard_id + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .context("failed to count baseboards")?; + let caboose_count = schema::sw_caboose::dsl::sw_caboose + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .context("failed to count cabooses")?; + let bb_count_usize = usize::try_from(bb_count) + .context("failed to convert baseboard count to usize")?; + let caboose_count_usize = usize::try_from(caboose_count) + .context("failed to convert caboose count to usize")?; + Ok((bb_count_usize, caboose_count_usize)) + }) + .await } + /// Tests inserting several collections, reading them back, and making sure + /// they look the same. #[tokio::test] - async fn test_insert_empty() { + async fn test_inventory_insert() { // Setup - let logctx = dev::test_setup_log("inventory_insert_empty"); + let logctx = dev::test_setup_log("inventory_insert"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; // Create an empty collection and write it to the database. let builder = nexus_inventory::CollectionBuilder::new("test"); - let collection = builder.build(); + let collection1 = builder.build(); datastore - .inventory_insert_collection(&opctx, &collection) + .inventory_insert_collection(&opctx, &collection1) .await .expect("failed to insert collection"); // Read it back. let conn = datastore.pool_connection_for_tests().await.unwrap(); - let collection_read = read_collection(&conn, collection.id) + let collection_read = read_collection(&datastore, collection1.id) .await .expect("failed to read collection back"); - assert_eq!(collection, collection_read); + assert_eq!(collection1, collection_read); + + // There ought to be no baseboards or cabooses in the databases from + // that collection. + assert_eq!(collection1.baseboards.len(), 0); + assert_eq!(collection1.cabooses.len(), 0); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection1.baseboards.len(), nbaseboards); + assert_eq!(collection1.cabooses.len(), ncabooses); + + // Now insert a more complex collection, write it to the database, and + // read it back. + let Representative { builder, .. 
} = representative(); + let collection2 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection2) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection2.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection2, collection_read); + // Verify that we have exactly the set of cabooses and baseboards in the + // databases that came from this first non-empty collection. + assert_ne!(collection2.baseboards.len(), collection1.baseboards.len()); + assert_ne!(collection2.cabooses.len(), collection1.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection2.baseboards.len(), nbaseboards); + assert_eq!(collection2.cabooses.len(), ncabooses); + + // Now insert an equivalent collection again. Verify the distinct + // baseboards and cabooses again. This is important: the insertion + // process should re-use the baseboards and cabooses from the previous + // collection. + let Representative { builder, .. } = representative(); + let collection3 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection3) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection3.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection3, collection_read); + // Verify that we have the same number of cabooses and baseboards, since + // those didn't change. + assert_eq!(collection3.baseboards.len(), collection2.baseboards.len()); + assert_eq!(collection3.cabooses.len(), collection2.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection3.baseboards.len(), nbaseboards); + assert_eq!(collection3.cabooses.len(), ncabooses); + + // Now insert a collection that's almost equivalent, but has an extra + // couple of baseboards and caboose. Verify that we re-use the existing + // ones, but still insert the new ones. + let Representative { mut builder, .. } = representative(); + builder.found_sp_state( + "test suite", + SpType::Switch, + 1, + nexus_inventory::examples::sp_state("2"), + ); + let bb = builder + .found_sp_state( + "test suite", + SpType::Power, + 1, + nexus_inventory::examples::sp_state("3"), + ) + .unwrap(); + builder + .found_caboose( + &bb, + CabooseWhich::SpSlot0, + "dummy", + nexus_inventory::examples::caboose("dummy"), + ) + .unwrap(); + let collection4 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection4) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection4.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection4, collection_read); + // Verify the number of baseboards and collections again. + assert_eq!( + collection4.baseboards.len(), + collection3.baseboards.len() + 2 + ); + assert_eq!( + collection4.cabooses.len(), + collection3.baseboards.len() + 1 + ); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection4.baseboards.len(), nbaseboards); + assert_eq!(collection4.cabooses.len(), ncabooses); + + // This time, go back to our earlier collection. This logically removes + // some baseboards. They should still be present in the database, but + // not in the collection. + let Representative { builder, .. 
} = representative(); + let collection5 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection5) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection5.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection5, collection_read); + assert_eq!(collection5.baseboards.len(), collection3.baseboards.len()); + assert_eq!(collection5.cabooses.len(), collection3.cabooses.len()); + assert_ne!(collection5.baseboards.len(), collection4.baseboards.len()); + assert_ne!(collection5.cabooses.len(), collection4.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection4.baseboards.len(), nbaseboards); + assert_eq!(collection4.cabooses.len(), ncabooses); + + // Try to insert the same collection again and make sure it fails. + let error = datastore + .inventory_insert_collection(&opctx, &collection5) + .await + .expect_err("unexpectedly succeeded in inserting collection"); + assert!(format!("{:#}", error) + .contains("duplicate key value violates unique constraint")); + + // Now that we've inserted a bunch of collections, we can test pruning. + // + // The datastore should start by pruning the oldest collection, unless + // it's the only collection with no errors. The oldest one is + // `collection1`, which _is_ the only one with no errors. So we should + // get back `collection2`. + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[ + collection1.id, + collection2.id, + collection3.id, + collection4.id, + collection5.id, + ] + ); + println!( + "all collections: {:?}\n", + &[ + collection1.id, + collection2.id, + collection3.id, + collection4.id, + collection5.id, + ] + ); + datastore + .inventory_prune_collections(&opctx, 4) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection3.id, collection4.id, collection5.id,] + ); + // Again, we should skip over collection1 and delete the next oldest: + // collection3. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id,] + ); + // At this point, if we're keeping 3, we don't need to prune anything. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id,] + ); + + // If we then insert an empty collection (which has no errors), + // collection1 becomes pruneable. + let builder = nexus_inventory::CollectionBuilder::new("test"); + let collection6 = builder.build(); + println!( + "collection 6: {} ({:?})", + collection6.id, collection6.time_started + ); + datastore + .inventory_insert_collection(&opctx, &collection6) + .await + .expect("failed to insert collection"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id, collection6.id,] + ); + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection4.id, collection5.id, collection6.id,] + ); + // Again, at this point, we should not prune anything. 
+ datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection4.id, collection5.id, collection6.id,] + ); + + // If we insert another collection with errors, then prune, we should + // end up pruning collection 4. + let Representative { builder, .. } = representative(); + let collection7 = builder.build(); + println!( + "collection 7: {} ({:?})", + collection7.id, collection7.time_started + ); + datastore + .inventory_insert_collection(&opctx, &collection7) + .await + .expect("failed to insert collection"); + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection5.id, collection6.id, collection7.id,] + ); + + // If we try to fetch a pruned collection, we should get nothing. + let _ = read_collection(&datastore, collection4.id) + .await + .expect_err("unexpectedly read pruned collection"); + + // But we should still be able to fetch the collections that do exist. + let collection_read = + read_collection(&datastore, collection5.id).await.unwrap(); + assert_eq!(collection5, collection_read); + let collection_read = + read_collection(&datastore, collection6.id).await.unwrap(); + assert_eq!(collection6, collection_read); + let collection_read = + read_collection(&datastore, collection7.id).await.unwrap(); + assert_eq!(collection7, collection_read); + + // We should prune more than one collection, if needed. We'll wind up + // with just collection6 because that's the latest one with no errors. + datastore + .inventory_prune_collections(&opctx, 1) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection6.id,] + ); + + // Remove the remaining collection and make sure the inventory tables + // are empty (i.e., we got everything). + datastore + .inventory_delete_collection(&opctx, collection6.id) + .await + .expect("failed to delete collection"); + assert_eq!(datastore.inventory_collections().await.unwrap(), &[]); + + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + let count = schema::inv_collection::dsl::inv_collection + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_collection_error::dsl::inv_collection_error + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = + schema::inv_service_processor::dsl::inv_service_processor + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_root_of_trust::dsl::inv_root_of_trust + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_caboose::dsl::inv_caboose + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + Ok::<(), anyhow::Error>(()) + }) + .await + .expect("failed to check that tables were empty"); + + // We currently keep the baseboard ids and sw_cabooses around. + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_ne!(nbaseboards, 0); + assert_ne!(ncabooses, 0); // Clean up. 
db.cleanup().await.unwrap(); logctx.cleanup_successful(); } - - // XXX-dap TODO-coverage - // - sequence of collections with overlapping baseboards, new baseboards - // - find pruneable collection - // - delete collection } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 12959db827..3b2d81e1c7 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -94,6 +94,7 @@ pub use db_metadata::{ }; pub use dns::DnsVersionUpdateBuilder; pub use instance::InstanceAndActiveVmm; +pub use inventory::DataStoreInventoryTest; pub use rack::RackInit; pub use silo::Discoverability; pub use switch_port::SwitchPortSettingsCombinedResult; diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 156f328d72..ad008ee4df 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -288,13 +288,19 @@ impl CollectionBuilder { /// when round-tripping through the database. That's rather inconvenient. fn now() -> DateTime { let ts = Utc::now(); - ts - std::time::Duration::from_nanos(u64::from(ts.timestamp_subsec_nanos())) + let nanosecs = ts.timestamp_subsec_nanos(); + let micros = ts.timestamp_subsec_micros(); + let only_nanos = nanosecs - micros * 1000; + ts - std::time::Duration::from_nanos(u64::from(only_nanos)) } #[cfg(test)] mod test { use super::now; use super::CollectionBuilder; + use crate::examples::representative; + use crate::examples::sp_state; + use crate::examples::Representative; use gateway_client::types::PowerState; use gateway_client::types::RotSlot; use gateway_client::types::RotState; @@ -304,7 +310,6 @@ mod test { use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Caboose; use nexus_types::inventory::CabooseWhich; - use strum::IntoEnumIterator; // Verify the contents of an empty collection. 
#[test] @@ -342,240 +347,19 @@ mod test { #[test] fn test_basic() { let time_before = now(); - let mut builder = CollectionBuilder::new("test_basic"); - - // an ordinary, working sled - let sled1_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 3, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 0, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest1", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest1", - )), - transient_boot_preference: None, - }, - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // another ordinary sled with different values for ordinary fields - let sled2_bb = builder - .found_sp_state( - "fake MGS 2", - SpType::Sled, - 4, - SpState { - base_mac_address: [1; 6], - hubris_archive_id: String::from("hubris2"), - model: String::from("model2"), - power_state: PowerState::A2, - revision: 1, - rot: RotState::Enabled { - active: RotSlot::B, - pending_persistent_boot_preference: Some(RotSlot::A), - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest2", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest2", - )), - transient_boot_preference: Some(RotSlot::B), - }, - // same serial number, which is okay because it's a - // different model number - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // a switch - let switch1_bb = builder - .found_sp_state( - "fake MGS 2", - SpType::Switch, - 0, - SpState { - base_mac_address: [2; 6], - hubris_archive_id: String::from("hubris3"), - model: String::from("model3"), - power_state: PowerState::A1, - revision: 2, - rot: RotState::Enabled { - active: RotSlot::B, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest3", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest3", - )), - transient_boot_preference: None, - }, - // same serial number, which is okay because it's a - // different model number - serial_number: String::from("s1"), - }, - ) - .unwrap(); - - // a PSC - let psc_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Power, - 1, - SpState { - base_mac_address: [3; 6], - hubris_archive_id: String::from("hubris4"), - model: String::from("model4"), - power_state: PowerState::A2, - revision: 3, - rot: RotState::Enabled { - active: RotSlot::B, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: Some(String::from( - "slotAdigest4", - )), - slot_b_sha3_256_digest: Some(String::from( - "slotBdigest4", - )), - transient_boot_preference: None, - }, - serial_number: String::from("s2"), - }, - ) - .unwrap(); - - // a sled with no RoT state or other optional fields - let sled3_bb = builder - .found_sp_state( - "fake MGS 1", - SpType::Sled, - 5, - SpState { - base_mac_address: [4; 6], - hubris_archive_id: String::from("hubris5"), - model: String::from("model1"), - power_state: PowerState::A2, - revision: 1, - rot: RotState::CommunicationFailed { - message: String::from("test suite injected error"), - }, - serial_number: String::from("s2"), - }, - ) - .unwrap(); - - // Report some cabooses. - - // We'll use the same cabooses for most of these components, although - // that's not possible in a real system. 
We deliberately construct a - // new value each time to make sure the builder correctly normalizes it. - let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb]; - for bb in &common_caboose_baseboards { - for which in CabooseWhich::iter() { - assert!(!builder.found_caboose_already(bb, which)); - let _ = builder - .found_caboose( - bb, - which, - "test suite", - SpComponentCaboose { - board: String::from("board1"), - git_commit: String::from("git_commit1"), - name: String::from("name1"), - version: String::from("version1"), - }, - ) - .unwrap(); - assert!(builder.found_caboose_already(bb, which)); - } - } - - // For the PSC, use different cabooses for both slots of both the SP and - // RoT, just to exercise that we correctly keep track of different - // cabooses. - let _ = builder - .found_caboose( - &psc_bb, - CabooseWhich::SpSlot0, - "test suite", - SpComponentCaboose { - board: String::from("psc_sp_0"), - git_commit: String::from("psc_sp_0"), - name: String::from("psc_sp_0"), - version: String::from("psc_sp_0"), - }, - ) - .unwrap(); - let _ = builder - .found_caboose( - &psc_bb, - CabooseWhich::SpSlot1, - "test suite", - SpComponentCaboose { - board: String::from("psc_sp_1"), - git_commit: String::from("psc_sp_1"), - name: String::from("psc_sp_1"), - version: String::from("psc_sp_1"), - }, - ) - .unwrap(); - let _ = builder - .found_caboose( - &psc_bb, - CabooseWhich::RotSlotA, - "test suite", - SpComponentCaboose { - board: String::from("psc_rot_a"), - git_commit: String::from("psc_rot_a"), - name: String::from("psc_rot_a"), - version: String::from("psc_rot_a"), - }, - ) - .unwrap(); - let _ = builder - .found_caboose( - &psc_bb, - CabooseWhich::RotSlotB, - "test suite", - SpComponentCaboose { - board: String::from("psc_rot_b"), - git_commit: String::from("psc_rot_b"), - name: String::from("psc_rot_b"), - version: String::from("psc_rot_b"), - }, - ) - .unwrap(); - - // We deliberately provide no cabooses for sled3. - - // Finish the collection and verify the basics. + let Representative { + builder, + sleds: [sled1_bb, sled2_bb, sled3_bb], + switch, + psc, + } = representative(); let collection = builder.build(); let time_after = now(); println!("{:#?}", collection); assert!(time_before <= collection.time_started); assert!(collection.time_started <= collection.time_done); assert!(collection.time_done <= time_after); - assert_eq!(collection.collector, "test_basic"); + assert_eq!(collection.collector, "example"); // Verify the one error that ought to have been produced for the SP with // no RoT information. @@ -588,7 +372,7 @@ mod test { // Verify the baseboard ids found. let expected_baseboards = - &[&sled1_bb, &sled2_bb, &sled3_bb, &switch1_bb, &psc_bb]; + &[&sled1_bb, &sled2_bb, &sled3_bb, &switch, &psc]; for bb in expected_baseboards { assert!(collection.baseboards.contains(*bb)); } @@ -616,11 +400,12 @@ mod test { } // Verify the common caboose. 
+ let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch]; let common_caboose = Caboose { - board: String::from("board1"), - git_commit: String::from("git_commit1"), - name: String::from("name1"), - version: String::from("version1"), + board: String::from("board_1"), + git_commit: String::from("git_commit_1"), + name: String::from("name_1"), + version: String::from("version_1"), }; for bb in &common_caboose_baseboards { let _ = collection.sps.get(*bb).unwrap(); @@ -690,14 +475,14 @@ mod test { assert_eq!(rot.transient_boot_preference, Some(RotSlot::B)); // switch - let sp = collection.sps.get(&switch1_bb).unwrap(); + let sp = collection.sps.get(&switch).unwrap(); assert_eq!(sp.source, "fake MGS 2"); assert_eq!(sp.sp_type, SpType::Switch); assert_eq!(sp.sp_slot, 0); assert_eq!(sp.baseboard_revision, 2); assert_eq!(sp.hubris_archive, "hubris3"); assert_eq!(sp.power_state, PowerState::A1); - let rot = collection.rots.get(&switch1_bb).unwrap(); + let rot = collection.rots.get(&switch).unwrap(); assert_eq!(rot.active_slot, RotSlot::B); assert_eq!(rot.pending_persistent_boot_preference, None); assert_eq!(rot.persistent_boot_preference, RotSlot::A); @@ -712,14 +497,14 @@ mod test { assert_eq!(rot.transient_boot_preference, None); // PSC - let sp = collection.sps.get(&psc_bb).unwrap(); + let sp = collection.sps.get(&psc).unwrap(); assert_eq!(sp.source, "fake MGS 1"); assert_eq!(sp.sp_type, SpType::Power); assert_eq!(sp.sp_slot, 1); assert_eq!(sp.baseboard_revision, 3); assert_eq!(sp.hubris_archive, "hubris4"); assert_eq!(sp.power_state, PowerState::A2); - let rot = collection.rots.get(&psc_bb).unwrap(); + let rot = collection.rots.get(&psc).unwrap(); assert_eq!(rot.active_slot, RotSlot::B); assert_eq!(rot.pending_persistent_boot_preference, None); assert_eq!(rot.persistent_boot_preference, RotSlot::A); @@ -735,29 +520,29 @@ mod test { // The PSC has four different cabooses! let c = &collection - .caboose_for(CabooseWhich::SpSlot0, &psc_bb) + .caboose_for(CabooseWhich::SpSlot0, &psc) .unwrap() .caboose; - assert_eq!(c.board, "psc_sp_0"); + assert_eq!(c.board, "board_psc_sp_0"); assert!(collection.cabooses.contains(c)); let c = &collection - .caboose_for(CabooseWhich::SpSlot1, &psc_bb) + .caboose_for(CabooseWhich::SpSlot1, &psc) .unwrap() .caboose; assert!(collection.cabooses.contains(c)); - assert_eq!(c.board, "psc_sp_1"); + assert_eq!(c.board, "board_psc_sp_1"); let c = &collection - .caboose_for(CabooseWhich::RotSlotA, &psc_bb) + .caboose_for(CabooseWhich::RotSlotA, &psc) .unwrap() .caboose; assert!(collection.cabooses.contains(c)); - assert_eq!(c.board, "psc_rot_a"); + assert_eq!(c.board, "board_psc_rot_a"); let c = &collection - .caboose_for(CabooseWhich::RotSlotB, &psc_bb) + .caboose_for(CabooseWhich::RotSlotB, &psc) .unwrap() .caboose; assert!(collection.cabooses.contains(c)); - assert_eq!(c.board, "psc_rot_b"); + assert_eq!(c.board, "board_psc_rot_b"); // Verify the reported SP state for sled3, which did not have a healthy // RoT, nor any cabooses. 
@@ -872,22 +657,7 @@ mod test { "fake MGS 1", SpType::Sled, u32::from(u16::MAX) + 1, - SpState { - base_mac_address: [0; 6], - hubris_archive_id: String::from("hubris1"), - model: String::from("model1"), - power_state: PowerState::A0, - revision: 1, - rot: RotState::Enabled { - active: RotSlot::A, - pending_persistent_boot_preference: None, - persistent_boot_preference: RotSlot::A, - slot_a_sha3_256_digest: None, - slot_b_sha3_256_digest: None, - transient_boot_preference: None, - }, - serial_number: String::from("s2"), - }, + sp_state("1"), ); assert_eq!(sled2_sp, None); diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs new file mode 100644 index 0000000000..52aca397bb --- /dev/null +++ b/nexus/inventory/src/examples.rs @@ -0,0 +1,254 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Example collections used for testing + +use crate::CollectionBuilder; +use gateway_client::types::PowerState; +use gateway_client::types::RotSlot; +use gateway_client::types::RotState; +use gateway_client::types::SpComponentCaboose; +use gateway_client::types::SpState; +use gateway_client::types::SpType; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::CabooseWhich; +use std::sync::Arc; +use strum::IntoEnumIterator; + +/// Returns an example Collection used for testing +/// +/// This collection is intended to cover a variety of possible inventory data, +/// including: +/// +/// - all three baseboard types (switch, sled, PSC) +/// - various valid values for all fields (sources, slot numbers, power +/// states, baseboard revisions, cabooses, etc.) +/// - some empty slots +/// - some missing cabooses +/// - some cabooses common to multiple baseboards; others not +/// - serial number reused across different model numbers +pub fn representative() -> Representative { + let mut builder = CollectionBuilder::new("example"); + + // an ordinary, working sled + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest1")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest1")), + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // another ordinary sled with different values for ordinary fields + let sled2_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Sled, + 4, + SpState { + base_mac_address: [1; 6], + hubris_archive_id: String::from("hubris2"), + model: String::from("model2"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: Some(RotSlot::A), + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest2")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest2")), + transient_boot_preference: Some(RotSlot::B), + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a switch + let switch1_bb = builder + .found_sp_state( + "fake MGS 2", + 
SpType::Switch, + 0, + SpState { + base_mac_address: [2; 6], + hubris_archive_id: String::from("hubris3"), + model: String::from("model3"), + power_state: PowerState::A1, + revision: 2, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest3")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest3")), + transient_boot_preference: None, + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a PSC + let psc_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Power, + 1, + SpState { + base_mac_address: [3; 6], + hubris_archive_id: String::from("hubris4"), + model: String::from("model4"), + power_state: PowerState::A2, + revision: 3, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest4")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest4")), + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // a sled with no RoT state or other optional fields + let sled3_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 5, + SpState { + base_mac_address: [4; 6], + hubris_archive_id: String::from("hubris5"), + model: String::from("model1"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::CommunicationFailed { + message: String::from("test suite injected error"), + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // Report some cabooses. + + // We'll use the same cabooses for most of these components, although + // that's not possible in a real system. We deliberately construct a + // new value each time to make sure the builder correctly normalizes it. + let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb]; + for bb in &common_caboose_baseboards { + for which in CabooseWhich::iter() { + assert!(!builder.found_caboose_already(bb, which)); + let _ = builder + .found_caboose(bb, which, "test suite", caboose("1")) + .unwrap(); + assert!(builder.found_caboose_already(bb, which)); + } + } + + // For the PSC, use different cabooses for both slots of both the SP and + // RoT, just to exercise that we correctly keep track of different + // cabooses. + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::SpSlot0, + "test suite", + caboose("psc_sp_0"), + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::SpSlot1, + "test suite", + caboose("psc_sp_1"), + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::RotSlotA, + "test suite", + caboose("psc_rot_a"), + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::RotSlotB, + "test suite", + caboose("psc_rot_b"), + ) + .unwrap(); + + // We deliberately provide no cabooses for sled3. 
+ + Representative { + builder, + sleds: [sled1_bb, sled2_bb, sled3_bb], + switch: switch1_bb, + psc: psc_bb, + } +} + +pub struct Representative { + pub builder: CollectionBuilder, + pub sleds: [Arc; 3], + pub switch: Arc, + pub psc: Arc, +} + +/// Returns an SP state that can be used to populate a collection for testing +pub fn sp_state(unique: &str) -> SpState { + SpState { + base_mac_address: [0; 6], + hubris_archive_id: format!("hubris{}", unique), + model: format!("model{}", unique), + power_state: PowerState::A2, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest1")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest1")), + transient_boot_preference: None, + }, + serial_number: format!("serial{}", unique), + } +} + +pub fn caboose(unique: &str) -> SpComponentCaboose { + SpComponentCaboose { + board: format!("board_{}", unique), + git_commit: format!("git_commit_{}", unique), + name: format!("name_{}", unique), + version: format!("version_{}", unique), + } +} diff --git a/nexus/inventory/src/lib.rs b/nexus/inventory/src/lib.rs index b0a823a69e..3a5f60b387 100644 --- a/nexus/inventory/src/lib.rs +++ b/nexus/inventory/src/lib.rs @@ -19,6 +19,7 @@ mod builder; mod collector; +pub mod examples; // only exposed for test code to construct collections pub use builder::CollectionBuilder; diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs index 93e7db2697..d9ab9fe4be 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/inventory_collection.rs @@ -130,3 +130,83 @@ async fn inventory_activate( Ok(collection) } + +#[cfg(test)] +mod test { + use crate::app::background::common::BackgroundTask; + use crate::app::background::inventory_collection::InventoryCollector; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::datastore::DataStoreInventoryTest; + use nexus_test_utils_macros::nexus_test; + use omicron_test_utils::dev::poll; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + // Test that each activation creates a new collection and that we prune old + // collections, too. + #[nexus_test(server = crate::Server)] + async fn test_basic(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Nexus starts our very background task, so we should find a collection + // in the database before too long. + let mut last_collections = + poll::wait_for_condition::<_, anyhow::Error, _, _>( + || async { + let collections = datastore + .inventory_collections() + .await + .map_err(poll::CondCheckError::Failed)?; + if collections.is_empty() { + Err(poll::CondCheckError::NotYet) + } else { + Ok(collections) + } + }, + &std::time::Duration::from_millis(50), + &std::time::Duration::from_secs(15), + ) + .await + .expect("background task did not populate initial collection"); + + let resolver = internal_dns::resolver::Resolver::new_from_addrs( + cptestctx.logctx.log.clone(), + &[cptestctx.internal_dns.dns_server.local_address()], + ) + .unwrap(); + + // Now we'll create our own copy of the background task and activate it + // a bunch and make sure that it always creates a new collection and + // does not allow a backlog to accumulate. 
+ let nkeep = 3; + let mut task = + InventoryCollector::new(datastore.clone(), resolver, "me", nkeep); + let nkeep = usize::try_from(nkeep).unwrap(); + for i in 0..10 { + let _ = task.activate(&opctx).await; + let collections = datastore.inventory_collections().await.unwrap(); + println!( + "iter {}: last = {:?}, current = {:?}", + i, last_collections, collections + ); + + let expected_from_last: Vec<_> = if last_collections.len() <= nkeep + { + last_collections + } else { + last_collections.into_iter().skip(1).collect() + }; + let expected_from_current: Vec<_> = + collections.iter().rev().skip(1).rev().cloned().collect(); + assert_eq!(expected_from_last, expected_from_current); + assert_eq!(collections.len(), std::cmp::min(i + 2, nkeep + 1)); + last_collections = collections; + } + } +} From 7b212439a9ccb42c633185047210667322a22298 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 30 Oct 2023 10:51:59 -0700 Subject: [PATCH 14/20] CTE is not necessary for the INSERT with two foreign keys --- nexus/db-model/src/schema.rs | 2 +- .../db-queries/src/db/datastore/inventory.rs | 351 +++++++----------- 2 files changed, 136 insertions(+), 217 deletions(-) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 0b41733e6d..b2c957205d 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1237,7 +1237,7 @@ joinable!(ip_pool_range -> ip_pool (ip_pool_id)); allow_tables_to_appear_in_same_query!(inv_collection, inv_collection_error); joinable!(inv_collection_error -> inv_collection (inv_collection_id)); -allow_tables_to_appear_in_same_query!(sw_caboose, inv_caboose); +allow_tables_to_appear_in_same_query!(hw_baseboard_id, sw_caboose, inv_caboose); allow_tables_to_appear_in_same_query!( dataset, diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 500af8e39b..30aa946467 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -16,21 +16,17 @@ use anyhow::Context; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; -use chrono::DateTime; -use chrono::Utc; use diesel::expression::SelectableHelper; -use diesel::sql_types; use diesel::sql_types::Nullable; -use diesel::Column; +use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; use diesel::IntoSql; +use diesel::JoinOnDsl; use diesel::NullableExpressionMethods; use diesel::QueryDsl; -use diesel::QuerySource; use diesel::Table; use futures::future::BoxFuture; use futures::FutureExt; -use nexus_db_model::CabooseWhich; use nexus_db_model::CabooseWhichEnum; use nexus_db_model::HwBaseboardId; use nexus_db_model::HwPowerState; @@ -45,8 +41,6 @@ use nexus_db_model::InvServiceProcessor; use nexus_db_model::SpType; use nexus_db_model::SpTypeEnum; use nexus_db_model::SwCaboose; -use nexus_types::inventory::BaseboardId; -use nexus_types::inventory::CabooseFound; use nexus_types::inventory::Collection; use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; @@ -336,19 +330,142 @@ impl DataStore { // Insert rows for the cabooses that we found. Like service // processors and roots of trust, we do this using INSERT INTO ... - // SELECT. But because there are two foreign keys, we need a more - // complicated `SELECT`, which requires using a CTE. + // SELECT. This one's a little more complicated because there are + // two foreign keys. 
Concretely, we have these three tables: + // + // - `hw_baseboard` with an "id" primary key and lookup columns + // "part_number" and "serial_number" + // - `sw_caboose` with an "id" primary key and lookup columns + // "board", "git_commit", "name", and "version" + // - `inv_caboose` with foreign keys "hw_baseboard_id", + // "sw_caboose_id", and various other columns + // + // We want to INSERT INTO `inv_caboose` a row with: + // + // - hw_baseboard_id (foreign key) the result of looking up an + // hw_baseboard row by a specific part number and serial number + // + // - sw_caboose_id (foreign key) the result of looking up a + // specific sw_caboose row by board, git_commit, name, and version + // + // - the other columns being literals + // + // To achieve this, we're going to generate something like: + // + // INSERT INTO + // inv_caboose ( + // hw_baseboard_id, + // sw_caboose_id, + // inv_collection_id, + // time_collected, + // source, + // which, + // ) + // SELECT ( + // hw_baseboard_id.id, + // sw_caboose.id, + // ... /* literal collection id */ + // ... /* literal time collected */ + // ... /* literal source */ + // ... /* literal 'which' */ + // ) + // FROM + // hw_baseboard + // INNER JOIN + // sw_caboose + // ON hw_baseboard.part_number = ... + // AND hw_baseboard.serial_number = ... + // AND sw_caboose.board = ... + // AND sw_caboose.git_commit = ... + // AND sw_caboose.name = ... + // AND sw_caboose.version = ...; + // + // Again, the whole point is to avoid back-and-forth between the + // client and the database. Those back-and-forth interactions can + // significantly increase latency and the probability of transaction + // conflicts. See RFD 192 for details. (Unfortunately, we still + // _are_ going back and forth here to issue each of these queries. + // But that's an artifact of the interface we currently have for + // sending queries. It should be possible to send all of these in + // one batch. 
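As a rough sketch (not generated or executed anywhere in this change), the statement the Diesel expression below is intended to produce has roughly the following shape, with bind parameters standing in for the literal and lookup values. The table and column names come from the code that follows; the parameter numbering is purely illustrative:

    // Illustrative only: the shape of the INSERT ... SELECT with an inner
    // join that resolves both foreign keys in a single statement.
    const SKETCH_INV_CABOOSE_INSERT: &str = "
        INSERT INTO inv_caboose (
            hw_baseboard_id, sw_caboose_id, inv_collection_id,
            time_collected, source, which)
        SELECT hw_baseboard_id.id, sw_caboose.id, $1, $2, $3, $4
        FROM hw_baseboard_id
        INNER JOIN sw_caboose
        ON hw_baseboard_id.part_number = $5
        AND hw_baseboard_id.serial_number = $6
        AND sw_caboose.board = $7
        AND sw_caboose.git_commit = $8
        AND sw_caboose.name = $9
        AND sw_caboose.version = $10;
    ";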
for (which, tree) in &collection.cabooses_found { let db_which = nexus_db_model::CabooseWhich::from(*which); for (baseboard_id, found_caboose) in tree { - InvCabooseInsert::new( - collection_id, - baseboard_id, - found_caboose, - db_which, - ) - .execute_async(&conn) - .await?; + use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id; + use db::schema::inv_caboose::dsl as dsl_inv_caboose; + use db::schema::sw_caboose::dsl as dsl_sw_caboose; + + let selection = db::schema::hw_baseboard_id::table + .inner_join( + db::schema::sw_caboose::table.on( + dsl_baseboard_id::part_number + .eq(baseboard_id.part_number.clone()) + .and( + dsl_baseboard_id::serial_number.eq( + baseboard_id.serial_number.clone(), + ), + ) + .and(dsl_sw_caboose::board.eq( + found_caboose.caboose.board.clone(), + )) + .and( + dsl_sw_caboose::git_commit.eq( + found_caboose + .caboose + .git_commit + .clone(), + ), + ) + .and( + dsl_sw_caboose::name.eq(found_caboose + .caboose + .name + .clone()), + ) + .and(dsl_sw_caboose::version.eq( + found_caboose.caboose.version.clone(), + )), + ), + ) + .select(( + dsl_baseboard_id::id, + dsl_sw_caboose::id, + collection_id.into_sql::(), + found_caboose + .time_collected + .into_sql::(), + found_caboose + .source + .clone() + .into_sql::(), + db_which.into_sql::(), + )); + + let _ = diesel::insert_into(db::schema::inv_caboose::table) + .values(selection) + .into_columns(( + dsl_inv_caboose::hw_baseboard_id, + dsl_inv_caboose::sw_caboose_id, + dsl_inv_caboose::inv_collection_id, + dsl_inv_caboose::time_collected, + dsl_inv_caboose::source, + dsl_inv_caboose::which, + )) + .execute_async(&conn) + .await?; + + // See the comments above. The same applies here. If you + // update the statement below because the schema for + // `inv_caboose` has changed, be sure to update the code + // above, too! + let ( + _hw_baseboard_id, + _sw_caboose_id, + _inv_collection_id, + _time_collected, + _source, + _which, + ) = dsl_inv_caboose::inv_caboose::all_columns(); } } @@ -732,204 +849,6 @@ impl DataStore { /// database. Those back-and-forth interactions can significantly increase /// latency and the probability of transaction conflicts. See RFD 192 for /// details. -#[must_use = "Queries must be executed"] -struct InvCabooseInsert { - // fields used to look up baseboard id - baseboard_part_number: String, - baseboard_serial_number: String, - - // fields used to look up caboose id - caboose_board: String, - caboose_git_commit: String, - caboose_name: String, - caboose_version: String, - - // literal values for the rest of the inv_caboose columns - collection_id: Uuid, - time_collected: DateTime, - source: String, - which: CabooseWhich, - - // These are Diesel structures representing table names in the "from" or - // "into" parts of queries (e.g., "SELECT FROM tablename" or "INSERT INTO - // tablename"). We need this in `walk_ast()` below, but they must outlive - // `walk_ast()`, so they need to be created ahead of time. - // - // TODO-cleanup These Diesel-internal types are nasty. It's not clear how - // else to do this. 
- from_hw_baseboard_id: - diesel::internal::table_macro::StaticQueryFragmentInstance< - db::schema::hw_baseboard_id::table, - >, - from_sw_caboose: diesel::internal::table_macro::StaticQueryFragmentInstance< - db::schema::sw_caboose::table, - >, - into_inv_caboose: - diesel::internal::table_macro::StaticQueryFragmentInstance< - db::schema::inv_caboose::table, - >, -} - -impl InvCabooseInsert { - pub fn new( - collection_id: Uuid, - baseboard: &BaseboardId, - found_caboose: &CabooseFound, - which: CabooseWhich, - ) -> InvCabooseInsert { - InvCabooseInsert { - baseboard_part_number: baseboard.part_number.clone(), - baseboard_serial_number: baseboard.serial_number.clone(), - caboose_board: found_caboose.caboose.board.clone(), - caboose_git_commit: found_caboose.caboose.git_commit.clone(), - caboose_name: found_caboose.caboose.name.clone(), - caboose_version: found_caboose.caboose.version.clone(), - collection_id, - time_collected: found_caboose.time_collected, - source: found_caboose.source.clone(), - which, - from_hw_baseboard_id: db::schema::hw_baseboard_id::table - .from_clause(), - from_sw_caboose: db::schema::sw_caboose::table.from_clause(), - // It sounds a little goofy to use "from_clause()" when this is - // really part of an INSERT. But really this just produces the - // table name as an identifier. This is the same for both "FROM" - // and "INSERT" clauses. And diesel internally does the same thing - // here (see the type of `InsertStatement::into_clause`). - into_inv_caboose: db::schema::inv_caboose::table.from_clause(), - } - } -} - -impl diesel::query_builder::QueryFragment for InvCabooseInsert { - fn walk_ast<'b>( - &'b self, - mut pass: diesel::query_builder::AstPass<'_, 'b, diesel::pg::Pg>, - ) -> diesel::QueryResult<()> { - use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id; - use db::schema::inv_caboose::dsl as dsl_inv_caboose; - use db::schema::sw_caboose::dsl as dsl_sw_caboose; - - pass.unsafe_to_cache_prepared(); - pass.push_sql("WITH my_new_row AS ("); - - pass.push_sql("SELECT "); - - // Emit the values that we're going to insert into `inv_caboose`. - // First, emit the looked-up foreign keys. - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_baseboard_id::id::NAME)?; - pass.push_sql(", "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::id::NAME)?; - pass.push_sql(", "); - // Next, emit the literal values used for the rest of the columns. - pass.push_bind_param::(&self.collection_id)?; - pass.push_sql(", "); - pass.push_bind_param::( - &self.time_collected, - )?; - pass.push_sql(", "); - pass.push_bind_param::(&self.source)?; - pass.push_sql(", "); - pass.push_bind_param::(&self.which)?; - - // Finish the SELECT by adding the list of tables and the WHERE to pick - // out only the relevant row from each tables. 
- pass.push_sql(" FROM "); - - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql(", "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - - pass.push_sql(" WHERE "); - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_baseboard_id::part_number::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::( - &self.baseboard_part_number, - )?; - pass.push_sql(" AND "); - self.from_hw_baseboard_id.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_baseboard_id::serial_number::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::( - &self.baseboard_serial_number, - )?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::board::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_board)?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::git_commit::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_git_commit)?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::name::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_name)?; - pass.push_sql(" AND "); - self.from_sw_caboose.walk_ast(pass.reborrow())?; - pass.push_sql("."); - pass.push_identifier(dsl_sw_caboose::version::NAME)?; - pass.push_sql(" = "); - pass.push_bind_param::(&self.caboose_version)?; - - pass.push_sql(")\n"); // end of the SELECT query within the WITH - - pass.push_sql("INSERT INTO "); - self.into_inv_caboose.walk_ast(pass.reborrow())?; - - pass.push_sql("("); - pass.push_identifier(dsl_inv_caboose::hw_baseboard_id::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::sw_caboose_id::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::inv_collection_id::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::time_collected::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::source::NAME)?; - pass.push_sql(", "); - pass.push_identifier(dsl_inv_caboose::which::NAME)?; - pass.push_sql(")\n"); - pass.push_sql("SELECT * FROM my_new_row"); - - // See the comment in inventory_insert_collection() where we use - // `inv_service_processor::all_columns()`. The same applies here. - // If you update the statement below because the schema for - // `inv_caboose` has changed, be sure to update the code above, too! - let ( - _hw_baseboard_id, - _sw_caboose_id, - _inv_collection_id, - _time_collected, - _source, - _which, - ) = dsl_inv_caboose::inv_caboose::all_columns(); - - Ok(()) - } -} - -// This is required to be able to call `inv_caboose_insert.execute_async()`. -impl diesel::RunQueryDsl for InvCabooseInsert {} - -// This is required to be able to call `inv_caboose_insert.execute_async()`. 
-impl diesel::query_builder::QueryId for InvCabooseInsert { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} /// Extra interfaces that are not intended (and potentially unsafe) for use in /// Nexus, but useful for testing and `omdb` From eac331960797fdc29da0a345d8dd27b1fbea40b9 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 30 Oct 2023 11:07:07 -0700 Subject: [PATCH 15/20] add chicken switch --- common/src/nexus_config.rs | 9 ++++++ nexus/examples/config.toml | 2 ++ nexus/src/app/background/init.rs | 1 + .../app/background/inventory_collection.rs | 32 +++++++++++++++++-- nexus/tests/config.test.toml | 2 ++ smf/nexus/multi-sled/config-partial.toml | 2 ++ smf/nexus/single-sled/config-partial.toml | 2 ++ 7 files changed, 48 insertions(+), 2 deletions(-) diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 44de433603..9be58d3222 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -363,6 +363,12 @@ pub struct InventoryConfig { /// This is a very coarse mechanism to keep the system from overwhelming /// itself with inventory data. pub nkeep: u32, + + /// disable inventory collection altogether + /// + /// This is an emergency lever for support / operations. It should never be + /// necessary. + pub disable: bool, } /// Configuration for a nexus server @@ -615,6 +621,7 @@ mod test { external_endpoints.period_secs = 9 inventory.period_secs = 10 inventory.nkeep = 11 + inventory.disable = false [default_region_allocation_strategy] type = "random" seed = 0 @@ -703,6 +710,7 @@ mod test { inventory: InventoryConfig { period_secs: Duration::from_secs(10), nkeep: 11, + disable: false, } }, default_region_allocation_strategy: @@ -758,6 +766,7 @@ mod test { external_endpoints.period_secs = 9 inventory.period_secs = 10 inventory.nkeep = 3 + inventory.disable = false [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index c7345156a7..efc9aa9c27 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -97,6 +97,8 @@ external_endpoints.period_secs = 60 inventory.period_secs = 600 # Maximum number of past collections to keep in the database inventory.nkeep = 5 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 1c178175fe..bdcfedd065 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -103,6 +103,7 @@ impl BackgroundTasks { resolver, &nexus_id.to_string(), config.inventory.nkeep, + config.inventory.disable, ); let task = driver.register( String::from("inventory_collection"), diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs index d9ab9fe4be..96a0941524 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/inventory_collection.rs @@ -5,6 +5,7 @@ //! 
Background task for reading inventory for the rack use super::common::BackgroundTask; +use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; @@ -21,6 +22,7 @@ pub struct InventoryCollector { resolver: internal_dns::resolver::Resolver, creator: String, nkeep: u32, + disable: bool, } impl InventoryCollector { @@ -29,12 +31,14 @@ impl InventoryCollector { resolver: internal_dns::resolver::Resolver, creator: &str, nkeep: u32, + disable: bool, ) -> InventoryCollector { InventoryCollector { datastore, resolver, creator: creator.to_owned(), nkeep, + disable, } } } @@ -55,6 +59,7 @@ impl BackgroundTask for InventoryCollector { &self.resolver, &self.creator, self.nkeep, + self.disable, ) .await .context("failed to collect inventory") @@ -88,7 +93,12 @@ async fn inventory_activate( resolver: &internal_dns::resolver::Resolver, creator: &str, nkeep: u32, + disabled: bool, ) -> Result { + // If we're disabled, don't do anything. (This switch is only intended for + // unforeseen production emergencies.) + ensure!(!disabled, "disabled by explicit configuration"); + // Prune old collections. We do this first, here, to ensure that we never // develop an unbounded backlog of collections. (If this process were done // by a separate task, it would be possible for the backlog to grow @@ -185,8 +195,13 @@ mod test { // a bunch and make sure that it always creates a new collection and // does not allow a backlog to accumulate. let nkeep = 3; - let mut task = - InventoryCollector::new(datastore.clone(), resolver, "me", nkeep); + let mut task = InventoryCollector::new( + datastore.clone(), + resolver.clone(), + "me", + nkeep, + false, + ); let nkeep = usize::try_from(nkeep).unwrap(); for i in 0..10 { let _ = task.activate(&opctx).await; @@ -208,5 +223,18 @@ mod test { assert_eq!(collections.len(), std::cmp::min(i + 2, nkeep + 1)); last_collections = collections; } + + // Create a disabled task and make sure that does nothing. 
+ let mut task = InventoryCollector::new( + datastore.clone(), + resolver, + "disabled", + 3, + true, + ); + let previous = datastore.inventory_collections().await.unwrap(); + let _ = task.activate(&opctx).await; + let latest = datastore.inventory_collections().await.unwrap(); + assert_eq!(previous, latest); } } diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 3e50a1ef18..3629ae9cb2 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -94,6 +94,8 @@ external_endpoints.period_secs = 60 inventory.period_secs = 600 # Maximum number of past collections to keep in the database inventory.nkeep = 3 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index c9b2f3fdc2..cae1f650c9 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -43,6 +43,8 @@ external_endpoints.period_secs = 60 inventory.period_secs = 600 # Maximum number of past collections to keep in the database inventory.nkeep = 3 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 65bd020e0b..be8683be54 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -43,6 +43,8 @@ external_endpoints.period_secs = 60 inventory.period_secs = 600 # Maximum number of past collections to keep in the database inventory.nkeep = 3 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. 
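[Editor's note: the `[inventory]` stanza added in the patch above is plain TOML, so the new
`disable` chicken switch is just another field picked up by config deserialization. The sketch
below is a standalone illustration of how such a stanza could be parsed, not the actual
`nexus_config` code: the real `InventoryConfig` lives in common/src/nexus_config.rs and parses
`period_secs` into a `Duration`, and the `PartialConfig` wrapper here is a made-up name for the
example only.]

    // Minimal sketch, assuming only the `serde` (with derive) and `toml` crates.
    use serde::Deserialize;

    #[derive(Debug, Deserialize)]
    struct InventoryConfig {
        // Simplified: the real config deserializes this into a Duration.
        period_secs: u64,
        // Maximum number of past collections to keep in the database.
        nkeep: u32,
        // The new chicken switch: disable inventory collection altogether.
        disable: bool,
    }

    // Hypothetical wrapper so the example can parse a bare `[inventory]` table.
    #[derive(Debug, Deserialize)]
    struct PartialConfig {
        inventory: InventoryConfig,
    }

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let text = r#"
            [inventory]
            period_secs = 600
            nkeep = 3
            disable = false
        "#;
        let config: PartialConfig = toml::from_str(text)?;
        // A collector task would bail out early when the switch is thrown,
        // which is what the `ensure!(!disabled, ...)` in the patch above does.
        assert!(!config.inventory.disable, "inventory collection is disabled");
        println!("parsed: {:?}", config.inventory);
        Ok(())
    }

[Editor's note: flipping `disable = true` in any of the config files touched above is the lever
the background task checks before contacting MGS or writing a new collection; everything else
about the task's schedule (`period_secs`) and retention (`nkeep`) is unchanged.]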
From 5b7152ccabc7938f9276622b3dc05993150b1b8b Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 30 Oct 2023 14:03:35 -0700 Subject: [PATCH 16/20] update database schema for the upgrade case too --- nexus/db-model/src/schema.rs | 2 +- schema/crdb/9.0.0/up01.sql | 5 +++++ schema/crdb/9.0.0/up02.sql | 2 ++ schema/crdb/9.0.0/up03.sql | 5 +++++ schema/crdb/9.0.0/up04.sql | 4 ++++ schema/crdb/9.0.0/up05.sql | 9 +++++++++ schema/crdb/9.0.0/up06.sql | 2 ++ schema/crdb/9.0.0/up07.sql | 6 ++++++ schema/crdb/9.0.0/up08.sql | 2 ++ schema/crdb/9.0.0/up09.sql | 5 +++++ schema/crdb/9.0.0/up10.sql | 2 ++ schema/crdb/9.0.0/up11.sql | 5 +++++ schema/crdb/9.0.0/up12.sql | 15 +++++++++++++++ schema/crdb/9.0.0/up13.sql | 15 +++++++++++++++ schema/crdb/9.0.0/up14.sql | 6 ++++++ schema/crdb/9.0.0/up15.sql | 11 +++++++++++ schema/crdb/dbinit.sql | 19 ++++++++++--------- 17 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 schema/crdb/9.0.0/up01.sql create mode 100644 schema/crdb/9.0.0/up02.sql create mode 100644 schema/crdb/9.0.0/up03.sql create mode 100644 schema/crdb/9.0.0/up04.sql create mode 100644 schema/crdb/9.0.0/up05.sql create mode 100644 schema/crdb/9.0.0/up06.sql create mode 100644 schema/crdb/9.0.0/up07.sql create mode 100644 schema/crdb/9.0.0/up08.sql create mode 100644 schema/crdb/9.0.0/up09.sql create mode 100644 schema/crdb/9.0.0/up10.sql create mode 100644 schema/crdb/9.0.0/up11.sql create mode 100644 schema/crdb/9.0.0/up12.sql create mode 100644 schema/crdb/9.0.0/up13.sql create mode 100644 schema/crdb/9.0.0/up14.sql create mode 100644 schema/crdb/9.0.0/up15.sql diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index cefdd9f006..cff261e01a 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1243,7 +1243,7 @@ table! { /// /// This should be updated whenever the schema is changed. 
For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(8, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(9, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/schema/crdb/9.0.0/up01.sql b/schema/crdb/9.0.0/up01.sql new file mode 100644 index 0000000000..88439c433b --- /dev/null +++ b/schema/crdb/9.0.0/up01.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id ( + id UUID PRIMARY KEY, + part_number TEXT NOT NULL, + serial_number TEXT NOT NULL +); diff --git a/schema/crdb/9.0.0/up02.sql b/schema/crdb/9.0.0/up02.sql new file mode 100644 index 0000000000..d98f896fb0 --- /dev/null +++ b/schema/crdb/9.0.0/up02.sql @@ -0,0 +1,2 @@ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props + ON omicron.public.hw_baseboard_id (part_number, serial_number); diff --git a/schema/crdb/9.0.0/up03.sql b/schema/crdb/9.0.0/up03.sql new file mode 100644 index 0000000000..3bd036be7e --- /dev/null +++ b/schema/crdb/9.0.0/up03.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM ( + 'A0', + 'A1', + 'A2' +); diff --git a/schema/crdb/9.0.0/up04.sql b/schema/crdb/9.0.0/up04.sql new file mode 100644 index 0000000000..1590ec4e88 --- /dev/null +++ b/schema/crdb/9.0.0/up04.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.hw_rot_slot AS ENUM ( + 'A', + 'B' +); diff --git a/schema/crdb/9.0.0/up05.sql b/schema/crdb/9.0.0/up05.sql new file mode 100644 index 0000000000..1042282fb0 --- /dev/null +++ b/schema/crdb/9.0.0/up05.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS omicron.public.sw_caboose ( + id UUID PRIMARY KEY, + board TEXT NOT NULL, + git_commit TEXT NOT NULL, + name TEXT NOT NULL, + -- The MGS response that provides this field indicates that it can be NULL. + -- But that's only to support old software that we no longer support. 
+ version TEXT NOT NULL +); diff --git a/schema/crdb/9.0.0/up06.sql b/schema/crdb/9.0.0/up06.sql new file mode 100644 index 0000000000..aa614fa2fb --- /dev/null +++ b/schema/crdb/9.0.0/up06.sql @@ -0,0 +1,2 @@ +CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties + on omicron.public.sw_caboose (board, git_commit, name, version); diff --git a/schema/crdb/9.0.0/up07.sql b/schema/crdb/9.0.0/up07.sql new file mode 100644 index 0000000000..945f5a44c8 --- /dev/null +++ b/schema/crdb/9.0.0/up07.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS inv_collection ( + id UUID PRIMARY KEY, + time_started TIMESTAMPTZ NOT NULL, + time_done TIMESTAMPTZ NOT NULL, + collector TEXT NOT NULL +); diff --git a/schema/crdb/9.0.0/up08.sql b/schema/crdb/9.0.0/up08.sql new file mode 100644 index 0000000000..1abeb9203f --- /dev/null +++ b/schema/crdb/9.0.0/up08.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS inv_collection_by_time_started + ON omicron.public.inv_collection (time_started); diff --git a/schema/crdb/9.0.0/up09.sql b/schema/crdb/9.0.0/up09.sql new file mode 100644 index 0000000000..770c771775 --- /dev/null +++ b/schema/crdb/9.0.0/up09.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection_error ( + inv_collection_id UUID NOT NULL, + idx INT4 NOT NULL, + message TEXT +); diff --git a/schema/crdb/9.0.0/up10.sql b/schema/crdb/9.0.0/up10.sql new file mode 100644 index 0000000000..57665ee468 --- /dev/null +++ b/schema/crdb/9.0.0/up10.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS errors_by_collection + ON omicron.public.inv_collection_error (inv_collection_id, idx); diff --git a/schema/crdb/9.0.0/up11.sql b/schema/crdb/9.0.0/up11.sql new file mode 100644 index 0000000000..40da69af5b --- /dev/null +++ b/schema/crdb/9.0.0/up11.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.sp_type AS ENUM ( + 'sled', + 'switch', + 'power' +); diff --git a/schema/crdb/9.0.0/up12.sql b/schema/crdb/9.0.0/up12.sql new file mode 100644 index 0000000000..9089ac93ba --- /dev/null +++ b/schema/crdb/9.0.0/up12.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + baseboard_revision INT8 NOT NULL, + hubris_archive_id TEXT NOT NULL, + power_state omicron.public.hw_power_state NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); diff --git a/schema/crdb/9.0.0/up13.sql b/schema/crdb/9.0.0/up13.sql new file mode 100644 index 0000000000..241c5d9e80 --- /dev/null +++ b/schema/crdb/9.0.0/up13.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + + slot_active omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_transient omicron.public.hw_rot_slot, + slot_boot_pref_persistent omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_persistent_pending omicron.public.hw_rot_slot, + slot_a_sha3_256 TEXT, + slot_b_sha3_256 TEXT, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); diff --git a/schema/crdb/9.0.0/up14.sql b/schema/crdb/9.0.0/up14.sql new file mode 100644 index 0000000000..6725d35acf --- /dev/null +++ b/schema/crdb/9.0.0/up14.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( + 'sp_slot_0', + 'sp_slot_1', + 'rot_slot_A', + 'rot_slot_B' +); diff --git 
a/schema/crdb/9.0.0/up15.sql b/schema/crdb/9.0.0/up15.sql new file mode 100644 index 0000000000..48a68d167a --- /dev/null +++ b/schema/crdb/9.0.0/up15.sql @@ -0,0 +1,11 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + + which omicron.public.caboose_which NOT NULL, + sw_caboose_id UUID NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 307a2888b7..64da76adbb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2514,14 +2514,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.bootstore_keys ( generation INT8 NOT NULL ); -/* - * The `sled_instance` view's definition needs to be modified in a separate - * transaction from the transaction that created it. - */ - -COMMIT; -BEGIN; - /* * Hardware/software inventory * @@ -2736,6 +2728,15 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) ); +/*******************************************************************/ + +/* + * The `sled_instance` view's definition needs to be modified in a separate + * transaction from the transaction that created it. + */ + +COMMIT; +BEGIN; /*******************************************************************/ @@ -2837,7 +2838,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '8.0.0', NULL) + ( TRUE, NOW(), NOW(), '9.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From f0aa1529489d309f6c56c72323eb94974a6f5255 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 30 Oct 2023 14:08:49 -0700 Subject: [PATCH 17/20] fix hakari --- Cargo.lock | 1 + nexus/inventory/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index fba96d19e7..eb56e84cff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4466,6 +4466,7 @@ dependencies = [ "gateway-messages", "gateway-test-utils", "nexus-types", + "omicron-workspace-hack", "slog", "strum", "tokio", diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml index 7fc3602596..3ed7e8b2db 100644 --- a/nexus/inventory/Cargo.toml +++ b/nexus/inventory/Cargo.toml @@ -14,6 +14,7 @@ nexus-types.workspace = true slog.workspace = true strum.workspace = true uuid.workspace = true +omicron-workspace-hack.workspace = true [dev-dependencies] expectorate.workspace = true From e2b25e3b2124d9639fcc2f7d88eff73eb7f6cee7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 30 Oct 2023 15:38:54 -0700 Subject: [PATCH 18/20] fix test on ubuntu --- Cargo.lock | 1 + nexus/inventory/Cargo.toml | 1 + nexus/inventory/src/collector.rs | 8 +++++++- nexus/inventory/tests/output/collector_errors.txt | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eb56e84cff..2fb7d9dd44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4467,6 +4467,7 @@ dependencies = [ "gateway-test-utils", "nexus-types", "omicron-workspace-hack", + "regex", "slog", "strum", "tokio", diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml index 3ed7e8b2db..965ff3f02a 100644 --- a/nexus/inventory/Cargo.toml +++ b/nexus/inventory/Cargo.toml @@ -19,4 +19,5 @@ omicron-workspace-hack.workspace = true [dev-dependencies] expectorate.workspace = true gateway-test-utils.workspace = true +regex.workspace = true tokio.workspace = true diff --git a/nexus/inventory/src/collector.rs 
b/nexus/inventory/src/collector.rs index 82e513ec30..d40b09d2be 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -274,7 +274,13 @@ mod test { write!(&mut s, "\nerrors:\n").unwrap(); for e in &collection.errors { - write!(&mut s, "error: {}\n", e).unwrap(); + // Some error strings have OS error numbers in them. We want to + // ignore those, particularly for CI, which runs these tests on + // multiple OSes. + let message = regex::Regex::new(r"os error \d+") + .unwrap() + .replace_all(&e, "os error <>"); + write!(&mut s, "error: {}\n", message).unwrap(); } s diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt index 08806d0fe8..f231cc7d97 100644 --- a/nexus/inventory/tests/output/collector_errors.txt +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -41,4 +41,4 @@ cabooses found: RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" errors: -error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error 128): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error 128): error trying to connect: tcp connect error: Network is unreachable (os error 128): tcp connect error: Network is unreachable (os error 128): Network is unreachable (os error 128) +error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error trying to connect: tcp connect error: Network is unreachable (os error <>): tcp connect error: Network is unreachable (os error <>): Network is unreachable (os error <>) From 79723b708ca1acb66becb6d64f38aa53076028c1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 31 Oct 2023 10:19:14 -0700 Subject: [PATCH 19/20] fix deploy case? 
--- schema/crdb/dbinit.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 64da76adbb..da842cbfeb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2624,7 +2624,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties /* Inventory Collections */ -- list of all collections -CREATE TABLE IF NOT EXISTS inv_collection ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection ( id UUID PRIMARY KEY, time_started TIMESTAMPTZ NOT NULL, time_done TIMESTAMPTZ NOT NULL, From a55216d4048b745fca49ccd5213df3dc36622191 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 1 Nov 2023 13:41:13 -0700 Subject: [PATCH 20/20] review feedback --- nexus/db-model/src/inventory.rs | 20 +++---- .../db-queries/src/db/datastore/inventory.rs | 56 +------------------ nexus/src/app/background/init.rs | 4 +- .../app/background/inventory_collection.rs | 7 ++- 4 files changed, 20 insertions(+), 67 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index fd8c0952b5..5b09f289bb 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -208,12 +208,12 @@ pub struct HwBaseboardId { pub serial_number: String, } -impl<'a> From<&'a BaseboardId> for HwBaseboardId { - fn from(c: &'a BaseboardId) -> Self { +impl From for HwBaseboardId { + fn from(c: BaseboardId) -> Self { HwBaseboardId { id: Uuid::new_v4(), - part_number: c.part_number.clone(), - serial_number: c.serial_number.clone(), + part_number: c.part_number, + serial_number: c.serial_number, } } } @@ -248,14 +248,14 @@ pub struct SwCaboose { pub version: String, } -impl<'a> From<&'a Caboose> for SwCaboose { - fn from(c: &'a Caboose) -> Self { +impl From for SwCaboose { + fn from(c: Caboose) -> Self { SwCaboose { id: Uuid::new_v4(), - board: c.board.clone(), - git_commit: c.git_commit.clone(), - name: c.name.clone(), - version: c.version.clone(), + board: c.board, + git_commit: c.git_commit, + name: c.name, + version: c.version, } } } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 30aa946467..6b7d97754a 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -70,12 +70,12 @@ impl DataStore { let baseboards = collection .baseboards .iter() - .map(|b| HwBaseboardId::from(b.as_ref())) + .map(|b| HwBaseboardId::from((**b).clone())) .collect::>(); let cabooses = collection .cabooses .iter() - .map(|s| SwCaboose::from(s.as_ref())) + .map(|s| SwCaboose::from((**s).clone())) .collect::>(); let error_values = collection .errors @@ -85,7 +85,7 @@ impl DataStore { let index = u16::try_from(i).map_err(|e| { Error::internal_error(&format!( "failed to convert error index to u16 (too \ - many errors in inventory collection?): {}", + many errors in inventory collection?): {}", e )) })?; @@ -800,56 +800,6 @@ impl DataStore { } } -/// A SQL common table expression (CTE) used to insert into `inv_caboose` -/// -/// Concretely, we have these three tables: -/// -/// - `hw_baseboard` with an "id" primary key and lookup columns "part_number" -/// and "serial_number" -/// - `sw_caboose` with an "id" primary key and lookup columns "board", -/// "git_commit", "name", and "version" -/// - `inv_caboose` with foreign keys "hw_baseboard_id", "sw_caboose_id", and -/// various other columns -/// -/// We want to INSERT INTO `inv_caboose` a row with: -/// -/// - hw_baseboard_id (foreign key) the result of looking up an 
hw_baseboard row -/// by part number and serial number provided by the caller -/// -/// - sw_caboose_id (foreign key) the result of looking up a sw_caboose row by -/// board, git_commit, name, and version provided by the caller -/// -/// - the other columns being literals provided by the caller -/// -/// To achieve this, we're going to generate something like: -/// -/// WITH -/// my_new_row -/// AS ( -/// SELECT -/// hw_baseboard.id, /* `hw_baseboard` foreign key */ -/// sw_caboose.id, /* `sw_caboose` foreign key */ -/// ... /* caller-provided literal values for the rest */ -/// /* of the new inv_caboose row */ -/// FROM -/// hw_baseboard, -/// sw_caboose -/// WHERE -/// hw_baseboard.part_number = ... /* caller-provided part number */ -/// hw_baseboard.serial_number = ... /* caller-provided serial number */ -/// sw_caboose.board = ... /* caller-provided board */ -/// sw_caboose.git_commit = ... /* caller-provided git_commit */ -/// sw_caboose.name = ... /* caller-provided name */ -/// sw_caboose.version = ... /* caller-provided version */ -/// ) INSERT INTO -/// inv_caboose (... /* inv_caboose columns */) -/// SELECT * from my_new_row; -/// -/// The whole point is to avoid back-and-forth between the client and the -/// database. Those back-and-forth interactions can significantly increase -/// latency and the probability of transaction conflicts. See RFD 192 for -/// details. - /// Extra interfaces that are not intended (and potentially unsafe) for use in /// Nexus, but useful for testing and `omdb` pub trait DataStoreInventoryTest: Send + Sync { diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index bdcfedd065..b000dd9bda 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -98,7 +98,7 @@ impl BackgroundTasks { // Background task: inventory collector let task_inventory_collection = { - let watcher = inventory_collection::InventoryCollector::new( + let collector = inventory_collection::InventoryCollector::new( datastore, resolver, &nexus_id.to_string(), @@ -112,7 +112,7 @@ impl BackgroundTasks { whole system", ), config.inventory.period_secs, - Box::new(watcher), + Box::new(collector), opctx.child(BTreeMap::new()), vec![], ); diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs index 96a0941524..f095b094db 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/inventory_collection.rs @@ -164,8 +164,11 @@ mod test { datastore.clone(), ); - // Nexus starts our very background task, so we should find a collection - // in the database before too long. + // Nexus starts the very background task that we're also testing + // manually here. As a result, we should find a collection in the + // database before too long. Wait for it so that after it appears, we + // can assume the rest of the collections came from the instance that + // we're testing. let mut last_collections = poll::wait_for_condition::<_, anyhow::Error, _, _>( || async {