From 8b5c6e5c59f85c4b667845ed483fc05654232d22 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 14:08:08 -0700 Subject: [PATCH 01/14] chore(deps): update rust crate regex to 1.10.2 (#4388) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 26 +++++++++++++++++++------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 12 ++++++------ 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a3f38cf7b0..a371eb9a91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5439,8 +5439,8 @@ dependencies = [ "rand 0.8.5", "rand_chacha 0.3.1", "regex", - "regex-automata 0.3.8", - "regex-syntax 0.7.5", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", "reqwest", "ring 0.16.20", "rustix 0.38.9", @@ -7034,14 +7034,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.5" +version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.8", - "regex-syntax 0.7.5", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", ] [[package]] @@ -7055,10 +7055,16 @@ name = "regex-automata" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", ] [[package]] @@ -7073,6 +7079,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "regress" version = "0.7.1" diff --git a/Cargo.toml b/Cargo.toml index edf10917d2..c1c55eba48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -294,7 +294,7 @@ ratatui = "0.23.0" rayon = "1.8" rcgen = "0.10.0" ref-cast = "1.0" -regex = "1.9.5" +regex = "1.10.2" regress = "0.7.1" reqwest = { version = "0.11", default-features = false } ring = "0.16" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 6b40b98db6..f433fe881a 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -71,9 +71,9 @@ ppv-lite86 = { version = "0.2.17", default-features = false, features = ["simd", predicates = { version = "3.0.4" } rand = { version = "0.8.5", features = ["min_const_gen", "small_rng"] } rand_chacha = { version = "0.3.1" } -regex = { version = "1.9.5" } -regex-automata = { version = "0.3.8", default-features = false, features = ["dfa-onepass", "dfa-search", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax = { version = "0.7.5" } +regex = { version = "1.10.2" } +regex-automata = { version = "0.4.3", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } +regex-syntax = { version = "0.8.2" } reqwest = { version = "0.11.20", 
features = ["blocking", "json", "rustls-tls", "stream"] } ring = { version = "0.16.20", features = ["std"] } schemars = { version = "0.8.13", features = ["bytes", "chrono", "uuid1"] } @@ -163,9 +163,9 @@ ppv-lite86 = { version = "0.2.17", default-features = false, features = ["simd", predicates = { version = "3.0.4" } rand = { version = "0.8.5", features = ["min_const_gen", "small_rng"] } rand_chacha = { version = "0.3.1" } -regex = { version = "1.9.5" } -regex-automata = { version = "0.3.8", default-features = false, features = ["dfa-onepass", "dfa-search", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax = { version = "0.7.5" } +regex = { version = "1.10.2" } +regex-automata = { version = "0.4.3", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } +regex-syntax = { version = "0.8.2" } reqwest = { version = "0.11.20", features = ["blocking", "json", "rustls-tls", "stream"] } ring = { version = "0.16.20", features = ["std"] } schemars = { version = "0.8.13", features = ["bytes", "chrono", "uuid1"] } From 9771567a7aae7fb11d7240d979c2c8336dc3d25b Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 14:08:40 -0700 Subject: [PATCH 02/14] chore(deps): update rust crate rcgen to 0.11.3 (#4387) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 24 +++++++++++++++++------- Cargo.toml | 2 +- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a371eb9a91..e8e16d0fba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4409,7 +4409,7 @@ dependencies = [ "oso", "oximeter 0.1.0", "paste", - "pem", + "pem 1.1.1", "petgraph", "pq-sys", "rand 0.8.5", @@ -5096,7 +5096,7 @@ dependencies = [ "oximeter-producer 0.1.0", "parse-display", "paste", - "pem", + "pem 1.1.1", "petgraph", "pq-sys", "pretty_assertions", @@ -5357,7 +5357,7 @@ dependencies = [ "libc", "omicron-common 0.1.0", "omicron-workspace-hack", - "pem", + "pem 1.1.1", "rcgen", "regex", "reqwest", @@ -6106,6 +6106,16 @@ dependencies = [ "base64 0.13.1", ] +[[package]] +name = "pem" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3163d2912b7c3b52d651a055f2c7eec9ba5cd22d26ef75b8dd3a59980b185923" +dependencies = [ + "base64 0.21.5", + "serde", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -6944,11 +6954,11 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.10.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b" +checksum = "52c4f3084aa3bc7dfbba4eff4fab2a54db4324965d8872ab933565e6fbd83bc6" dependencies = [ - "pem", + "pem 3.0.2", "ring 0.16.20", "time", "yasna", @@ -9210,7 +9220,7 @@ dependencies = [ "log", "olpc-cjson", "path-absolutize", - "pem", + "pem 1.1.1", "percent-encoding", "reqwest", "ring 0.16.20", diff --git a/Cargo.toml b/Cargo.toml index c1c55eba48..e56e558c6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -292,7 +292,7 @@ quote = "1.0" rand = "0.8.5" ratatui = "0.23.0" rayon = "1.8" -rcgen = "0.10.0" +rcgen = "0.11.3" ref-cast = "1.0" regex = "1.10.2" regress = "0.7.1" From 73c3571fe591af14bbe9c341f6ed4a4169446f91 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 04:17:13 +0000 
Subject: [PATCH 03/14] chore(deps): update taiki-e/install-action digest to e0367a2 (#4395) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`f860c89` -> `e0367a2`](https://togithub.com/taiki-e/install-action/compare/f860c89...e0367a2) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 4dc6578b7f..b717f425de 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -22,7 +22,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@f860c89ccbfa08ae7bc92502c27ab631f48b8f9d # v2 + uses: taiki-e/install-action@e0367a2d3f3d9fc43c3f25fe58692570375e19f0 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 1cf6a0f49d8a10ca15d6e0b1529d1c1606ad5398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Wed, 1 Nov 2023 10:04:49 +1300 Subject: [PATCH 04/14] [oximeter] Port all single-node ClickHouse tests to a replicated setup (#4372) This is a follow-up to https://github.com/oxidecomputer/omicron/pull/4149 . In this commit, all single-node tests are now run against a clustered setup. Additionally, a few bugs in the replicated init SQL file have been fixed. Note: All of the individual field and measurement tests now run under a single test. The reason for this is test times. Each test needs to wipe out its database before the next test runs. In a replicated installation, this means that all nodes must sync, which takes time. Before I merged all the field/measurement tests into a single one, the testing time for a replicated setup was about 10 minutes, which is really too much. Now it takes ~3.5 minutes. That is still a lot, but it is only a temporary measure until we can run these tests in parallel again. An unintended bonus of this approach is that running the tests this way also verifies that the data is being inserted into the correct table.
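(Not part of the original patch; a minimal illustrative sketch.) Because replicas converge asynchronously, the replicated tests have to wait for nodes to sync before they can assert or wipe, so the new cluster test polls before checking results. The helper below shows that retry pattern in isolation; the name `wait_for_sync`, the probe closure, and the retry count are hypothetical and not taken from this change:

```rust
use std::time::Duration;

/// Poll an async probe until it reports success, sleeping between attempts.
/// Returns `true` if the probe succeeded within `tries` attempts.
async fn wait_for_sync<F, Fut>(mut probe: F, tries: usize) -> bool
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = bool>,
{
    for _ in 0..tries {
        if probe().await {
            return true;
        }
        // Give the replicas a moment to catch up before probing again.
        tokio::time::sleep(Duration::from_secs(1)).await;
    }
    false
}
```

In `data_is_replicated_test` below, the same idea appears inline: the test issues `SHOW DATABASES` against the second replica up to five times, sleeping one second between attempts, before asserting that the `oximeter` database has appeared.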
Related: https://github.com/oxidecomputer/omicron/issues/4148 Related: https://github.com/oxidecomputer/omicron/issues/4001 --- oximeter/db/src/client.rs | 1101 +++++++++++++++--------- oximeter/db/src/db-replicated-init.sql | 90 ++ oximeter/db/src/db-wipe-replicated.sql | 2 +- test-utils/src/dev/clickhouse.rs | 40 +- 4 files changed, 793 insertions(+), 440 deletions(-) diff --git a/oximeter/db/src/client.rs b/oximeter/db/src/client.rs index 92b9ed96bd..69e91f888a 100644 --- a/oximeter/db/src/client.rs +++ b/oximeter/db/src/client.rs @@ -838,12 +838,46 @@ mod tests { use oximeter::FieldValue; use oximeter::Metric; use oximeter::Target; - use std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::time::Duration; use tokio::time::sleep; use uuid::Uuid; + pub enum InstallationType { + Cluster, + SingleNode, + } + + impl InstallationType { + pub async fn init_db(&self, client: &Client) -> Result<(), Error> { + match *self { + InstallationType::SingleNode => client + .init_single_node_db() + .await + .expect("Failed to initialize timeseries database"), + InstallationType::Cluster => client + .init_replicated_db() + .await + .expect("Failed to initialize timeseries database"), + } + Ok(()) + } + + pub async fn wipe_db(&self, client: &Client) -> Result<(), Error> { + match *self { + InstallationType::SingleNode => client + .wipe_single_node_db() + .await + .expect("Failed to remove timeseries database"), + InstallationType::Cluster => client + .wipe_replicated_db() + .await + .expect("Failed to remove timeseries database"), + } + Ok(()) + } + } + // NOTE: Each test requires a clean slate. Because of this, tests run sequentially. // // This is at least partially because ClickHouse by default provides pretty weak consistency @@ -854,198 +888,524 @@ mod tests { #[tokio::test] async fn test_single_node() { - let logctx = test_setup_log("test_single_node"); - let log = &logctx.log; - // Let the OS assign a port and discover it after ClickHouse starts let mut db = ClickHouseInstance::new_single_node(0) .await .expect("Failed to start ClickHouse"); - let address = SocketAddr::new("::1".parse().unwrap(), db.port()); - // Test bad database connection - let client = Client::new("127.0.0.1:443".parse().unwrap(), &log); - assert!(matches!( - client.ping().await, - Err(Error::DatabaseUnavailable(_)) - )); + // Tests that the expected error is returned on a wrong address + bad_db_connection_test().await.unwrap(); // Tests that a new client has started and it is not part of a cluster - is_not_oximeter_cluster_test(address).await.unwrap(); + is_oximeter_cluster_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests that data can be inserted via the client - insert_samples_test(address).await.unwrap(); + insert_samples_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests for a schema mismatch - schema_mismatch_test(address).await.unwrap(); + schema_mismatch_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests for a schema update - schema_updated_test(address).await.unwrap(); + schema_updated_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests for specific timeseries selection - client_select_timeseries_one_test(address).await.unwrap(); + client_select_timeseries_one_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests for specific timeseries selection - field_record_count_test(address).await.unwrap(); + field_record_count_test(db.address, InstallationType::SingleNode) + .await + 
.unwrap(); // ClickHouse regression test - unquoted_64bit_integers_test(address).await.unwrap(); + unquoted_64bit_integers_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests to verify that we can distinguish between metrics by name - differentiate_by_timeseries_name_test(address).await.unwrap(); + differentiate_by_timeseries_name_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests selecting a single timeseries - select_timeseries_with_select_one_test(address).await.unwrap(); + select_timeseries_with_select_one_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests selecting two timeseries select_timeseries_with_select_one_field_with_multiple_values_test( - address, + db.address, + InstallationType::SingleNode, ) .await .unwrap(); // Tests selecting multiple timeseries - select_timeseries_with_select_multiple_fields_with_multiple_values_test(address).await.unwrap(); + select_timeseries_with_select_multiple_fields_with_multiple_values_test(db.address, InstallationType::SingleNode).await.unwrap(); // Tests selecting all timeseries - select_timeseries_with_all_test(address).await.unwrap(); + select_timeseries_with_all_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests selecting all timeseries with start time - select_timeseries_with_start_time_test(address).await.unwrap(); + select_timeseries_with_start_time_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests selecting all timeseries with start time - select_timeseries_with_limit_test(address).await.unwrap(); + select_timeseries_with_limit_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests selecting all timeseries with order - select_timeseries_with_order_test(address).await.unwrap(); + select_timeseries_with_order_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests schema does not change - get_schema_no_new_values_test(address).await.unwrap(); + get_schema_no_new_values_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests listing timeseries schema - timeseries_schema_list_test(address).await.unwrap(); + timeseries_schema_list_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests listing timeseries - list_timeseries_test(address).await.unwrap(); + list_timeseries_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests no changes are made when version is not updated - database_version_update_idempotent_test(address).await.unwrap(); + database_version_update_idempotent_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests that downgrading is impossible - database_version_will_not_downgrade_test(address).await.unwrap(); + database_version_will_not_downgrade_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests old data is dropped if version is updated - database_version_wipes_old_version_test(address).await.unwrap(); + database_version_wipes_old_version_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests schema cache is updated when a new sample is inserted - update_schema_cache_on_new_sample_test(address).await.unwrap(); + update_schema_cache_on_new_sample_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests that we can successfully query all extant datum types from the schema table. 
- select_all_datum_types_test(address).await.unwrap(); + select_all_datum_types_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); // Tests that, when cache new schema but _fail_ to insert them, // we also remove them from the internal cache. - new_schema_removed_when_not_inserted_test(address).await.unwrap(); + new_schema_removed_when_not_inserted_test( + db.address, + InstallationType::SingleNode, + ) + .await + .unwrap(); // Tests for fields and measurements - recall_field_value_bool_test(address).await.unwrap(); + recall_of_all_fields_test(db.address, InstallationType::SingleNode) + .await + .unwrap(); - recall_field_value_u8_test(address).await.unwrap(); + db.cleanup().await.expect("Failed to cleanup ClickHouse server"); + } - recall_field_value_i8_test(address).await.unwrap(); + #[tokio::test] + async fn test_replicated() { + let mut cluster = ClickHouseCluster::new() + .await + .expect("Failed to initialise ClickHouse Cluster"); - recall_field_value_u16_test(address).await.unwrap(); + // Tests that the expected error is returned on a wrong address + bad_db_connection_test().await.unwrap(); - recall_field_value_i16_test(address).await.unwrap(); + // Tests data is replicated in a cluster + data_is_replicated_test(&cluster).await.unwrap(); - recall_field_value_u32_test(address).await.unwrap(); + // Tests that a new client has started and it is part of a cluster + is_oximeter_cluster_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_i32_test(address).await.unwrap(); + // Tests that data can be inserted via the client + insert_samples_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_u64_test(address).await.unwrap(); + // Tests for a schema mismatch + schema_mismatch_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_i64_test(address).await.unwrap(); + // Tests for a schema update + schema_updated_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_string_test(address).await.unwrap(); + // Tests for specific timeseries selection + client_select_timeseries_one_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_ipv4addr_test(address).await.unwrap(); + // Tests for specific timeseries selection + field_record_count_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_ipv6addr_test(address).await.unwrap(); + // ClickHouse regression test + unquoted_64bit_integers_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_field_value_uuid_test(address).await.unwrap(); + // Tests to verify that we can distinguish between metrics by name + differentiate_by_timeseries_name_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_bool_test(address).await.unwrap(); + // Tests selecting a single timeseries + select_timeseries_with_select_one_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_i8_test(address).await.unwrap(); + // Tests selecting two timeseries + select_timeseries_with_select_one_field_with_multiple_values_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_u8_test(address).await.unwrap(); + // Tests 
selecting multiple timeseries + select_timeseries_with_select_multiple_fields_with_multiple_values_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_i16_test(address).await.unwrap(); + // Tests selecting all timeseries + select_timeseries_with_all_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_u16_test(address).await.unwrap(); + // Tests selecting all timeseries with start time + select_timeseries_with_start_time_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_i32_test(address).await.unwrap(); + // Tests selecting all timeseries with start time + select_timeseries_with_limit_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_u32_test(address).await.unwrap(); + // Tests selecting all timeseries with order + select_timeseries_with_order_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_i64_test(address).await.unwrap(); + // Tests schema does not change + get_schema_no_new_values_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_u64_test(address).await.unwrap(); + // Tests listing timeseries schema + timeseries_schema_list_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_f32_test(address).await.unwrap(); + // Tests listing timeseries + list_timeseries_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_f64_test(address).await.unwrap(); + // Tests no changes are made when version is not updated + database_version_update_idempotent_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_cumulative_i64_test(address).await.unwrap(); + // Tests that downgrading is impossible + database_version_will_not_downgrade_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_cumulative_u64_test(address).await.unwrap(); + // Tests old data is dropped if version is updated + database_version_wipes_old_version_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_cumulative_f64_test(address).await.unwrap(); + // Tests schema cache is updated when a new sample is inserted + update_schema_cache_on_new_sample_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_histogram_i8_test(address).await.unwrap(); + // Tests that we can successfully query all extant datum types from the schema table. + select_all_datum_types_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_histogram_u8_test(address).await.unwrap(); + // Tests that, when cache new schema but _fail_ to insert them, + // we also remove them from the internal cache. 
+ new_schema_removed_when_not_inserted_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_histogram_i16_test(address).await.unwrap(); + // Tests for fields and measurements + recall_of_all_fields_test( + cluster.replica_1.address, + InstallationType::Cluster, + ) + .await + .unwrap(); - recall_measurement_histogram_u16_test(address).await.unwrap(); + cluster + .keeper_1 + .cleanup() + .await + .expect("Failed to cleanup ClickHouse keeper 1"); + cluster + .keeper_2 + .cleanup() + .await + .expect("Failed to cleanup ClickHouse keeper 2"); + cluster + .keeper_3 + .cleanup() + .await + .expect("Failed to cleanup ClickHouse keeper 3"); + cluster + .replica_1 + .cleanup() + .await + .expect("Failed to cleanup ClickHouse server 1"); + cluster + .replica_2 + .cleanup() + .await + .expect("Failed to cleanup ClickHouse server 2"); + } - recall_measurement_histogram_i32_test(address).await.unwrap(); + async fn bad_db_connection_test() -> Result<(), Error> { + let logctx = test_setup_log("test_bad_db_connection"); + let log = &logctx.log; - recall_measurement_histogram_u32_test(address).await.unwrap(); + let client = Client::new("127.0.0.1:443".parse().unwrap(), &log); + assert!(matches!( + client.ping().await, + Err(Error::DatabaseUnavailable(_)) + )); - recall_measurement_histogram_i64_test(address).await.unwrap(); + logctx.cleanup_successful(); + Ok(()) + } - recall_measurement_histogram_u64_test(address).await.unwrap(); + async fn data_is_replicated_test( + cluster: &ClickHouseCluster, + ) -> Result<(), Error> { + let logctx = test_setup_log("test_data_is_replicated"); + let log = &logctx.log; + + // Create database in node 1 + let client_1 = Client::new(cluster.replica_1.address, &log); + assert!(client_1.is_oximeter_cluster().await.unwrap()); + client_1 + .init_replicated_db() + .await + .expect("Failed to initialize timeseries database"); - recall_measurement_histogram_f64_test(address).await.unwrap(); + // Verify database exists in node 2 + let client_2 = Client::new(cluster.replica_2.address, &log); + assert!(client_2.is_oximeter_cluster().await.unwrap()); + let sql = String::from("SHOW DATABASES FORMAT JSONEachRow;"); - db.cleanup().await.expect("Failed to cleanup ClickHouse server"); + // Try a few times to make sure data has been synchronised. 
+ let mut result = String::from(""); + let tries = 5; + for _ in 0..tries { + result = client_2.execute_with_body(sql.clone()).await.unwrap(); + if !result.contains("oximeter") { + sleep(Duration::from_secs(1)).await; + continue; + } else { + break; + } + } + + assert!(result.contains("oximeter")); + + // Insert row into one of the tables + let sql = String::from( + "INSERT INTO oximeter.measurements_string (datum) VALUES ('hiya');", + ); + client_2.execute_with_body(sql).await.unwrap(); + + // Make sure replicas are synched + let sql = String::from( + "SYSTEM SYNC REPLICA oximeter.measurements_string_local;", + ); + client_1.execute_with_body(sql).await.unwrap(); + + // Make sure data exists in the other replica + let sql = String::from( + "SELECT * FROM oximeter.measurements_string FORMAT JSONEachRow;", + ); + let result = client_1.execute_with_body(sql).await.unwrap(); + assert!(result.contains("hiya")); + + client_1.wipe_replicated_db().await?; logctx.cleanup_successful(); + Ok(()) } - async fn is_not_oximeter_cluster_test( + async fn is_oximeter_cluster_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { - let logctx = test_setup_log("test_is_not_oximeter_cluster"); + let logctx = test_setup_log("test_is_oximeter_cluster"); let log = &logctx.log; let client = Client::new(address, &log); - assert!(!client.is_oximeter_cluster().await.unwrap()); - client.wipe_single_node_db().await?; + + match db_type { + InstallationType::Cluster => { + assert!(client.is_oximeter_cluster().await.unwrap()); + client.wipe_replicated_db().await?; + } + InstallationType::SingleNode => { + assert!(!client.is_oximeter_cluster().await.unwrap()); + client.wipe_single_node_db().await?; + } + } logctx.cleanup_successful(); Ok(()) } - async fn insert_samples_test(address: SocketAddr) -> Result<(), Error> { + async fn insert_samples_test( + address: SocketAddr, + db_type: InstallationType, + ) -> Result<(), Error> { let logctx = test_setup_log("test_insert_samples"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = { let mut s = Vec::with_capacity(8); for _ in 0..s.capacity() { @@ -1054,7 +1414,7 @@ mod tests { s }; client.insert_samples(&samples).await?; - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } @@ -1079,15 +1439,15 @@ mod tests { } } - async fn schema_mismatch_test(address: SocketAddr) -> Result<(), Error> { + async fn schema_mismatch_test( + address: SocketAddr, + db_type: InstallationType, + ) -> Result<(), Error> { let logctx = test_setup_log("test_schema_mismatch"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let sample = test_util::make_sample(); client.insert_samples(&[sample]).await.unwrap(); @@ -1104,20 +1464,20 @@ mod tests { let sample = Sample::new(&bad_name, &metric).unwrap(); let result = client.verify_or_cache_sample_schema(&sample).await; assert!(matches!(result, Err(Error::SchemaMismatch { .. 
}))); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } - async fn schema_updated_test(address: SocketAddr) -> Result<(), Error> { + async fn schema_updated_test( + address: SocketAddr, + db_type: InstallationType, + ) -> Result<(), Error> { let logctx = test_setup_log("test_schema_updated"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let sample = test_util::make_sample(); // Verify that this sample is considered new, i.e., we return rows to update the timeseries @@ -1179,22 +1539,20 @@ mod tests { .collect::>(); assert_eq!(schema.len(), 1); assert_eq!(expected_schema, schema[0]); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn client_select_timeseries_one_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_client_select_timeseries_one"); let log = &logctx.log; - - let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + + let client = Client::new(address, &log); + db_type.init_db(&client).await.unwrap(); let samples = test_util::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; @@ -1266,12 +1624,15 @@ mod tests { .iter() .all(|field| field_cmp(field, sample.metric_fields())); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } - async fn field_record_count_test(address: SocketAddr) -> Result<(), Error> { + async fn field_record_count_test( + address: SocketAddr, + db_type: InstallationType, + ) -> Result<(), Error> { let logctx = test_setup_log("test_field_record_count"); let log = &logctx.log; @@ -1280,10 +1641,7 @@ mod tests { // Because of the schema change, inserting field records per field per unique timeseries, // we'd like to exercise the logic of ClickHouse's replacing merge tree engine. let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = test_util::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; @@ -1320,7 +1678,7 @@ mod tests { ) .await; - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } @@ -1333,16 +1691,14 @@ mod tests { // details. This test verifies that we get back _unquoted_ integers from the database. 
async fn unquoted_64bit_integers_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { use serde_json::Value; let logctx = test_setup_log("test_unquoted_64bit_integers"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let output = client .execute_with_body( "SELECT toUInt64(1) AS foo FORMAT JSONEachRow;".to_string(), @@ -1352,13 +1708,14 @@ mod tests { let json: Value = serde_json::from_str(&output).unwrap(); assert_eq!(json["foo"], Value::Number(1u64.into())); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn differentiate_by_timeseries_name_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_differentiate_by_timeseries_name"); let log = &logctx.log; @@ -1382,10 +1739,7 @@ mod tests { } let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let target = MyTarget::default(); let first_metric = FirstMetric::default(); @@ -1422,13 +1776,14 @@ mod tests { assert_eq!(timeseries.target.name, "my_target"); assert_eq!(timeseries.metric.name, "second_metric"); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_select_one_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_select_timeseries_with_select_one"); let log = &logctx.log; @@ -1436,10 +1791,7 @@ mod tests { let (target, metrics, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1484,13 +1836,14 @@ mod tests { verify_target(×eries.target, &target); verify_metric(×eries.metric, metrics.get(0).unwrap()); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_select_one_field_with_multiple_values_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log( "test_select_timeseries_with_select_one_field_with_multiple_values", @@ -1500,10 +1853,7 @@ mod tests { let (target, metrics, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1554,13 +1904,14 @@ mod tests { verify_metric(&ts.metric, metric); } - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_select_multiple_fields_with_multiple_values_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_select_timeseries_with_select_multiple_fields_with_multiple_values"); let log = &logctx.log; @@ -1568,10 +1919,7 @@ mod tests { let (target, metrics, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to 
initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1630,13 +1978,14 @@ mod tests { } } - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_all_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_select_timeseries_with_all"); let log = &logctx.log; @@ -1644,10 +1993,7 @@ mod tests { let (target, metrics, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1691,13 +2037,14 @@ mod tests { verify_metric(&ts.metric, metrics.get(i).unwrap()); } - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_start_time_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_select_timeseries_with_start_time"); let log = &logctx.log; @@ -1705,10 +2052,7 @@ mod tests { let (_, metrics, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1742,23 +2086,21 @@ mod tests { } } - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_limit_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_select_timeseries_with_limit"); let log = &logctx.log; let (_, _, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1860,23 +2202,21 @@ mod tests { timeseries.measurements ); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn select_timeseries_with_order_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_select_timeseries_with_order"); let log = &logctx.log; let (_, _, samples) = setup_select_test(); let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); client .insert_samples(&samples) .await @@ -1961,22 +2301,20 @@ mod tests { timeseries_asc.last().unwrap() ); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn get_schema_no_new_values_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_get_schema_no_new_values"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = test_util::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; @@ -1988,22 +2326,20 @@ mod tests { .expect("Failed to get 
timeseries schema"); assert_eq!(&original_schema, &*schema, "Schema shouldn't change"); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn timeseries_schema_list_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_timeseries_schema_list"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = test_util::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; @@ -2025,20 +2361,20 @@ mod tests { result.next_page.is_none(), "Expected the next page token to be None" ); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } - async fn list_timeseries_test(address: SocketAddr) -> Result<(), Error> { + async fn list_timeseries_test( + address: SocketAddr, + db_type: InstallationType, + ) -> Result<(), Error> { let logctx = test_setup_log("test_list_timeseries"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = test_util::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; @@ -2101,148 +2437,109 @@ mod tests { "Paginating should pick up where it left off" ); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn recall_field_value_bool_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let field = FieldValue::Bool(true); let as_json = serde_json::Value::from(1_u64); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_u8_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_u8_test(client: &Client) -> Result<(), Error> { let field = FieldValue::U8(1); let as_json = serde_json::Value::from(1_u8); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_i8_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_i8_test(client: &Client) -> Result<(), Error> { let field = FieldValue::I8(1); let as_json = serde_json::Value::from(1_i8); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_u16_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_u16_test(client: &Client) -> Result<(), Error> { let field = FieldValue::U16(1); let as_json = serde_json::Value::from(1_u16); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_i16_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_i16_test(client: &Client) -> Result<(), Error> { let field = FieldValue::I16(1); let as_json = serde_json::Value::from(1_i16); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn 
recall_field_value_u32_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_u32_test(client: &Client) -> Result<(), Error> { let field = FieldValue::U32(1); let as_json = serde_json::Value::from(1_u32); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_i32_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_i32_test(client: &Client) -> Result<(), Error> { let field = FieldValue::I32(1); let as_json = serde_json::Value::from(1_i32); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_u64_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_u64_test(client: &Client) -> Result<(), Error> { let field = FieldValue::U64(1); let as_json = serde_json::Value::from(1_u64); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } - async fn recall_field_value_i64_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_field_value_i64_test(client: &Client) -> Result<(), Error> { let field = FieldValue::I64(1); let as_json = serde_json::Value::from(1_i64); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } async fn recall_field_value_string_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let field = FieldValue::String("foo".into()); let as_json = serde_json::Value::from("foo"); - test_recall_field_value_impl(address, field, as_json).await?; - Ok(()) - } - - async fn recall_field_value_ipv4addr_test( - address: SocketAddr, - ) -> Result<(), Error> { - let field = FieldValue::from(Ipv4Addr::LOCALHOST); - let as_json = serde_json::Value::from( - Ipv4Addr::LOCALHOST.to_ipv6_mapped().to_string(), - ); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } async fn recall_field_value_ipv6addr_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let field = FieldValue::from(Ipv6Addr::LOCALHOST); let as_json = serde_json::Value::from(Ipv6Addr::LOCALHOST.to_string()); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } async fn recall_field_value_uuid_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let id = Uuid::new_v4(); let field = FieldValue::from(id); let as_json = serde_json::Value::from(id.to_string()); - test_recall_field_value_impl(address, field, as_json).await?; + test_recall_field_value_impl(field, as_json, client).await?; Ok(()) } async fn test_recall_field_value_impl( - address: SocketAddr, field_value: FieldValue, as_json: serde_json::Value, + client: &Client, ) -> Result<(), Error> { - let logctx = test_setup_log( - format!("test_recall_field_value_{}", field_value.field_type()) - .as_str(), - ); - let log = &logctx.log; - - let client = Client::new(address, log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); - // Insert a record from this field. 
const TIMESERIES_NAME: &str = "foo:bar"; const TIMESERIES_KEY: u64 = 101; @@ -2282,157 +2579,134 @@ mod tests { actual_row, inserted_row, "Actual and expected field rows do not match" ); - - client.wipe_single_node_db().await?; - logctx.cleanup_successful(); Ok(()) } async fn recall_measurement_bool_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let datum = Datum::Bool(true); let as_json = serde_json::Value::from(1_u64); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_i8_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_i8_test(client: &Client) -> Result<(), Error> { let datum = Datum::I8(1); let as_json = serde_json::Value::from(1_i8); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_u8_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_u8_test(client: &Client) -> Result<(), Error> { let datum = Datum::U8(1); let as_json = serde_json::Value::from(1_u8); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_i16_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_i16_test(client: &Client) -> Result<(), Error> { let datum = Datum::I16(1); let as_json = serde_json::Value::from(1_i16); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_u16_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_u16_test(client: &Client) -> Result<(), Error> { let datum = Datum::U16(1); let as_json = serde_json::Value::from(1_u16); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_i32_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_i32_test(client: &Client) -> Result<(), Error> { let datum = Datum::I32(1); let as_json = serde_json::Value::from(1_i32); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_u32_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_u32_test(client: &Client) -> Result<(), Error> { let datum = Datum::U32(1); let as_json = serde_json::Value::from(1_u32); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_i64_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_i64_test(client: &Client) -> Result<(), Error> { let datum = Datum::I64(1); let as_json = serde_json::Value::from(1_i64); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_u64_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_u64_test(client: &Client) -> Result<(), Error> { let datum = Datum::U64(1); let as_json = serde_json::Value::from(1_u64); - 
test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_f32_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_f32_test(client: &Client) -> Result<(), Error> { const VALUE: f32 = 1.1; let datum = Datum::F32(VALUE); // NOTE: This is intentionally an f64. let as_json = serde_json::Value::from(1.1_f64); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } - async fn recall_measurement_f64_test( - address: SocketAddr, - ) -> Result<(), Error> { + async fn recall_measurement_f64_test(client: &Client) -> Result<(), Error> { const VALUE: f64 = 1.1; let datum = Datum::F64(VALUE); let as_json = serde_json::Value::from(VALUE); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } async fn recall_measurement_cumulative_i64_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let datum = Datum::CumulativeI64(1.into()); let as_json = serde_json::Value::from(1_i64); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } async fn recall_measurement_cumulative_u64_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let datum = Datum::CumulativeU64(1.into()); let as_json = serde_json::Value::from(1_u64); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } async fn recall_measurement_cumulative_f64_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let datum = Datum::CumulativeF64(1.1.into()); let as_json = serde_json::Value::from(1.1_f64); - test_recall_measurement_impl::(address, datum, None, as_json) + test_recall_measurement_impl::(datum, None, as_json, client) .await?; Ok(()) } async fn histogram_test_impl( - address: SocketAddr, + client: &Client, hist: Histogram, ) -> Result<(), Error> where @@ -2445,72 +2719,72 @@ mod tests { let as_json = serde_json::Value::Array( counts.into_iter().map(Into::into).collect(), ); - test_recall_measurement_impl(address, datum, Some(bins), as_json) + test_recall_measurement_impl(datum, Some(bins), as_json, client) .await?; Ok(()) } async fn recall_measurement_histogram_i8_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0i8, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_u8_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0u8, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_i16_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0i16, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_u16_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0u16, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn 
recall_measurement_histogram_i32_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0i32, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_u32_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0u32, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_i64_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0i64, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_u64_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0u64, 1, 2]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } @@ -2528,38 +2802,27 @@ mod tests { // discussion. #[allow(dead_code)] async fn recall_measurement_histogram_f32_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0.1f32, 0.2, 0.3]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn recall_measurement_histogram_f64_test( - address: SocketAddr, + client: &Client, ) -> Result<(), Error> { let hist = Histogram::new(&[0.1f64, 0.2, 0.3]).unwrap(); - histogram_test_impl(address, hist).await?; + histogram_test_impl(client, hist).await?; Ok(()) } async fn test_recall_measurement_impl + Copy>( - address: SocketAddr, datum: Datum, maybe_bins: Option>, json_datum: serde_json::Value, + client: &Client, ) -> Result<(), Error> { - let logctx = test_setup_log( - format!("test_recall_measurement_{}", datum.datum_type()).as_str(), - ); - let log = &logctx.log; - - let client = Client::new(address, log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); - // Insert a record from this datum. const TIMESERIES_NAME: &str = "foo:bar"; const TIMESERIES_KEY: u64 = 101; @@ -2609,7 +2872,7 @@ mod tests { // Select it exactly back out. 
let select_sql = format!( - "SELECT * FROM oximeter.{} LIMIT 1 FORMAT {};", + "SELECT * FROM oximeter.{} LIMIT 2 FORMAT {};", measurement_table, crate::DATABASE_SELECT_FORMAT, ); @@ -2617,6 +2880,7 @@ mod tests { .execute_with_body(select_sql) .await .expect("Failed to select measurement row"); + println!("{}", body); let actual_row: serde_json::Value = serde_json::from_str(&body) .expect("Failed to parse measurement row JSON"); println!("{actual_row:?}"); @@ -2625,8 +2889,6 @@ mod tests { actual_row, inserted_row, "Actual and expected measurement rows do not match" ); - client.wipe_single_node_db().await?; - logctx.cleanup_successful(); Ok(()) } @@ -2642,8 +2904,94 @@ mod tests { .count() } + async fn recall_of_all_fields_test( + address: SocketAddr, + db_type: InstallationType, + ) -> Result<(), Error> { + let logctx = test_setup_log("test_recall_of_all_fields"); + let log = &logctx.log; + + let client = Client::new(address, log); + db_type.init_db(&client).await.unwrap(); + + recall_measurement_bool_test(&client).await.unwrap(); + + recall_measurement_i8_test(&client).await.unwrap(); + + recall_measurement_u8_test(&client).await.unwrap(); + + recall_measurement_i16_test(&client).await.unwrap(); + + recall_measurement_u16_test(&client).await.unwrap(); + + recall_measurement_i32_test(&client).await.unwrap(); + + recall_measurement_u32_test(&client).await.unwrap(); + + recall_measurement_i64_test(&client).await.unwrap(); + + recall_measurement_u64_test(&client).await.unwrap(); + + recall_measurement_f32_test(&client).await.unwrap(); + + recall_measurement_f64_test(&client).await.unwrap(); + + recall_measurement_cumulative_i64_test(&client).await.unwrap(); + + recall_measurement_cumulative_u64_test(&client).await.unwrap(); + + recall_measurement_cumulative_f64_test(&client).await.unwrap(); + + recall_measurement_histogram_i8_test(&client).await.unwrap(); + + recall_measurement_histogram_u8_test(&client).await.unwrap(); + + recall_measurement_histogram_i16_test(&client).await.unwrap(); + + recall_measurement_histogram_u16_test(&client).await.unwrap(); + + recall_measurement_histogram_i32_test(&client).await.unwrap(); + + recall_measurement_histogram_u32_test(&client).await.unwrap(); + + recall_measurement_histogram_i64_test(&client).await.unwrap(); + + recall_measurement_histogram_u64_test(&client).await.unwrap(); + + recall_measurement_histogram_f64_test(&client).await.unwrap(); + + recall_field_value_bool_test(&client).await.unwrap(); + + recall_field_value_u8_test(&client).await.unwrap(); + + recall_field_value_i8_test(&client).await.unwrap(); + + recall_field_value_u16_test(&client).await.unwrap(); + + recall_field_value_i16_test(&client).await.unwrap(); + + recall_field_value_u32_test(&client).await.unwrap(); + + recall_field_value_i32_test(&client).await.unwrap(); + + recall_field_value_u64_test(&client).await.unwrap(); + + recall_field_value_i64_test(&client).await.unwrap(); + + recall_field_value_string_test(&client).await.unwrap(); + + recall_field_value_ipv6addr_test(&client).await.unwrap(); + + recall_field_value_uuid_test(&client).await.unwrap(); + + db_type.wipe_db(&client).await.unwrap(); + logctx.cleanup_successful(); + Ok(()) + } + async fn database_version_update_idempotent_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_database_version_update_idempotent"); let log = &logctx.log; @@ -2674,13 +3022,14 @@ mod tests { assert_eq!(1, get_schema_count(&client).await); - client.wipe_single_node_db().await?; + 
db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn database_version_will_not_downgrade_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_database_version_will_not_downgrade"); let log = &logctx.log; @@ -2709,13 +3058,14 @@ mod tests { .await .expect_err("Should have failed, downgrades are not supported"); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn database_version_wipes_old_version_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { let logctx = test_setup_log("test_database_version_wipes_old_version"); let log = &logctx.log; @@ -2745,23 +3095,21 @@ mod tests { .expect("Should have initialized database successfully"); assert_eq!(0, get_schema_count(&client).await); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } async fn update_schema_cache_on_new_sample_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { usdt::register_probes().unwrap(); let logctx = test_setup_log("test_update_schema_cache_on_new_sample"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = [test_util::make_sample()]; client.insert_samples(&samples).await.unwrap(); @@ -2793,7 +3141,7 @@ mod tests { "Expected exactly 1 schema again" ); assert_eq!(client.schema.lock().await.len(), 1); - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } @@ -2805,6 +3153,7 @@ mod tests { // succeed. async fn select_all_datum_types_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { use strum::IntoEnumIterator; usdt::register_probes().unwrap(); @@ -2812,10 +3161,7 @@ mod tests { let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); // Attempt to select all schema with each datum type. for ty in oximeter::DatumType::iter() { @@ -2830,7 +3176,7 @@ mod tests { let count = res.trim().parse::().unwrap(); assert_eq!(count, 0); } - client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); logctx.cleanup_successful(); Ok(()) } @@ -2841,16 +3187,14 @@ mod tests { // remove them from the internal cache. async fn new_schema_removed_when_not_inserted_test( address: SocketAddr, + db_type: InstallationType, ) -> Result<(), Error> { usdt::register_probes().unwrap(); let logctx = test_setup_log("test_update_schema_cache_on_new_sample"); let log = &logctx.log; let client = Client::new(address, &log); - client - .init_single_node_db() - .await - .expect("Failed to initialize timeseries database"); + db_type.init_db(&client).await.unwrap(); let samples = [test_util::make_sample()]; // We're using the components of the `insert_samples()` method here, @@ -2866,7 +3210,7 @@ mod tests { // Next, we'll kill the database, and then try to insert the schema. // That will fail, since the DB is now inaccessible. 
- client.wipe_single_node_db().await?; + db_type.wipe_db(&client).await.unwrap(); let res = client.save_new_schema_or_remove(new_schema).await; assert!(res.is_err(), "Should have failed since the DB is gone"); assert!( @@ -2878,87 +3222,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_build_replicated() { - let logctx = test_setup_log("test_build_replicated"); - let log = &logctx.log; - - let mut cluster = ClickHouseCluster::new() - .await - .expect("Failed to initialise ClickHouse Cluster"); - - // Create database in node 1 - let client_1 = Client::new(cluster.replica_1.address.unwrap(), &log); - assert!(client_1.is_oximeter_cluster().await.unwrap()); - client_1 - .init_replicated_db() - .await - .expect("Failed to initialize timeseries database"); - - // Verify database exists in node 2 - let client_2 = Client::new(cluster.replica_2.address.unwrap(), &log); - assert!(client_2.is_oximeter_cluster().await.unwrap()); - let sql = String::from("SHOW DATABASES FORMAT JSONEachRow;"); - let result = client_2.execute_with_body(sql).await.unwrap(); - - // Try a few times to make sure data has been synchronised. - let tries = 5; - for _ in 0..tries { - if !result.contains("oximeter") { - sleep(Duration::from_secs(1)).await; - continue; - } else { - break; - } - } - - assert!(result.contains("oximeter")); - - // Insert row into one of the tables - let sql = String::from( - "INSERT INTO oximeter.measurements_string (datum) VALUES ('hiya');", - ); - client_2.execute_with_body(sql).await.unwrap(); - - let sql = String::from( - "SELECT * FROM oximeter.measurements_string FORMAT JSONEachRow;", - ); - let result = client_2.execute_with_body(sql.clone()).await.unwrap(); - assert!(result.contains("hiya")); - - // TODO(https://github.com/oxidecomputer/omicron/issues/4001): With distributed - // engine, it can take a long time to sync the data. This means it's tricky to - // test that the data exists on both nodes. 
- - cluster - .keeper_1 - .cleanup() - .await - .expect("Failed to cleanup ClickHouse keeper 1"); - cluster - .keeper_2 - .cleanup() - .await - .expect("Failed to cleanup ClickHouse keeper 2"); - cluster - .keeper_3 - .cleanup() - .await - .expect("Failed to cleanup ClickHouse keeper 3"); - cluster - .replica_1 - .cleanup() - .await - .expect("Failed to cleanup ClickHouse server 1"); - cluster - .replica_2 - .cleanup() - .await - .expect("Failed to cleanup ClickHouse server 2"); - - logctx.cleanup_successful(); - } - // Testing helper functions #[derive(Debug, Clone, oximeter::Target)] diff --git a/oximeter/db/src/db-replicated-init.sql b/oximeter/db/src/db-replicated-init.sql index 21a647b1a5..ec11854e44 100644 --- a/oximeter/db/src/db-replicated-init.sql +++ b/oximeter/db/src/db-replicated-init.sql @@ -188,6 +188,26 @@ CREATE TABLE IF NOT EXISTS oximeter.measurements_u64 ON CLUSTER oximeter_cluster ) ENGINE = Distributed('oximeter_cluster', 'oximeter', 'measurements_u64_local', xxHash64(splitByChar(':', timeseries_name)[1])); -- +CREATE TABLE IF NOT EXISTS oximeter.measurements_f32_local ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + timestamp DateTime64(9, 'UTC'), + datum Float32 +) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/measurements_f32_local', '{replica}') +ORDER BY (timeseries_name, timeseries_key, timestamp) +TTL toDateTime(timestamp) + INTERVAL 30 DAY; +-- +CREATE TABLE IF NOT EXISTS oximeter.measurements_f32 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + timestamp DateTime64(9, 'UTC'), + datum Float32 +) +ENGINE = Distributed('oximeter_cluster', 'oximeter', 'measurements_f32_local', xxHash64(splitByChar(':', timeseries_name)[1])); +-- CREATE TABLE IF NOT EXISTS oximeter.measurements_f64_local ON CLUSTER oximeter_cluster ( timeseries_name String, @@ -586,6 +606,66 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_bool ON CLUSTER oximeter_cluster ENGINE = ReplicatedReplacingMergeTree() ORDER BY (timeseries_name, field_name, field_value, timeseries_key); -- +CREATE TABLE IF NOT EXISTS oximeter.fields_i8 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value Int8 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- +CREATE TABLE IF NOT EXISTS oximeter.fields_u8 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value UInt8 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- +CREATE TABLE IF NOT EXISTS oximeter.fields_i16 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value Int16 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- +CREATE TABLE IF NOT EXISTS oximeter.fields_u16 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value UInt16 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- +CREATE TABLE IF NOT EXISTS oximeter.fields_i32 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value Int32 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- +CREATE TABLE IF NOT 
EXISTS oximeter.fields_u32 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value UInt32 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- CREATE TABLE IF NOT EXISTS oximeter.fields_i64 ON CLUSTER oximeter_cluster ( timeseries_name String, @@ -596,6 +676,16 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_i64 ON CLUSTER oximeter_cluster ENGINE = ReplicatedReplacingMergeTree() ORDER BY (timeseries_name, field_name, field_value, timeseries_key); -- +CREATE TABLE IF NOT EXISTS oximeter.fields_u64 ON CLUSTER oximeter_cluster +( + timeseries_name String, + timeseries_key UInt64, + field_name String, + field_value UInt64 +) +ENGINE = ReplicatedReplacingMergeTree() +ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +-- CREATE TABLE IF NOT EXISTS oximeter.fields_ipaddr ON CLUSTER oximeter_cluster ( timeseries_name String, diff --git a/oximeter/db/src/db-wipe-replicated.sql b/oximeter/db/src/db-wipe-replicated.sql index 1ed4d270b7..5874da7561 100644 --- a/oximeter/db/src/db-wipe-replicated.sql +++ b/oximeter/db/src/db-wipe-replicated.sql @@ -1 +1 @@ -DROP DATABASE IF EXISTS oximeter ON CLUSTER oximeter_cluster; +DROP DATABASE IF EXISTS oximeter ON CLUSTER oximeter_cluster SYNC; diff --git a/test-utils/src/dev/clickhouse.rs b/test-utils/src/dev/clickhouse.rs index e96f969bbc..6fb495627f 100644 --- a/test-utils/src/dev/clickhouse.rs +++ b/test-utils/src/dev/clickhouse.rs @@ -9,7 +9,7 @@ use std::process::Stdio; use std::time::Duration; use anyhow::{anyhow, Context}; -use std::net::SocketAddr; +use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use tempfile::{Builder, TempDir}; use thiserror::Error; use tokio::{ @@ -32,7 +32,7 @@ pub struct ClickHouseInstance { // The HTTP port the server is listening on port: u16, // The address the server is listening on - pub address: Option, + pub address: SocketAddr, // Full list of command-line arguments args: Vec, // Subprocess handle @@ -109,11 +109,13 @@ impl ClickHouseInstance { let data_path = data_dir.path().to_path_buf(); let port = wait_for_port(log_path).await?; + let address = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), port); + Ok(Self { data_dir: Some(data_dir), data_path, port, - address: None, + address: address, args, child: Some(child), }) @@ -161,21 +163,21 @@ impl ClickHouseInstance { .env("CH_USER_LOCAL_DIR", access_path) .env("CH_FORMAT_SCHEMA_PATH", format_schemas_path) .env("CH_REPLICA_NUMBER", r_number) - // There seems to be a bug using ipv6 with a replicated set up - // when installing all servers and coordinator nodes on the same - // server. For this reason we will be using ipv4 for testing. - .env("CH_REPLICA_HOST_01", "127.0.0.1") - .env("CH_REPLICA_HOST_02", "127.0.0.1") - .env("CH_KEEPER_HOST_01", "127.0.0.1") - .env("CH_KEEPER_HOST_02", "127.0.0.1") - .env("CH_KEEPER_HOST_03", "127.0.0.1") + .env("CH_REPLICA_HOST_01", "::1") + .env("CH_REPLICA_HOST_02", "::1") + // ClickHouse servers have a small quirk, where when setting the keeper hosts as IPv6 localhost + // addresses in the replica configuration file, they must be wrapped in square brackets + // Otherwise, when running any query, a "Service not found" error appears. 
+ .env("CH_KEEPER_HOST_01", "[::1]") + .env("CH_KEEPER_HOST_02", "[::1]") + .env("CH_KEEPER_HOST_03", "[::1]") .spawn() .with_context(|| { format!("failed to spawn `clickhouse` (with args: {:?})", &args) })?; let data_path = data_dir.path().to_path_buf(); - let address = SocketAddr::new("127.0.0.1".parse().unwrap(), port); + let address = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), port); let result = wait_for_ready(log_path).await; match result { @@ -183,7 +185,7 @@ impl ClickHouseInstance { data_dir: Some(data_dir), data_path, port, - address: Some(address), + address: address, args, child: Some(child), }), @@ -237,12 +239,9 @@ impl ClickHouseInstance { .env("CH_KEEPER_ID_01", "1") .env("CH_KEEPER_ID_02", "2") .env("CH_KEEPER_ID_03", "3") - // There seems to be a bug using ipv6 and localhost with a replicated - // set up when installing all servers and coordinator nodes on the same - // server. For this reason we will be using ipv4 for testing. - .env("CH_KEEPER_HOST_01", "127.0.0.1") - .env("CH_KEEPER_HOST_02", "127.0.0.1") - .env("CH_KEEPER_HOST_03", "127.0.0.1") + .env("CH_KEEPER_HOST_01", "::1") + .env("CH_KEEPER_HOST_02", "::1") + .env("CH_KEEPER_HOST_03", "::1") .spawn() .with_context(|| { format!( @@ -252,6 +251,7 @@ impl ClickHouseInstance { })?; let data_path = data_dir.path().to_path_buf(); + let address = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), port); let result = wait_for_ready(log_path).await; match result { @@ -259,7 +259,7 @@ impl ClickHouseInstance { data_dir: Some(data_dir), data_path, port, - address: None, + address: address, args, child: Some(child), }), From 4b0905f78de7fcf7bb8189487d903a96c9d36562 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 31 Oct 2023 14:13:08 -0700 Subject: [PATCH 05/14] [update-engine] make last_seen a mutable cursor (#4398) All correct uses of incremental report generation must update the external `last_seen` cursor. To guarantee that, make last_seen be `&mut Option`, and effectively an inout parameter. --- update-engine/src/buffer.rs | 26 +++++++++++++------------- update-engine/src/context.rs | 3 +-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/update-engine/src/buffer.rs b/update-engine/src/buffer.rs index 3e7db63cb9..2426814444 100644 --- a/update-engine/src/buffer.rs +++ b/update-engine/src/buffer.rs @@ -122,19 +122,23 @@ impl EventBuffer { /// /// This report can be serialized and sent over the wire. pub fn generate_report(&self) -> EventReport { - self.generate_report_since(None) + self.generate_report_since(&mut None) } + /// Generates an [`EventReport`] for this buffer, updating `last_seen` to a + /// new value for incremental report generation. + /// + /// This report can be serialized and sent over the wire. pub fn generate_report_since( &self, - mut last_seen: Option, + last_seen: &mut Option, ) -> EventReport { // Gather step events across all keys. let mut step_events = Vec::new(); let mut progress_events = Vec::new(); for (_, step_data) in self.steps().as_slice() { step_events - .extend(step_data.step_events_since_impl(last_seen).cloned()); + .extend(step_data.step_events_since_impl(*last_seen).cloned()); progress_events .extend(step_data.step_status.progress_event().cloned()); } @@ -145,14 +149,14 @@ impl EventBuffer { if let Some(last) = step_events.last() { // Only update last_seen if there are new step events (otherwise it // stays the same). 
- last_seen = Some(last.event_index); + *last_seen = Some(last.event_index); } EventReport { step_events, progress_events, root_execution_id: self.root_execution_id(), - last_seen, + last_seen: *last_seen, } } @@ -1783,7 +1787,7 @@ mod tests { for (i, event) in self.generated_events.iter().enumerate() { for time in 0..times { (event_fn)(&mut buffer, event); - let report = buffer.generate_report_since(last_seen); + let report = buffer.generate_report_since(&mut last_seen); let is_last_event = i == self.generated_events.len() - 1; self.assert_general_properties( &buffer, @@ -1798,7 +1802,6 @@ mod tests { }) .unwrap(); reported_step_events.extend(report.step_events); - last_seen = report.last_seen; // Ensure that the last root index was updated for this // event's corresponding steps, but not for any others. @@ -1814,7 +1817,8 @@ mod tests { // Call last_seen without feeding a new event in to ensure that // a report with no step events is produced. - let report = buffer.generate_report_since(last_seen); + let mut last_seen_2 = last_seen; + let report = buffer.generate_report_since(&mut last_seen_2); ensure!( report.step_events.is_empty(), "{description}, at index {i} (time {time}),\ @@ -1893,16 +1897,12 @@ mod tests { for (i, event) in self.generated_events.iter().enumerate() { let event_added = (event_fn)(&mut buffer, event); - let report = match last_seen_opt { + let report = match &mut last_seen_opt { Some(last_seen) => buffer.generate_report_since(last_seen), None => buffer.generate_report(), }; let is_last_event = i == self.generated_events.len() - 1; - if let Some(last_seen) = &mut last_seen_opt { - *last_seen = report.last_seen; - } - self.assert_general_properties(&buffer, &report, is_last_event) .with_context(|| { format!( diff --git a/update-engine/src/context.rs b/update-engine/src/context.rs index cd85687cf9..c2c1e32119 100644 --- a/update-engine/src/context.rs +++ b/update-engine/src/context.rs @@ -242,8 +242,7 @@ impl NestedEventBuffer { report: EventReport, ) -> EventReport { self.buffer.add_event_report(report.into_generic()); - let ret = self.buffer.generate_report_since(self.last_seen); - self.last_seen = ret.last_seen; + let ret = self.buffer.generate_report_since(&mut self.last_seen); ret } } From f9fb2b83b9a994d26cf6486df3c6c82075dc38ae Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:13:43 -0700 Subject: [PATCH 06/14] chore(deps): update rust crate tempfile to 3.8 (#4397) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e56e558c6e..6292394e34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -347,7 +347,7 @@ syn = { version = "2.0" } tabled = "0.14" tar = "0.4" tempdir = "0.3" -tempfile = "3.6" +tempfile = "3.8" term = "0.7" termios = "0.3" textwrap = "0.16.0" From dadbc22c55782af8614ef237e844b9fc9f1fcb32 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:14:10 -0700 Subject: [PATCH 07/14] chore(deps): update rust crate serde_json to 1.0.108 (#4396) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8e16d0fba..2df98809a1 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -7835,9 +7835,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.107" +version = "1.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65" +checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" dependencies = [ "itoa", "ryu", diff --git a/Cargo.toml b/Cargo.toml index 6292394e34..999fc680a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -309,7 +309,7 @@ semver = { version = "1.0.20", features = ["std", "serde"] } serde = { version = "1.0", default-features = false, features = [ "derive" ] } serde_derive = "1.0" serde_human_bytes = { git = "http://github.com/oxidecomputer/serde_human_bytes", branch = "main" } -serde_json = "1.0.107" +serde_json = "1.0.108" serde_path_to_error = "0.1.14" serde_tokenstream = "0.2" serde_urlencoded = "0.7.1" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index f433fe881a..72854ed29a 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -79,7 +79,7 @@ ring = { version = "0.16.20", features = ["std"] } schemars = { version = "0.8.13", features = ["bytes", "chrono", "uuid1"] } semver = { version = "1.0.20", features = ["serde"] } serde = { version = "1.0.188", features = ["alloc", "derive", "rc"] } -serde_json = { version = "1.0.107", features = ["raw_value"] } +serde_json = { version = "1.0.108", features = ["raw_value"] } sha2 = { version = "0.10.8", features = ["oid"] } signature = { version = "2.1.0", default-features = false, features = ["digest", "rand_core", "std"] } similar = { version = "2.2.1", features = ["inline", "unicode"] } @@ -171,7 +171,7 @@ ring = { version = "0.16.20", features = ["std"] } schemars = { version = "0.8.13", features = ["bytes", "chrono", "uuid1"] } semver = { version = "1.0.20", features = ["serde"] } serde = { version = "1.0.188", features = ["alloc", "derive", "rc"] } -serde_json = { version = "1.0.107", features = ["raw_value"] } +serde_json = { version = "1.0.108", features = ["raw_value"] } sha2 = { version = "0.10.8", features = ["oid"] } signature = { version = "2.1.0", default-features = false, features = ["digest", "rand_core", "std"] } similar = { version = "2.2.1", features = ["inline", "unicode"] } From c976cb800e5aa1dd5aea26f6ecbfe077a280c645 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 1 Nov 2023 04:25:21 +0000 Subject: [PATCH 08/14] chore(deps): update taiki-e/install-action digest to 1286723 (#4401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`e0367a2` -> `1286723`](https://togithub.com/taiki-e/install-action/compare/e0367a2...1286723) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). 
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index b717f425de..102e0dca87 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -22,7 +22,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@e0367a2d3f3d9fc43c3f25fe58692570375e19f0 # v2 + uses: taiki-e/install-action@1286723668b881a97f5cae2ef322c6b43efa610c # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From b64c6f1e86db62b690c942bb3d9dad5a2c933e6b Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:43:24 -0700 Subject: [PATCH 09/14] chore(deps): update rust crate test-strategy to 0.3.1 (#4402) --- Cargo.lock | 35 +++++++++++++++++++++++++++++------ Cargo.toml | 2 +- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2df98809a1..0d5168e23b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6002,7 +6002,7 @@ dependencies = [ "quote", "regex", "regex-syntax 0.6.29", - "structmeta", + "structmeta 0.1.6", "syn 1.0.109", ] @@ -8547,10 +8547,22 @@ checksum = "104842d6278bf64aa9d2f182ba4bde31e8aec7a131d29b7f444bb9b344a09e2a" dependencies = [ "proc-macro2", "quote", - "structmeta-derive", + "structmeta-derive 0.1.6", "syn 1.0.109", ] +[[package]] +name = "structmeta" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ad9e09554f0456d67a69c1584c9798ba733a5b50349a6c0d0948710523922d" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive 0.2.0", + "syn 2.0.32", +] + [[package]] name = "structmeta-derive" version = "0.1.6" @@ -8562,6 +8574,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "structmeta-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.32", +] + [[package]] name = "structopt" version = "0.3.26" @@ -8788,14 +8811,14 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "test-strategy" -version = "0.2.1" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62d6408d1406657be2f9d1701fbae379331d30d2f6e92050710edb0d34eeb480" +checksum = "b8361c808554228ad09bfed70f5c823caf8a3450b6881cc3a38eb57e8c08c1d9" dependencies = [ "proc-macro2", "quote", - "structmeta", - "syn 1.0.109", + "structmeta 0.2.0", + "syn 2.0.32", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 999fc680a5..db47bd8dbe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -351,7 +351,7 @@ tempfile = "3.8" term = "0.7" termios = "0.3" textwrap = "0.16.0" -test-strategy = "0.2.1" +test-strategy = "0.3.1" thiserror = "1.0" tofino = { git = "http://github.com/oxidecomputer/tofino", branch = "main" } tokio = "1.33.0" From 5be4c93ed6d0b6ef0d6e2233fc5a1329ce3a51ef Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 1 Nov 2023 15:00:08 -0700 Subject: [PATCH 10/14] initial inventory for automated update (#4291) --- Cargo.lock | 26 + Cargo.toml | 5 +- common/src/nexus_config.rs | 52 +- dev-tools/omdb/src/bin/omdb/db.rs | 474 ++++- dev-tools/omdb/src/bin/omdb/mgs.rs | 2 +- dev-tools/omdb/src/bin/omdb/nexus.rs | 38 +- 
dev-tools/omdb/tests/env.out | 12 + dev-tools/omdb/tests/successes.out | 13 + dev-tools/omdb/tests/usage_errors.out | 2 + dev-tools/omicron-dev/src/bin/omicron-dev.rs | 4 + gateway/src/http_entrypoints.rs | 13 +- nexus/Cargo.toml | 1 + nexus/db-model/src/inventory.rs | 443 +++++ nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 87 +- nexus/db-model/src/unsigned.rs | 2 + nexus/db-queries/Cargo.toml | 2 + nexus/db-queries/src/authz/api_resources.rs | 55 + nexus/db-queries/src/authz/omicron.polar | 10 + nexus/db-queries/src/authz/oso_generic.rs | 1 + .../src/authz/policy_test/resource_builder.rs | 3 +- .../src/authz/policy_test/resources.rs | 1 + .../db-queries/src/db/datastore/inventory.rs | 1518 +++++++++++++++++ nexus/db-queries/src/db/datastore/mod.rs | 25 +- nexus/db-queries/src/db/pool.rs | 2 + nexus/db-queries/tests/output/authz-roles.out | 14 + nexus/examples/config.toml | 7 + nexus/inventory/Cargo.toml | 23 + nexus/inventory/src/builder.rs | 786 +++++++++ nexus/inventory/src/collector.rs | 389 +++++ nexus/inventory/src/examples.rs | 254 +++ nexus/inventory/src/lib.rs | 27 + .../tests/output/collector_basic.txt | 43 + .../tests/output/collector_errors.txt | 44 + nexus/src/app/background/common.rs | 6 +- nexus/src/app/background/init.rs | 37 +- .../app/background/inventory_collection.rs | 243 +++ nexus/src/app/background/mod.rs | 1 + nexus/src/app/mod.rs | 2 + nexus/src/app/rack.rs | 1 + nexus/test-utils/Cargo.toml | 2 + nexus/test-utils/src/lib.rs | 45 +- nexus/tests/config.test.toml | 9 +- nexus/types/Cargo.toml | 1 + nexus/types/src/inventory.rs | 179 ++ nexus/types/src/lib.rs | 1 + openapi/gateway.json | 4 +- openapi/wicketd.json | 4 +- schema/crdb/9.0.0/up01.sql | 5 + schema/crdb/9.0.0/up02.sql | 2 + schema/crdb/9.0.0/up03.sql | 5 + schema/crdb/9.0.0/up04.sql | 4 + schema/crdb/9.0.0/up05.sql | 9 + schema/crdb/9.0.0/up06.sql | 2 + schema/crdb/9.0.0/up07.sql | 6 + schema/crdb/9.0.0/up08.sql | 2 + schema/crdb/9.0.0/up09.sql | 5 + schema/crdb/9.0.0/up10.sql | 2 + schema/crdb/9.0.0/up11.sql | 5 + schema/crdb/9.0.0/up12.sql | 15 + schema/crdb/9.0.0/up13.sql | 15 + schema/crdb/9.0.0/up14.sql | 6 + schema/crdb/9.0.0/up15.sql | 11 + schema/crdb/dbinit.sql | 220 ++- smf/nexus/multi-sled/config-partial.toml | 9 +- smf/nexus/single-sled/config-partial.toml | 9 +- wicket/src/state/inventory.rs | 2 +- wicket/src/ui/panes/overview.rs | 7 +- wicketd/src/update_tracker.rs | 22 +- 69 files changed, 5198 insertions(+), 80 deletions(-) create mode 100644 nexus/db-model/src/inventory.rs create mode 100644 nexus/db-queries/src/db/datastore/inventory.rs create mode 100644 nexus/inventory/Cargo.toml create mode 100644 nexus/inventory/src/builder.rs create mode 100644 nexus/inventory/src/collector.rs create mode 100644 nexus/inventory/src/examples.rs create mode 100644 nexus/inventory/src/lib.rs create mode 100644 nexus/inventory/tests/output/collector_basic.txt create mode 100644 nexus/inventory/tests/output/collector_errors.txt create mode 100644 nexus/src/app/background/inventory_collection.rs create mode 100644 nexus/types/src/inventory.rs create mode 100644 schema/crdb/9.0.0/up01.sql create mode 100644 schema/crdb/9.0.0/up02.sql create mode 100644 schema/crdb/9.0.0/up03.sql create mode 100644 schema/crdb/9.0.0/up04.sql create mode 100644 schema/crdb/9.0.0/up05.sql create mode 100644 schema/crdb/9.0.0/up06.sql create mode 100644 schema/crdb/9.0.0/up07.sql create mode 100644 schema/crdb/9.0.0/up08.sql create mode 100644 schema/crdb/9.0.0/up09.sql create mode 100644 
schema/crdb/9.0.0/up10.sql create mode 100644 schema/crdb/9.0.0/up11.sql create mode 100644 schema/crdb/9.0.0/up12.sql create mode 100644 schema/crdb/9.0.0/up13.sql create mode 100644 schema/crdb/9.0.0/up14.sql create mode 100644 schema/crdb/9.0.0/up15.sql diff --git a/Cargo.lock b/Cargo.lock index 0d5168e23b..92dd15044a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4380,6 +4380,7 @@ dependencies = [ "dropshot", "expectorate", "futures", + "gateway-client", "headers", "hex", "http", @@ -4393,6 +4394,7 @@ dependencies = [ "newtype_derive", "nexus-db-model", "nexus-defaults", + "nexus-inventory", "nexus-test-utils", "nexus-types", "omicron-common 0.1.0", @@ -4452,6 +4454,26 @@ dependencies = [ "serde_json", ] +[[package]] +name = "nexus-inventory" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "expectorate", + "futures", + "gateway-client", + "gateway-messages", + "gateway-test-utils", + "nexus-types", + "omicron-workspace-hack", + "regex", + "slog", + "strum", + "tokio", + "uuid", +] + [[package]] name = "nexus-test-interface" version = "0.1.0" @@ -4479,6 +4501,8 @@ dependencies = [ "dns-server", "dns-service-client 0.1.0", "dropshot", + "gateway-messages", + "gateway-test-utils", "headers", "http", "hyper", @@ -4526,6 +4550,7 @@ dependencies = [ "chrono", "dns-service-client 0.1.0", "futures", + "gateway-client", "newtype_derive", "omicron-common 0.1.0", "omicron-passwords 0.1.0", @@ -5070,6 +5095,7 @@ dependencies = [ "nexus-db-model", "nexus-db-queries", "nexus-defaults", + "nexus-inventory", "nexus-test-interface", "nexus-test-utils", "nexus-test-utils-macros", diff --git a/Cargo.toml b/Cargo.toml index db47bd8dbe..ab56e052e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/inventory", "nexus/test-interface", "nexus/test-utils-macros", "nexus/test-utils", @@ -108,6 +109,7 @@ default-members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/inventory", "nexus/types", "oximeter/collector", "oximeter/db", @@ -234,6 +236,7 @@ nexus-client = { path = "clients/nexus-client" } nexus-db-model = { path = "nexus/db-model" } nexus-db-queries = { path = "nexus/db-queries" } nexus-defaults = { path = "nexus/defaults" } +nexus-inventory = { path = "nexus/inventory" } omicron-certificates = { path = "certificates" } omicron-passwords = { path = "passwords" } omicron-workspace-hack = "0.1.0" @@ -372,8 +375,8 @@ tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } unicode-width = "0.1.11" update-engine = { path = "update-engine" } -uuid = { version = "1.4.1", features = ["serde", "v4"] } usdt = "0.3" +uuid = { version = "1.4.1", features = ["serde", "v4"] } walkdir = "2.4" wicket = { path = "wicket" } wicket-common = { path = "wicket-common" } diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index da50356d2e..4e821e2676 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -335,6 +335,8 @@ pub struct BackgroundTaskConfig { pub dns_external: DnsTasksConfig, /// configuration for external endpoint list watcher pub external_endpoints: ExternalEndpointsConfig, + /// configuration for inventory tasks + pub inventory: InventoryConfig, } #[serde_as] @@ -369,6 +371,30 @@ pub struct ExternalEndpointsConfig { // allow/disallow wildcard certs, don't serve expired certs, etc.) 
} +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct InventoryConfig { + /// period (in seconds) for periodic activations of this background task + /// + /// Each activation fetches information about all harware and software in + /// the system and inserts it into the database. This generates a moderate + /// amount of data. + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, + + /// maximum number of past collections to keep in the database + /// + /// This is a very coarse mechanism to keep the system from overwhelming + /// itself with inventory data. + pub nkeep: u32, + + /// disable inventory collection altogether + /// + /// This is an emergency lever for support / operations. It should never be + /// necessary. + pub disable: bool, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -467,19 +493,16 @@ impl std::fmt::Display for SchemeName { #[cfg(test)] mod test { - use super::Tunables; use super::{ - default_techport_external_server_port, AuthnConfig, Config, - ConsoleConfig, LoadError, PackageConfig, SchemeName, - TimeseriesDbConfig, UpdatesConfig, + default_techport_external_server_port, AuthnConfig, + BackgroundTaskConfig, Config, ConfigDropshotWithTls, ConsoleConfig, + Database, DeploymentConfig, DnsTasksConfig, DpdConfig, + ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError, + LoadErrorKind, MgdConfig, PackageConfig, SchemeName, + TimeseriesDbConfig, Tunables, UpdatesConfig, }; use crate::address::{Ipv6Subnet, RACK_PREFIX}; use crate::api::internal::shared::SwitchLocation; - use crate::nexus_config::{ - BackgroundTaskConfig, ConfigDropshotWithTls, Database, - DeploymentConfig, DnsTasksConfig, DpdConfig, ExternalEndpointsConfig, - InternalDns, LoadErrorKind, MgdConfig, - }; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingIfExists; @@ -626,6 +649,9 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + inventory.period_secs = 10 + inventory.nkeep = 11 + inventory.disable = false [default_region_allocation_strategy] type = "random" seed = 0 @@ -719,6 +745,11 @@ mod test { }, external_endpoints: ExternalEndpointsConfig { period_secs: Duration::from_secs(9), + }, + inventory: InventoryConfig { + period_secs: Duration::from_secs(10), + nkeep: 11, + disable: false, } }, default_region_allocation_strategy: @@ -773,6 +804,9 @@ mod test { dns_external.period_secs_propagation = 7 dns_external.max_concurrent_server_updates = 8 external_endpoints.period_secs = 9 + inventory.period_secs = 10 + inventory.nkeep = 3 + inventory.disable = false [default_region_allocation_strategy] type = "random" "##, diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index e3a0debbbb..efcefdea43 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -12,7 +12,7 @@ //! would be the only consumer -- and in that case it's okay to query the //! database directly. 
-// NOTE: eminates from Tabled macros +// NOTE: emanates from Tabled macros #![allow(clippy::useless_vec)] use crate::Omdb; @@ -30,6 +30,7 @@ use diesel::BoolExpressionMethods; use diesel::ExpressionMethods; use diesel::JoinOnDsl; use diesel::NullableExpressionMethods; +use gateway_client::types::SpType; use nexus_db_model::Dataset; use nexus_db_model::Disk; use nexus_db_model::DnsGroup; @@ -37,17 +38,22 @@ use nexus_db_model::DnsName; use nexus_db_model::DnsVersion; use nexus_db_model::DnsZone; use nexus_db_model::ExternalIp; +use nexus_db_model::HwBaseboardId; use nexus_db_model::Instance; +use nexus_db_model::InvCollection; use nexus_db_model::Project; use nexus_db_model::Region; use nexus_db_model::RegionSnapshot; use nexus_db_model::Sled; use nexus_db_model::Snapshot; use nexus_db_model::SnapshotState; +use nexus_db_model::SwCaboose; use nexus_db_model::Vmm; use nexus_db_model::Zpool; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::DataStoreConnection; +use nexus_db_queries::db::datastore::DataStoreInventoryTest; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::lookup::LookupPath; @@ -56,11 +62,14 @@ use nexus_db_queries::db::DataStore; use nexus_types::identity::Resource; use nexus_types::internal_api::params::DnsRecord; use nexus_types::internal_api::params::Srv; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Generation; use omicron_common::postgres_config::PostgresConfigWithUrl; use std::cmp::Ordering; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Display; @@ -132,6 +141,8 @@ enum DbCommands { Disks(DiskArgs), /// Print information about internal and external DNS Dns(DnsArgs), + /// Print information about collected hardware/software inventory + Inventory(InventoryArgs), /// Print information about control plane services Services(ServicesArgs), /// Print information about sleds @@ -212,6 +223,42 @@ impl CliDnsGroup { } } +#[derive(Debug, Args)] +struct InventoryArgs { + #[command(subcommand)] + command: InventoryCommands, +} + +#[derive(Debug, Subcommand)] +enum InventoryCommands { + /// list all baseboards ever found + BaseboardIds, + /// list all cabooses ever found + Cabooses, + /// list and show details from particular collections + Collections(CollectionsArgs), +} + +#[derive(Debug, Args)] +struct CollectionsArgs { + #[command(subcommand)] + command: CollectionsCommands, +} + +#[derive(Debug, Subcommand)] +enum CollectionsCommands { + /// list collections + List, + /// show what was found in a particular collection + Show(CollectionsShowArgs), +} + +#[derive(Debug, Args)] +struct CollectionsShowArgs { + /// id of the collection + id: Uuid, +} + #[derive(Debug, Args)] struct ServicesArgs { #[command(subcommand)] @@ -335,6 +382,10 @@ impl DbArgs { cmd_db_dns_names(&opctx, &datastore, self.fetch_limit, args) .await } + DbCommands::Inventory(inventory_args) => { + cmd_db_inventory(&datastore, self.fetch_limit, inventory_args) + .await + } DbCommands::Services(ServicesArgs { command: ServicesCommands::ListInstances, }) => { @@ -429,15 +480,23 @@ where D: Display, { if items.len() == usize::try_from(limit.get()).unwrap() { - eprintln!( - "WARN: {}: found {} items (the limit). There may be more items \ - that were ignored. 
Consider overriding with --fetch-limit.", - context(), - items.len(), - ); + limit_error(limit, context); } } +fn limit_error(limit: NonZeroU32, context: F) +where + F: FnOnce() -> D, + D: Display, +{ + eprintln!( + "WARN: {}: found {} items (the limit). There may be more items \ + that were ignored. Consider overriding with --fetch-limit.", + context(), + limit, + ); +} + /// Returns pagination parameters to fetch the first page of results for a /// paginated endpoint fn first_page<'a, T>(limit: NonZeroU32) -> DataPageParams<'a, T> { @@ -1688,3 +1747,404 @@ fn format_record(record: &DnsRecord) -> impl Display { } } } + +// Inventory + +async fn cmd_db_inventory( + datastore: &DataStore, + limit: NonZeroU32, + inventory_args: &InventoryArgs, +) -> Result<(), anyhow::Error> { + let conn = datastore.pool_connection_for_tests().await?; + match inventory_args.command { + InventoryCommands::BaseboardIds => { + cmd_db_inventory_baseboard_ids(&conn, limit).await + } + InventoryCommands::Cabooses => { + cmd_db_inventory_cabooses(&conn, limit).await + } + InventoryCommands::Collections(CollectionsArgs { + command: CollectionsCommands::List, + }) => cmd_db_inventory_collections_list(&conn, limit).await, + InventoryCommands::Collections(CollectionsArgs { + command: CollectionsCommands::Show(CollectionsShowArgs { id }), + }) => cmd_db_inventory_collections_show(datastore, id, limit).await, + } +} + +async fn cmd_db_inventory_baseboard_ids( + conn: &DataStoreConnection<'_>, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct BaseboardRow { + id: Uuid, + part_number: String, + serial_number: String, + } + + use db::schema::hw_baseboard_id::dsl; + let baseboard_ids = dsl::hw_baseboard_id + .order_by((dsl::part_number, dsl::serial_number)) + .limit(i64::from(u32::from(limit))) + .select(HwBaseboardId::as_select()) + .load_async(&**conn) + .await + .context("loading baseboard ids")?; + check_limit(&baseboard_ids, limit, || "loading baseboard ids"); + + let rows = baseboard_ids.into_iter().map(|baseboard_id| BaseboardRow { + id: baseboard_id.id, + part_number: baseboard_id.part_number, + serial_number: baseboard_id.serial_number, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +async fn cmd_db_inventory_cabooses( + conn: &DataStoreConnection<'_>, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CabooseRow { + id: Uuid, + board: String, + git_commit: String, + name: String, + version: String, + } + + use db::schema::sw_caboose::dsl; + let mut cabooses = dsl::sw_caboose + .limit(i64::from(u32::from(limit))) + .select(SwCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading cabooses")?; + check_limit(&cabooses, limit, || "loading cabooses"); + cabooses.sort(); + + let rows = cabooses.into_iter().map(|caboose| CabooseRow { + id: caboose.id, + board: caboose.board, + name: caboose.name, + version: caboose.version, + git_commit: caboose.git_commit, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +async fn cmd_db_inventory_collections_list( + conn: &DataStoreConnection<'_>, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + 
#[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CollectionRow { + id: Uuid, + started: String, + took: String, + nsps: i64, + nerrors: i64, + } + + let collections = { + use db::schema::inv_collection::dsl; + dsl::inv_collection + .order_by(dsl::time_started) + .limit(i64::from(u32::from(limit))) + .select(InvCollection::as_select()) + .load_async(&**conn) + .await + .context("loading collections")? + }; + check_limit(&collections, limit, || "loading collections"); + + let mut rows = Vec::new(); + for collection in collections { + let nerrors = { + use db::schema::inv_collection_error::dsl; + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(collection.id)) + .select(diesel::dsl::count_star()) + .first_async(&**conn) + .await + .context("counting errors")? + }; + + let nsps = { + use db::schema::inv_service_processor::dsl; + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(collection.id)) + .select(diesel::dsl::count_star()) + .first_async(&**conn) + .await + .context("counting SPs")? + }; + + let took = format!( + "{} ms", + collection + .time_done + .signed_duration_since(&collection.time_started) + .num_milliseconds() + ); + rows.push(CollectionRow { + id: collection.id, + started: humantime::format_rfc3339_seconds( + collection.time_started.into(), + ) + .to_string(), + took, + nsps, + nerrors, + }); + } + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +async fn cmd_db_inventory_collections_show( + datastore: &DataStore, + id: Uuid, + limit: NonZeroU32, +) -> Result<(), anyhow::Error> { + let (collection, incomplete) = datastore + .inventory_collection_read_best_effort(id, limit) + .await + .context("reading collection")?; + if incomplete { + limit_error(limit, || "loading collection"); + } + + inv_collection_print(&collection).await?; + let nerrors = inv_collection_print_errors(&collection).await?; + inv_collection_print_devices(&collection).await?; + + if nerrors > 0 { + eprintln!( + "warning: {} collection error{} {} reported above", + nerrors, + if nerrors == 1 { "was" } else { "were" }, + if nerrors == 1 { "" } else { "s" } + ); + } + + Ok(()) +} + +async fn inv_collection_print( + collection: &Collection, +) -> Result<(), anyhow::Error> { + println!("collection: {}", collection.id); + println!( + "collector: {}{}", + collection.collector, + if collection.collector.parse::().is_ok() { + " (likely a Nexus instance)" + } else { + "" + } + ); + println!( + "started: {}", + humantime::format_rfc3339_millis(collection.time_started.into()) + ); + println!( + "done: {}", + humantime::format_rfc3339_millis(collection.time_done.into()) + ); + + Ok(()) +} + +async fn inv_collection_print_errors( + collection: &Collection, +) -> Result { + println!("errors: {}", collection.errors.len()); + for (index, message) in collection.errors.iter().enumerate() { + println!(" error {}: {}", index, message); + } + + Ok(collection + .errors + .len() + .try_into() + .expect("could not convert error count into u32 (yikes)")) +} + +async fn inv_collection_print_devices( + collection: &Collection, +) -> Result<(), anyhow::Error> { + // Assemble a list of baseboard ids, sorted first by device type (sled, + // switch, power), then by slot number. This is the order in which we will + // print everything out. 
+ let mut sorted_baseboard_ids: Vec<_> = + collection.sps.keys().cloned().collect(); + sorted_baseboard_ids.sort_by(|s1, s2| { + let sp1 = collection.sps.get(s1).unwrap(); + let sp2 = collection.sps.get(s2).unwrap(); + sp1.sp_type.cmp(&sp2.sp_type).then(sp1.sp_slot.cmp(&sp2.sp_slot)) + }); + + // Now print them. + for baseboard_id in &sorted_baseboard_ids { + // This unwrap should not fail because the collection we're iterating + // over came from the one we're looking into now. + let sp = collection.sps.get(baseboard_id).unwrap(); + let baseboard = collection.baseboards.get(baseboard_id); + let rot = collection.rots.get(baseboard_id); + + println!(""); + match baseboard { + None => { + // It should be impossible to find an SP whose baseboard + // information we didn't previously fetch. That's either a bug + // in this tool (for failing to fetch or find the right + // baseboard information) or the inventory system (for failing + // to insert a record into the hw_baseboard_id table). + println!( + "{:?} (serial number unknown -- this is a bug)", + sp.sp_type + ); + println!(" part number: unknown"); + } + Some(baseboard) => { + println!("{:?} {}", sp.sp_type, baseboard.serial_number); + println!(" part number: {}", baseboard.part_number); + } + }; + + println!(" power: {:?}", sp.power_state); + println!(" revision: {}", sp.baseboard_revision); + print!(" MGS slot: {:?} {}", sp.sp_type, sp.sp_slot); + if let SpType::Sled = sp.sp_type { + print!(" (cubby {})", sp.sp_slot); + } + println!(""); + println!(" found at: {} from {}", sp.time_collected, sp.source); + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct CabooseRow<'a> { + slot: String, + board: &'a str, + name: &'a str, + version: &'a str, + git_commit: &'a str, + } + + println!(" cabooses:"); + let caboose_rows: Vec<_> = CabooseWhich::iter() + .filter_map(|c| { + collection.caboose_for(c, baseboard_id).map(|d| (c, d)) + }) + .map(|(c, found_caboose)| CabooseRow { + slot: format!("{:?}", c), + board: &found_caboose.caboose.board, + name: &found_caboose.caboose.name, + version: &found_caboose.caboose.version, + git_commit: &found_caboose.caboose.git_commit, + }) + .collect(); + let table = tabled::Table::new(caboose_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", textwrap::indent(&table.to_string(), " ")); + + if let Some(rot) = rot { + println!(" RoT: active slot: slot {:?}", rot.active_slot); + println!( + " RoT: persistent boot preference: slot {:?}", + rot.persistent_boot_preference, + ); + println!( + " RoT: pending persistent boot preference: {}", + rot.pending_persistent_boot_preference + .map(|s| format!("slot {:?}", s)) + .unwrap_or_else(|| String::from("-")) + ); + println!( + " RoT: transient boot preference: {}", + rot.transient_boot_preference + .map(|s| format!("slot {:?}", s)) + .unwrap_or_else(|| String::from("-")) + ); + + println!( + " RoT: slot A SHA3-256: {}", + rot.slot_a_sha3_256_digest + .clone() + .unwrap_or_else(|| String::from("-")) + ); + + println!( + " RoT: slot B SHA3-256: {}", + rot.slot_b_sha3_256_digest + .clone() + .unwrap_or_else(|| String::from("-")) + ); + } else { + println!(" RoT: no information found"); + } + } + + println!(""); + for sp_missing_rot in collection + .sps + .keys() + .collect::>() + .difference(&collection.rots.keys().collect::>()) + { + // It's not a bug in either omdb or the inventory system to find an SP + // with no RoT. 
It just means that when we collected inventory from the + // SP, it couldn't communicate with its RoT. + let sp = collection.sps.get(*sp_missing_rot).unwrap(); + println!( + "warning: found SP with no RoT: {:?} slot {}", + sp.sp_type, sp.sp_slot + ); + } + + for rot_missing_sp in collection + .rots + .keys() + .collect::>() + .difference(&collection.sps.keys().collect::>()) + { + // It *is* a bug in the inventory system (or omdb) to find an RoT with + // no SP, since we get the RoT information from the SP in the first + // place. + println!( + "error: found RoT with no SP: \ + hw_baseboard_id {:?} -- this is a bug", + rot_missing_sp + ); + } + + Ok(()) +} diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/bin/omdb/mgs.rs index d2938418e1..770cba9f62 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs.rs @@ -433,7 +433,7 @@ async fn show_sp_details( board: caboose.board, git_commit: caboose.git_commit, name: caboose.name, - version: caboose.version.unwrap_or_else(|| "-".to_string()), + version: caboose.version, } } } diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 7599fc209d..128d4315f2 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -6,6 +6,7 @@ use crate::Omdb; use anyhow::Context; +use chrono::DateTime; use chrono::SecondsFormat; use chrono::Utc; use clap::Args; @@ -144,7 +145,10 @@ async fn cmd_nexus_background_tasks_show( ) -> Result<(), anyhow::Error> { let response = client.bgtask_list().await.context("listing background tasks")?; - let mut tasks = response.into_inner(); + // Convert the HashMap to a BTreeMap because we want the keys in sorted + // order. + let mut tasks = + response.into_inner().into_iter().collect::>(); // We want to pick the order that we print some tasks intentionally. Then // we want to print anything else that we find. @@ -478,6 +482,38 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } } } + } else if name == "inventory_collection" { + #[derive(Deserialize)] + struct InventorySuccess { + collection_id: Uuid, + time_started: DateTime, + time_done: DateTime, + } + + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(found_inventory) => { + println!( + " last collection id: {}", + found_inventory.collection_id + ); + println!( + " last collection started: {}", + found_inventory + .time_started + .to_rfc3339_opts(SecondsFormat::Secs, true), + ); + println!( + " last collection done: {}", + found_inventory + .time_done + .to_rfc3339_opts(SecondsFormat::Secs, true), + ); + } + }; } else { println!( "warning: unknown background task: {:?} \ diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 0fbef95f27..7949c1eb61 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -57,6 +57,10 @@ task: "external_endpoints" on each one +task: "inventory_collection" + collects hardware and software inventory data from the whole system + + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT @@ -113,6 +117,10 @@ task: "external_endpoints" on each one +task: "inventory_collection" + collects hardware and software inventory data from the whole system + + --------------------------------------------- stderr: note: Nexus URL not specified. Will pick one from DNS. 
@@ -156,6 +164,10 @@ task: "external_endpoints" on each one +task: "inventory_collection" + collects hardware and software inventory data from the whole system + + --------------------------------------------- stderr: note: Nexus URL not specified. Will pick one from DNS. diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index a830cf671a..8162b6d9de 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -251,6 +251,10 @@ task: "external_endpoints" on each one +task: "inventory_collection" + collects hardware and software inventory data from the whole system + + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ @@ -332,6 +336,15 @@ task: "external_endpoints" TLS certificates: 0 +task: "inventory_collection" + configured period: every 10m + currently executing: no + last completed activation: iter 3, triggered by an explicit signal + started at (s ago) and ran for ms + last collection id: REDACTED_UUID_REDACTED_UUID_REDACTED + last collection started: + last collection done: + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 6ab6cb33fc..e859c325a5 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -92,6 +92,7 @@ Usage: omdb db [OPTIONS] Commands: disks Print information about disks dns Print information about internal and external DNS + inventory Print information about collected hardware/software inventory services Print information about control plane services sleds Print information about sleds instances Print information about customer instances @@ -115,6 +116,7 @@ Usage: omdb db [OPTIONS] Commands: disks Print information about disks dns Print information about internal and external DNS + inventory Print information about collected hardware/software inventory services Print information about control plane services sleds Print information about sleds instances Print information about customer instances diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index e79184f7e5..66778d96e7 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -403,6 +403,10 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { cptestctx.silo_name, cptestctx.external_dns_zone_name, ); + println!( + "omicron-dev: management gateway: http://{}", + cptestctx.gateway.client.bind_address, + ); println!("omicron-dev: silo name: {}", cptestctx.silo_name,); println!( "omicron-dev: privileged user name: {}", diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 12bc7b465a..2db6121f1d 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -29,8 +29,6 @@ use dropshot::WebsocketEndpointResult; use dropshot::WebsocketUpgrade; use futures::TryFutureExt; use gateway_messages::SpComponent; -use gateway_messages::SpError; -use gateway_sp_comms::error::CommunicationError; use gateway_sp_comms::HostPhase2Provider; use omicron_common::update::ArtifactHash; use schemars::JsonSchema; @@ -488,7 +486,7 @@ pub struct SpComponentCaboose { pub git_commit: String, pub board: String, pub name: String, - pub version: Option, + pub version: String, } /// Identity of a host phase2 recovery image. 
@@ -725,18 +723,15 @@ async fn sp_component_caboose_get( .read_component_caboose(component, firmware_slot, CABOOSE_KEY_NAME) .await .map_err(SpCommsError::from)?; - let version = match sp + let version = sp .read_component_caboose(component, firmware_slot, CABOOSE_KEY_VERSION) .await - { - Ok(value) => Some(from_utf8(&CABOOSE_KEY_VERSION, value)?), - Err(CommunicationError::SpError(SpError::NoSuchCabooseKey(_))) => None, - Err(err) => return Err(SpCommsError::from(err).into()), - }; + .map_err(SpCommsError::from)?; let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; let name = from_utf8(&CABOOSE_KEY_NAME, name)?; + let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; let caboose = SpComponentCaboose { git_commit, board, name, version }; diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 323386ba25..feb25eb1f1 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -84,6 +84,7 @@ usdt.workspace = true nexus-defaults.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true +nexus-inventory.workspace = true nexus-types.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs new file mode 100644 index 0000000000..5b09f289bb --- /dev/null +++ b/nexus/db-model/src/inventory.rs @@ -0,0 +1,443 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for representing the hardware/software inventory in the database + +use crate::schema::{ + hw_baseboard_id, inv_caboose, inv_collection, inv_collection_error, + inv_root_of_trust, inv_service_processor, sw_caboose, +}; +use crate::{impl_enum_type, SqlU16, SqlU32}; +use chrono::DateTime; +use chrono::Utc; +use diesel::backend::Backend; +use diesel::deserialize::{self, FromSql}; +use diesel::expression::AsExpression; +use diesel::pg::Pg; +use diesel::serialize::ToSql; +use diesel::{serialize, sql_types}; +use nexus_types::inventory::{ + BaseboardId, Caboose, Collection, PowerState, RotSlot, +}; +use uuid::Uuid; + +// See [`nexus_types::inventory::PowerState`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "hw_power_state"))] + pub struct HwPowerStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = HwPowerStateEnum)] + pub enum HwPowerState; + + // Enum values + A0 => b"A0" + A1 => b"A1" + A2 => b"A2" +); + +impl From for HwPowerState { + fn from(p: PowerState) -> Self { + match p { + PowerState::A0 => HwPowerState::A0, + PowerState::A1 => HwPowerState::A1, + PowerState::A2 => HwPowerState::A2, + } + } +} + +impl From for PowerState { + fn from(value: HwPowerState) -> Self { + match value { + HwPowerState::A0 => PowerState::A0, + HwPowerState::A1 => PowerState::A1, + HwPowerState::A2 => PowerState::A2, + } + } +} + +// See [`nexus_types::inventory::RotSlot`]. 
+impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "hw_rot_slot"))] + pub struct HwRotSlotEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = HwRotSlotEnum)] + pub enum HwRotSlot; + + // Enum values + A => b"A" + B => b"B" +); + +impl From for HwRotSlot { + fn from(value: RotSlot) -> Self { + match value { + RotSlot::A => HwRotSlot::A, + RotSlot::B => HwRotSlot::B, + } + } +} + +impl From for RotSlot { + fn from(value: HwRotSlot) -> RotSlot { + match value { + HwRotSlot::A => RotSlot::A, + HwRotSlot::B => RotSlot::B, + } + } +} + +// See [`nexus_types::inventory::CabooseWhich`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "caboose_which"))] + pub struct CabooseWhichEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + #[diesel(sql_type = CabooseWhichEnum)] + pub enum CabooseWhich; + + // Enum values + SpSlot0 => b"sp_slot_0" + SpSlot1 => b"sp_slot_1" + RotSlotA => b"rot_slot_A" + RotSlotB => b"rot_slot_B" +); + +impl From for CabooseWhich { + fn from(c: nexus_types::inventory::CabooseWhich) -> Self { + use nexus_types::inventory as nexus_inventory; + match c { + nexus_inventory::CabooseWhich::SpSlot0 => CabooseWhich::SpSlot0, + nexus_inventory::CabooseWhich::SpSlot1 => CabooseWhich::SpSlot1, + nexus_inventory::CabooseWhich::RotSlotA => CabooseWhich::RotSlotA, + nexus_inventory::CabooseWhich::RotSlotB => CabooseWhich::RotSlotB, + } + } +} + +impl From for nexus_types::inventory::CabooseWhich { + fn from(row: CabooseWhich) -> Self { + use nexus_types::inventory as nexus_inventory; + match row { + CabooseWhich::SpSlot0 => nexus_inventory::CabooseWhich::SpSlot0, + CabooseWhich::SpSlot1 => nexus_inventory::CabooseWhich::SpSlot1, + CabooseWhich::RotSlotA => nexus_inventory::CabooseWhich::RotSlotA, + CabooseWhich::RotSlotB => nexus_inventory::CabooseWhich::RotSlotB, + } + } +} + +// See [`nexus_types::inventory::SpType`]. +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "sp_type"))] + pub struct SpTypeEnum; + + #[derive( + Copy, + Clone, + Debug, + AsExpression, + FromSqlRow, + PartialOrd, + Ord, + PartialEq, + Eq + )] + #[diesel(sql_type = SpTypeEnum)] + pub enum SpType; + + // Enum values + Sled => b"sled" + Switch => b"switch" + Power => b"power" +); + +impl From for SpType { + fn from(value: nexus_types::inventory::SpType) -> Self { + match value { + nexus_types::inventory::SpType::Sled => SpType::Sled, + nexus_types::inventory::SpType::Power => SpType::Power, + nexus_types::inventory::SpType::Switch => SpType::Switch, + } + } +} + +impl From for nexus_types::inventory::SpType { + fn from(value: SpType) -> Self { + match value { + SpType::Sled => nexus_types::inventory::SpType::Sled, + SpType::Switch => nexus_types::inventory::SpType::Switch, + SpType::Power => nexus_types::inventory::SpType::Power, + } + } +} + +/// See [`nexus_types::inventory::Collection`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_collection)] +pub struct InvCollection { + pub id: Uuid, + pub time_started: DateTime, + pub time_done: DateTime, + pub collector: String, +} + +impl<'a> From<&'a Collection> for InvCollection { + fn from(c: &'a Collection) -> Self { + InvCollection { + id: c.id, + time_started: c.time_started, + time_done: c.time_done, + collector: c.collector.clone(), + } + } +} + +/// See [`nexus_types::inventory::BaseboardId`]. 
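+///
+/// A row here maps the hardware-provided identifiers (part number and serial
+/// number) to a synthetic Omicron id. The id is generated with
+/// `Uuid::new_v4()` when a baseboard is first inserted; the rows contain only
+/// immutable data and are shared across collections rather than being scoped
+/// to any one `inv_collection`.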
+#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = hw_baseboard_id)] +pub struct HwBaseboardId { + pub id: Uuid, + pub part_number: String, + pub serial_number: String, +} + +impl From for HwBaseboardId { + fn from(c: BaseboardId) -> Self { + HwBaseboardId { + id: Uuid::new_v4(), + part_number: c.part_number, + serial_number: c.serial_number, + } + } +} + +impl From for BaseboardId { + fn from(row: HwBaseboardId) -> Self { + BaseboardId { + part_number: row.part_number, + serial_number: row.serial_number, + } + } +} + +/// See [`nexus_types::inventory::Caboose`]. +#[derive( + Queryable, + Insertable, + Clone, + Debug, + Selectable, + Eq, + PartialEq, + Ord, + PartialOrd, +)] +#[diesel(table_name = sw_caboose)] +pub struct SwCaboose { + pub id: Uuid, + pub board: String, + pub git_commit: String, + pub name: String, + pub version: String, +} + +impl From for SwCaboose { + fn from(c: Caboose) -> Self { + SwCaboose { + id: Uuid::new_v4(), + board: c.board, + git_commit: c.git_commit, + name: c.name, + version: c.version, + } + } +} + +impl From for Caboose { + fn from(row: SwCaboose) -> Self { + Self { + board: row.board, + git_commit: row.git_commit, + name: row.name, + version: row.version, + } + } +} + +/// See [`nexus_types::inventory::Collection`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_collection_error)] +pub struct InvCollectionError { + pub inv_collection_id: Uuid, + pub idx: SqlU16, + pub message: String, +} + +impl InvCollectionError { + pub fn new(inv_collection_id: Uuid, idx: u16, message: String) -> Self { + InvCollectionError { + inv_collection_id, + idx: SqlU16::from(idx), + message, + } + } +} + +/// See [`nexus_types::inventory::ServiceProcessor`]. +#[derive(Queryable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_service_processor)] +pub struct InvServiceProcessor { + pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, + pub time_collected: DateTime, + pub source: String, + + pub sp_type: SpType, + pub sp_slot: SpMgsSlot, + + pub baseboard_revision: BaseboardRevision, + pub hubris_archive_id: String, + pub power_state: HwPowerState, +} + +impl From for nexus_types::inventory::ServiceProcessor { + fn from(row: InvServiceProcessor) -> Self { + Self { + time_collected: row.time_collected, + source: row.source, + sp_type: nexus_types::inventory::SpType::from(row.sp_type), + sp_slot: **row.sp_slot, + baseboard_revision: **row.baseboard_revision, + hubris_archive: row.hubris_archive_id, + power_state: PowerState::from(row.power_state), + } + } +} + +/// Newtype wrapping the MGS-reported slot number for an SP +/// +/// Current racks only have 32 slots for any given SP type. MGS represents the +/// slot number with a u32. We truncate it to a u16 (which still requires +/// storing it as an i32 in the database, since the database doesn't natively +/// support signed integers). +#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow)] +#[diesel(sql_type = sql_types::Int4)] +pub struct SpMgsSlot(SqlU16); + +NewtypeFrom! { () pub struct SpMgsSlot(SqlU16); } +NewtypeDeref! { () pub struct SpMgsSlot(SqlU16); } +NewtypeDisplay! 
{ () pub struct SpMgsSlot(SqlU16); } + +impl ToSql for SpMgsSlot { + fn to_sql<'a>( + &'a self, + out: &mut serialize::Output<'a, '_, Pg>, + ) -> serialize::Result { + >::to_sql( + &self.0, + &mut out.reborrow(), + ) + } +} + +impl FromSql for SpMgsSlot +where + DB: Backend, + SqlU16: FromSql, +{ + fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result { + Ok(SpMgsSlot(SqlU16::from_sql(bytes)?)) + } +} + +/// Newtype wrapping the revision number for a particular baseboard +/// +/// MGS reports this as a u32 and we represent it the same way, though that +/// would be quite a lot of hardware revisions to go through! +#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow)] +#[diesel(sql_type = sql_types::Int8)] +pub struct BaseboardRevision(SqlU32); + +NewtypeFrom! { () pub struct BaseboardRevision(SqlU32); } +NewtypeDeref! { () pub struct BaseboardRevision(SqlU32); } +NewtypeDisplay! { () pub struct BaseboardRevision(SqlU32); } + +impl ToSql for BaseboardRevision { + fn to_sql<'a>( + &'a self, + out: &mut serialize::Output<'a, '_, Pg>, + ) -> serialize::Result { + >::to_sql( + &self.0, + &mut out.reborrow(), + ) + } +} + +impl FromSql for BaseboardRevision +where + DB: Backend, + SqlU32: FromSql, +{ + fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result { + Ok(BaseboardRevision(SqlU32::from_sql(bytes)?)) + } +} + +/// See [`nexus_types::inventory::RotState`]. +#[derive(Queryable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_root_of_trust)] +pub struct InvRootOfTrust { + pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, + pub time_collected: DateTime, + pub source: String, + + pub slot_active: HwRotSlot, + pub slot_boot_pref_transient: Option, + pub slot_boot_pref_persistent: HwRotSlot, + pub slot_boot_pref_persistent_pending: Option, + pub slot_a_sha3_256: Option, + pub slot_b_sha3_256: Option, +} + +impl From for nexus_types::inventory::RotState { + fn from(row: InvRootOfTrust) -> Self { + Self { + time_collected: row.time_collected, + source: row.source, + active_slot: RotSlot::from(row.slot_active), + persistent_boot_preference: RotSlot::from( + row.slot_boot_pref_persistent, + ), + pending_persistent_boot_preference: row + .slot_boot_pref_persistent_pending + .map(RotSlot::from), + transient_boot_preference: row + .slot_boot_pref_transient + .map(RotSlot::from), + slot_a_sha3_256_digest: row.slot_a_sha3_256, + slot_b_sha3_256_digest: row.slot_b_sha3_256, + } + } +} + +/// See [`nexus_types::inventory::CabooseFound`]. +#[derive(Queryable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_caboose)] +pub struct InvCaboose { + pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, + pub time_collected: DateTime, + pub source: String, + + pub which: CabooseWhich, + pub sw_caboose_id: Uuid, +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index f399605f55..7aa8a6b076 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -32,6 +32,7 @@ mod image; mod instance; mod instance_cpu_count; mod instance_state; +mod inventory; mod ip_pool; mod ipv4net; mod ipv6; @@ -121,6 +122,7 @@ pub use image::*; pub use instance::*; pub use instance_cpu_count::*; pub use instance_state::*; +pub use inventory::*; pub use ip_pool::*; pub use ipv4net::*; pub use ipv6::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index e079432e5a..cff261e01a 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1140,6 +1140,87 @@ table! { } } +/* hardware inventory */ + +table! 
{ + hw_baseboard_id (id) { + id -> Uuid, + part_number -> Text, + serial_number -> Text, + } +} + +table! { + sw_caboose (id) { + id -> Uuid, + board -> Text, + git_commit -> Text, + name -> Text, + version -> Text, + } +} + +table! { + inv_collection (id) { + id -> Uuid, + time_started -> Timestamptz, + time_done -> Timestamptz, + collector -> Text, + } +} + +table! { + inv_collection_error (inv_collection_id, idx) { + inv_collection_id -> Uuid, + idx -> Int4, + message -> Text, + } +} + +table! { + inv_service_processor (inv_collection_id, hw_baseboard_id) { + inv_collection_id -> Uuid, + hw_baseboard_id -> Uuid, + time_collected -> Timestamptz, + source -> Text, + + sp_type -> crate::SpTypeEnum, + sp_slot -> Int4, + + baseboard_revision -> Int8, + hubris_archive_id -> Text, + power_state -> crate::HwPowerStateEnum, + } +} + +table! { + inv_root_of_trust (inv_collection_id, hw_baseboard_id) { + inv_collection_id -> Uuid, + hw_baseboard_id -> Uuid, + time_collected -> Timestamptz, + source -> Text, + + slot_active -> crate::HwRotSlotEnum, + slot_boot_pref_transient -> Nullable, + slot_boot_pref_persistent -> crate::HwRotSlotEnum, + slot_boot_pref_persistent_pending -> Nullable, + slot_a_sha3_256 -> Nullable, + slot_b_sha3_256 -> Nullable, + } +} + +table! { + inv_caboose (inv_collection_id, hw_baseboard_id, which) { + inv_collection_id -> Uuid, + hw_baseboard_id -> Uuid, + time_collected -> Timestamptz, + source -> Text, + + which -> crate::CabooseWhichEnum, + sw_caboose_id -> Uuid, + } +} + table! { bootstore_keys (key, generation) { key -> Text, @@ -1162,7 +1243,7 @@ table! { /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(8, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(9, 0, 0); allow_tables_to_appear_in_same_query!( system_update, @@ -1174,6 +1255,10 @@ joinable!(system_update_component_update -> component_update (component_update_i allow_tables_to_appear_in_same_query!(ip_pool_range, ip_pool); joinable!(ip_pool_range -> ip_pool (ip_pool_id)); +allow_tables_to_appear_in_same_query!(inv_collection, inv_collection_error); +joinable!(inv_collection_error -> inv_collection (inv_collection_id)); +allow_tables_to_appear_in_same_query!(hw_baseboard_id, sw_caboose, inv_caboose); + allow_tables_to_appear_in_same_query!( dataset, disk, diff --git a/nexus/db-model/src/unsigned.rs b/nexus/db-model/src/unsigned.rs index 7059c6bcad..b4e9db2308 100644 --- a/nexus/db-model/src/unsigned.rs +++ b/nexus/db-model/src/unsigned.rs @@ -83,6 +83,7 @@ pub struct SqlU16(pub u16); NewtypeFrom! { () pub struct SqlU16(u16); } NewtypeDeref! { () pub struct SqlU16(u16); } +NewtypeDisplay! { () pub struct SqlU16(u16); } impl SqlU16 { pub fn new(value: u16) -> Self { @@ -134,6 +135,7 @@ pub struct SqlU32(pub u32); NewtypeFrom! { () pub struct SqlU32(u32); } NewtypeDeref! { () pub struct SqlU32(u32); } +NewtypeDisplay! 
{ () pub struct SqlU32(u32); } impl SqlU32 { pub fn new(value: u32) -> Self { diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index c16c0f5319..5edf4f1e89 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -70,8 +70,10 @@ omicron-workspace-hack.workspace = true assert_matches.workspace = true expectorate.workspace = true hyper-rustls.workspace = true +gateway-client.workspace = true internal-dns.workspace = true itertools.workspace = true +nexus-inventory.workspace = true nexus-test-utils.workspace = true omicron-sled-agent.workspace = true omicron-test-utils.workspace = true diff --git a/nexus/db-queries/src/authz/api_resources.rs b/nexus/db-queries/src/authz/api_resources.rs index ec959e2907..b22fe1ac25 100644 --- a/nexus/db-queries/src/authz/api_resources.rs +++ b/nexus/db-queries/src/authz/api_resources.rs @@ -473,6 +473,61 @@ impl AuthorizedResource for DeviceAuthRequestList { } } +/// Synthetic resource used for modeling access to low-level hardware inventory +/// data +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Inventory; +pub const INVENTORY: Inventory = Inventory {}; + +impl oso::PolarClass for Inventory { + fn get_polar_class_builder() -> oso::ClassBuilder { + // Roles are not directly attached to Inventory + oso::Class::builder() + .with_equality_check() + .add_method( + "has_role", + |_: &Inventory, _actor: AuthenticatedActor, _role: String| { + false + }, + ) + .add_attribute_getter("fleet", |_| FLEET) + } +} + +impl AuthorizedResource for Inventory { + fn load_roles<'a, 'b, 'c, 'd, 'e, 'f>( + &'a self, + opctx: &'b OpContext, + datastore: &'c DataStore, + authn: &'d authn::Context, + roleset: &'e mut RoleSet, + ) -> futures::future::BoxFuture<'f, Result<(), Error>> + where + 'a: 'f, + 'b: 'f, + 'c: 'f, + 'd: 'f, + 'e: 'f, + { + load_roles_for_resource_tree(&FLEET, opctx, datastore, authn, roleset) + .boxed() + } + + fn on_unauthorized( + &self, + _: &Authz, + error: Error, + _: AnyActor, + _: Action, + ) -> Error { + error + } + + fn polar_class(&self) -> oso::Class { + Self::get_polar_class() + } +} + /// Synthetic resource describing the list of Certificates associated with a /// Silo #[derive(Clone, Debug, Eq, PartialEq)] diff --git a/nexus/db-queries/src/authz/omicron.polar b/nexus/db-queries/src/authz/omicron.polar index 119eccc8e9..87fdf72f6a 100644 --- a/nexus/db-queries/src/authz/omicron.polar +++ b/nexus/db-queries/src/authz/omicron.polar @@ -365,6 +365,16 @@ resource DnsConfig { has_relation(fleet: Fleet, "parent_fleet", dns_config: DnsConfig) if dns_config.fleet = fleet; +# Describes the policy for reading and modifying low-level inventory +resource Inventory { + permissions = [ "read", "modify" ]; + relations = { parent_fleet: Fleet }; + "read" if "viewer" on "parent_fleet"; + "modify" if "admin" on "parent_fleet"; +} +has_relation(fleet: Fleet, "parent_fleet", inventory: Inventory) + if inventory.fleet = fleet; + # Describes the policy for accessing "/v1/system/ip-pools" in the API resource IpPoolList { permissions = [ diff --git a/nexus/db-queries/src/authz/oso_generic.rs b/nexus/db-queries/src/authz/oso_generic.rs index bcd7a42945..e642062ead 100644 --- a/nexus/db-queries/src/authz/oso_generic.rs +++ b/nexus/db-queries/src/authz/oso_generic.rs @@ -106,6 +106,7 @@ pub fn make_omicron_oso(log: &slog::Logger) -> Result { Database::get_polar_class(), DnsConfig::get_polar_class(), Fleet::get_polar_class(), + Inventory::get_polar_class(), IpPoolList::get_polar_class(), 
ConsoleSessionList::get_polar_class(), DeviceAuthRequestList::get_polar_class(), diff --git a/nexus/db-queries/src/authz/policy_test/resource_builder.rs b/nexus/db-queries/src/authz/policy_test/resource_builder.rs index a4c68ea000..f10c969038 100644 --- a/nexus/db-queries/src/authz/policy_test/resource_builder.rs +++ b/nexus/db-queries/src/authz/policy_test/resource_builder.rs @@ -244,9 +244,10 @@ macro_rules! impl_dyn_authorized_resource_for_global { impl_dyn_authorized_resource_for_global!(authz::oso_generic::Database); impl_dyn_authorized_resource_for_global!(authz::ConsoleSessionList); +impl_dyn_authorized_resource_for_global!(authz::DeviceAuthRequestList); impl_dyn_authorized_resource_for_global!(authz::DnsConfig); impl_dyn_authorized_resource_for_global!(authz::IpPoolList); -impl_dyn_authorized_resource_for_global!(authz::DeviceAuthRequestList); +impl_dyn_authorized_resource_for_global!(authz::Inventory); impl DynAuthorizedResource for authz::SiloCertificateList { fn do_authorize<'a, 'b>( diff --git a/nexus/db-queries/src/authz/policy_test/resources.rs b/nexus/db-queries/src/authz/policy_test/resources.rs index 054fe6430b..3049f3b9bf 100644 --- a/nexus/db-queries/src/authz/policy_test/resources.rs +++ b/nexus/db-queries/src/authz/policy_test/resources.rs @@ -67,6 +67,7 @@ pub async fn make_resources( builder.new_resource(authz::CONSOLE_SESSION_LIST); builder.new_resource(authz::DNS_CONFIG); builder.new_resource(authz::DEVICE_AUTH_REQUEST_LIST); + builder.new_resource(authz::INVENTORY); builder.new_resource(authz::IP_POOL_LIST); // Silo/organization/project hierarchy diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs new file mode 100644 index 0000000000..6b7d97754a --- /dev/null +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -0,0 +1,1518 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
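+
+// How a `Collection` is represented here (see the `table!` definitions in
+// nexus/db-model/src/schema.rs):
+//
+// - `inv_collection` is the root row for a collection (id, time_started,
+//   time_done, collector); any errors recorded during collection land in
+//   `inv_collection_error`.
+// - `hw_baseboard_id` and `sw_caboose` map immutable identifiers (baseboard
+//   part/serial number, caboose contents) to synthetic UUIDs and are shared
+//   across collections.
+// - `inv_service_processor`, `inv_root_of_trust`, and `inv_caboose` hold the
+//   per-collection data, referencing `inv_collection` and `hw_baseboard_id`
+//   (and, for cabooses, `sw_caboose`).
+//
+// Per-collection rows are inserted with `INSERT INTO ... SELECT` against the
+// identifier tables so that the database resolves those foreign keys itself,
+// avoiding a round-trip per row; see the comments in
+// `inventory_insert_collection` below.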
+ +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; +use crate::db::TransactionError; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use async_bb8_diesel::AsyncConnection; +use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::AsyncSimpleConnection; +use diesel::expression::SelectableHelper; +use diesel::sql_types::Nullable; +use diesel::BoolExpressionMethods; +use diesel::ExpressionMethods; +use diesel::IntoSql; +use diesel::JoinOnDsl; +use diesel::NullableExpressionMethods; +use diesel::QueryDsl; +use diesel::Table; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_model::CabooseWhichEnum; +use nexus_db_model::HwBaseboardId; +use nexus_db_model::HwPowerState; +use nexus_db_model::HwPowerStateEnum; +use nexus_db_model::HwRotSlot; +use nexus_db_model::HwRotSlotEnum; +use nexus_db_model::InvCaboose; +use nexus_db_model::InvCollection; +use nexus_db_model::InvCollectionError; +use nexus_db_model::InvRootOfTrust; +use nexus_db_model::InvServiceProcessor; +use nexus_db_model::SpType; +use nexus_db_model::SpTypeEnum; +use nexus_db_model::SwCaboose; +use nexus_types::inventory::Collection; +use omicron_common::api::external::Error; +use omicron_common::api::external::InternalContext; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::num::NonZeroU32; +use std::sync::Arc; +use uuid::Uuid; + +impl DataStore { + /// Store a complete inventory collection into the database + pub async fn inventory_insert_collection( + &self, + opctx: &OpContext, + collection: &Collection, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?; + + // In the database, the collection is represented essentially as a tree + // rooted at an `inv_collection` row. Other nodes in the tree point + // back at the `inv_collection` via `inv_collection_id`. + // + // It's helpful to assemble some values before entering the transaction + // so that we can produce the `Error` type that we want here. + let row_collection = InvCollection::from(collection); + let collection_id = row_collection.id; + let baseboards = collection + .baseboards + .iter() + .map(|b| HwBaseboardId::from((**b).clone())) + .collect::>(); + let cabooses = collection + .cabooses + .iter() + .map(|s| SwCaboose::from((**s).clone())) + .collect::>(); + let error_values = collection + .errors + .iter() + .enumerate() + .map(|(i, message)| { + let index = u16::try_from(i).map_err(|e| { + Error::internal_error(&format!( + "failed to convert error index to u16 (too \ + many errors in inventory collection?): {}", + e + )) + })?; + Ok(InvCollectionError::new( + collection_id, + index, + message.clone(), + )) + }) + .collect::, Error>>()?; + + // This implementation inserts all records associated with the + // collection in one transaction. This is primarily for simplicity. It + // means we don't have to worry about other readers seeing a + // half-inserted collection, nor leaving detritus around if we start + // inserting records and then crash. However, it does mean this is + // likely to be a big transaction and if that becomes a problem we could + // break this up as long as we address those problems. + // + // The SQL here is written so that it doesn't have to be an + // *interactive* transaction. 
That is, it should in principle be + // possible to generate all this SQL up front and send it as one big + // batch rather than making a bunch of round-trips to the database. + // We'd do that if we had an interface for doing that with bound + // parameters, etc. See oxidecomputer/omicron#973. + let pool = self.pool_connection_authorized(opctx).await?; + pool.transaction_async(|conn| async move { + // Insert records (and generate ids) for any baseboards that do not + // already exist in the database. These rows are not scoped to a + // particular collection. They contain only immutable data -- + // they're just a mapping between hardware-provided baseboard + // identifiers (part number and model number) and an + // Omicron-specific primary key (a UUID). + { + use db::schema::hw_baseboard_id::dsl; + let _ = diesel::insert_into(dsl::hw_baseboard_id) + .values(baseboards) + .on_conflict_do_nothing() + .execute_async(&conn) + .await?; + } + + // Insert records (and generate ids) for each distinct caboose that + // we've found. Like baseboards, these might already be present and + // rows in this table are not scoped to a particular collection + // because they only map (immutable) identifiers to UUIDs. + { + use db::schema::sw_caboose::dsl; + let _ = diesel::insert_into(dsl::sw_caboose) + .values(cabooses) + .on_conflict_do_nothing() + .execute_async(&conn) + .await?; + } + + // Insert a record describing the collection itself. + { + use db::schema::inv_collection::dsl; + let _ = diesel::insert_into(dsl::inv_collection) + .values(row_collection) + .execute_async(&conn) + .await?; + } + + // Insert rows for the service processors we found. These have a + // foreign key into the hw_baseboard_id table. We don't have those + // id values, though. We may have just inserted them, or maybe not + // (if they previously existed). To avoid dozens of unnecessary + // round-trips, we use INSERT INTO ... SELECT, which looks like + // this: + // + // INSERT INTO inv_service_processor + // SELECT + // id + // [other service_processor column values as literals] + // FROM hw_baseboard_id + // WHERE part_number = ... AND serial_number = ...; + // + // This way, we don't need to know the id. The database looks it up + // for us as it does the INSERT. + { + use db::schema::hw_baseboard_id::dsl as baseboard_dsl; + use db::schema::inv_service_processor::dsl as sp_dsl; + + for (baseboard_id, sp) in &collection.sps { + let selection = db::schema::hw_baseboard_id::table + .select(( + collection_id.into_sql::(), + baseboard_dsl::id, + sp.time_collected + .into_sql::(), + sp.source + .clone() + .into_sql::(), + SpType::from(sp.sp_type).into_sql::(), + i32::from(sp.sp_slot) + .into_sql::(), + i64::from(sp.baseboard_revision) + .into_sql::(), + sp.hubris_archive + .clone() + .into_sql::(), + HwPowerState::from(sp.power_state) + .into_sql::(), + )) + .filter( + baseboard_dsl::part_number + .eq(baseboard_id.part_number.clone()), + ) + .filter( + baseboard_dsl::serial_number + .eq(baseboard_id.serial_number.clone()), + ); + + let _ = diesel::insert_into( + db::schema::inv_service_processor::table, + ) + .values(selection) + .into_columns(( + sp_dsl::inv_collection_id, + sp_dsl::hw_baseboard_id, + sp_dsl::time_collected, + sp_dsl::source, + sp_dsl::sp_type, + sp_dsl::sp_slot, + sp_dsl::baseboard_revision, + sp_dsl::hubris_archive_id, + sp_dsl::power_state, + )) + .execute_async(&conn) + .await?; + + // This statement is just here to force a compilation error + // if the set of columns in `inv_service_processor` changes. 
+ // The code above attempts to insert a row into + // `inv_service_processor` using an explicit list of columns + // and values. Without the following statement, If a new + // required column were added, this would only fail at + // runtime. + // + // If you're here because of a compile error, you might be + // changing the `inv_service_processor` table. Update the + // statement below and be sure to update the code above, + // too! + // + // See also similar comments in blocks below, near other + // uses of `all_columns(). + let ( + _inv_collection_id, + _hw_baseboard_id, + _time_collected, + _source, + _sp_type, + _sp_slot, + _baseboard_revision, + _hubris_archive_id, + _power_state, + ) = sp_dsl::inv_service_processor::all_columns(); + } + } + + // Insert rows for the roots of trust that we found. Like service + // processors, we do this using INSERT INTO ... SELECT. + { + use db::schema::hw_baseboard_id::dsl as baseboard_dsl; + use db::schema::inv_root_of_trust::dsl as rot_dsl; + + for (baseboard_id, rot) in &collection.rots { + let selection = db::schema::hw_baseboard_id::table + .select(( + collection_id.into_sql::(), + baseboard_dsl::id, + rot.time_collected + .into_sql::(), + rot.source + .clone() + .into_sql::(), + HwRotSlot::from(rot.active_slot) + .into_sql::(), + HwRotSlot::from(rot.persistent_boot_preference) + .into_sql::(), + rot.pending_persistent_boot_preference + .map(HwRotSlot::from) + .into_sql::>(), + rot.transient_boot_preference + .map(HwRotSlot::from) + .into_sql::>(), + rot.slot_a_sha3_256_digest + .clone() + .into_sql::>( + ), + rot.slot_b_sha3_256_digest + .clone() + .into_sql::>( + ), + )) + .filter( + baseboard_dsl::part_number + .eq(baseboard_id.part_number.clone()), + ) + .filter( + baseboard_dsl::serial_number + .eq(baseboard_id.serial_number.clone()), + ); + + let _ = diesel::insert_into( + db::schema::inv_root_of_trust::table, + ) + .values(selection) + .into_columns(( + rot_dsl::inv_collection_id, + rot_dsl::hw_baseboard_id, + rot_dsl::time_collected, + rot_dsl::source, + rot_dsl::slot_active, + rot_dsl::slot_boot_pref_persistent, + rot_dsl::slot_boot_pref_persistent_pending, + rot_dsl::slot_boot_pref_transient, + rot_dsl::slot_a_sha3_256, + rot_dsl::slot_b_sha3_256, + )) + .execute_async(&conn) + .await?; + + // See the comment in the previous block (where we use + // `inv_service_processor::all_columns()`). The same + // applies here. + let ( + _inv_collection_id, + _hw_baseboard_id, + _time_collected, + _source, + _slot_active, + _slot_boot_pref_persistent, + _slot_boot_pref_persistent_pending, + _slot_boot_pref_transient, + _slot_a_sha3_256, + _slot_b_sha3_256, + ) = rot_dsl::inv_root_of_trust::all_columns(); + } + } + + // Insert rows for the cabooses that we found. Like service + // processors and roots of trust, we do this using INSERT INTO ... + // SELECT. This one's a little more complicated because there are + // two foreign keys. 
Concretely, we have these three tables: + // + // - `hw_baseboard` with an "id" primary key and lookup columns + // "part_number" and "serial_number" + // - `sw_caboose` with an "id" primary key and lookup columns + // "board", "git_commit", "name", and "version" + // - `inv_caboose` with foreign keys "hw_baseboard_id", + // "sw_caboose_id", and various other columns + // + // We want to INSERT INTO `inv_caboose` a row with: + // + // - hw_baseboard_id (foreign key) the result of looking up an + // hw_baseboard row by a specific part number and serial number + // + // - sw_caboose_id (foreign key) the result of looking up a + // specific sw_caboose row by board, git_commit, name, and version + // + // - the other columns being literals + // + // To achieve this, we're going to generate something like: + // + // INSERT INTO + // inv_caboose ( + // hw_baseboard_id, + // sw_caboose_id, + // inv_collection_id, + // time_collected, + // source, + // which, + // ) + // SELECT ( + // hw_baseboard_id.id, + // sw_caboose.id, + // ... /* literal collection id */ + // ... /* literal time collected */ + // ... /* literal source */ + // ... /* literal 'which' */ + // ) + // FROM + // hw_baseboard + // INNER JOIN + // sw_caboose + // ON hw_baseboard.part_number = ... + // AND hw_baseboard.serial_number = ... + // AND sw_caboose.board = ... + // AND sw_caboose.git_commit = ... + // AND sw_caboose.name = ... + // AND sw_caboose.version = ...; + // + // Again, the whole point is to avoid back-and-forth between the + // client and the database. Those back-and-forth interactions can + // significantly increase latency and the probability of transaction + // conflicts. See RFD 192 for details. (Unfortunately, we still + // _are_ going back and forth here to issue each of these queries. + // But that's an artifact of the interface we currently have for + // sending queries. It should be possible to send all of these in + // one batch. + for (which, tree) in &collection.cabooses_found { + let db_which = nexus_db_model::CabooseWhich::from(*which); + for (baseboard_id, found_caboose) in tree { + use db::schema::hw_baseboard_id::dsl as dsl_baseboard_id; + use db::schema::inv_caboose::dsl as dsl_inv_caboose; + use db::schema::sw_caboose::dsl as dsl_sw_caboose; + + let selection = db::schema::hw_baseboard_id::table + .inner_join( + db::schema::sw_caboose::table.on( + dsl_baseboard_id::part_number + .eq(baseboard_id.part_number.clone()) + .and( + dsl_baseboard_id::serial_number.eq( + baseboard_id.serial_number.clone(), + ), + ) + .and(dsl_sw_caboose::board.eq( + found_caboose.caboose.board.clone(), + )) + .and( + dsl_sw_caboose::git_commit.eq( + found_caboose + .caboose + .git_commit + .clone(), + ), + ) + .and( + dsl_sw_caboose::name.eq(found_caboose + .caboose + .name + .clone()), + ) + .and(dsl_sw_caboose::version.eq( + found_caboose.caboose.version.clone(), + )), + ), + ) + .select(( + dsl_baseboard_id::id, + dsl_sw_caboose::id, + collection_id.into_sql::(), + found_caboose + .time_collected + .into_sql::(), + found_caboose + .source + .clone() + .into_sql::(), + db_which.into_sql::(), + )); + + let _ = diesel::insert_into(db::schema::inv_caboose::table) + .values(selection) + .into_columns(( + dsl_inv_caboose::hw_baseboard_id, + dsl_inv_caboose::sw_caboose_id, + dsl_inv_caboose::inv_collection_id, + dsl_inv_caboose::time_collected, + dsl_inv_caboose::source, + dsl_inv_caboose::which, + )) + .execute_async(&conn) + .await?; + + // See the comments above. The same applies here. 
If you + // update the statement below because the schema for + // `inv_caboose` has changed, be sure to update the code + // above, too! + let ( + _hw_baseboard_id, + _sw_caboose_id, + _inv_collection_id, + _time_collected, + _source, + _which, + ) = dsl_inv_caboose::inv_caboose::all_columns(); + } + } + + // Finally, insert the list of errors. + { + use db::schema::inv_collection_error::dsl as errors_dsl; + let _ = diesel::insert_into(errors_dsl::inv_collection_error) + .values(error_values) + .execute_async(&conn) + .await?; + } + + Ok(()) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + info!( + &opctx.log, + "inserted inventory collection"; + "collection_id" => collection.id.to_string(), + ); + + Ok(()) + } + + /// Prune inventory collections stored in the database, keeping at least + /// `nkeep`. + /// + /// This function removes as many collections as possible while preserving + /// the last `nkeep`. This will also preserve at least one "complete" + /// collection (i.e., one having zero errors). + // It might seem surprising that such a high-level application policy is + // embedded in the DataStore. The reason is that we want to push a bunch of + // the logic into the SQL to avoid interactive queries. + pub async fn inventory_prune_collections( + &self, + opctx: &OpContext, + nkeep: u32, + ) -> Result<(), Error> { + // Assumptions: + // + // - Most of the time, there will be about `nkeep + 1` collections in + // the database. That's because the normal expected case is: we had + // `nkeep`, we created another one, and now we're pruning the oldest + // one. + // + // - There could be fewer collections in the database, early in the + // system's lifetime (before we've accumulated `nkeep` of them). + // + // - There could be many more collections in the database, if something + // has gone wrong and we've fallen behind in our cleanup. + // + // - Due to transient errors during the collection process, it's + // possible that a collection is known to be potentially incomplete. + // We can tell this because it has rows in `inv_collection_errors`. + // (It's possible that a collection can be incomplete with zero + // errors, but we can't know that here and so we can't do anything + // about it.) + // + // Goals: + // + // - When this function returns without error, there were at most + // `nkeep` collections in the database. + // + // - If we have to remove any collections, we want to start from the + // oldest ones. (We want to maintain a window of the last `nkeep`, + // not the first `nkeep - 1` from the beginning of time plus the most + // recent one.) + // + // - We want to avoid removing the last collection that had zero errors. + // (If we weren't careful, we might do this if there were `nkeep` + // collections with errors that were newer than the last complete + // collection.) + // + // Here's the plan: + // + // - Select from the database the `nkeep + 1` oldest collections and the + // number of errors associated with each one. + // + // - If we got fewer than `nkeep + 1` back, we're done. We shouldn't + // prune anything. + // + // - Otherwise, if the oldest collection is the only complete one, + // remove the next-oldest collection and go back to the top (repeat). + // + // - Otherwise, remove the oldest collection and go back to the top + // (repeat). + // + // This seems surprisingly complicated. It's designed to meet the above + // goals. + // + // Is this going to work if multiple Nexuses are doing it concurrently? 
+ // This cannot remove the last complete collection because a given Nexus + // will only remove a complete collection if it has seen a newer + // complete one. This cannot result in keeping fewer than "nkeep" + // collections because any Nexus will only remove a collection if there + // are "nkeep" newer ones. In both of these cases, another Nexus might + // remove one of the ones that the first Nexus was counting on keeping, + // but only if there was a newer one to replace it. + + opctx.authorize(authz::Action::Modify, &authz::INVENTORY).await?; + + loop { + match self.inventory_find_pruneable(opctx, nkeep).await? { + None => break, + Some(collection_id) => { + self.inventory_delete_collection(opctx, collection_id) + .await? + } + } + } + + Ok(()) + } + + /// Return the oldest inventory collection that's eligible for pruning, + /// if any + /// + /// The caller of this (non-pub) function is responsible for authz. + async fn inventory_find_pruneable( + &self, + opctx: &OpContext, + nkeep: u32, + ) -> Result, Error> { + let conn = self.pool_connection_authorized(opctx).await?; + // Diesel requires us to use aliases in order to refer to the + // `inv_collection` table twice in the same query. + let (inv_collection1, inv_collection2) = diesel::alias!( + db::schema::inv_collection as inv_collection1, + db::schema::inv_collection as inv_collection2 + ); + + // This subquery essentially generates: + // + // SELECT id FROM inv_collection ORDER BY time_started" ASC LIMIT $1 + // + // where $1 becomes `nkeep + 1`. This just lists the `nkeep + 1` oldest + // collections. + let subquery = inv_collection1 + .select(inv_collection1.field(db::schema::inv_collection::id)) + .order_by( + inv_collection1 + .field(db::schema::inv_collection::time_started) + .asc(), + ) + .limit(i64::from(nkeep) + 1); + + // This essentially generates: + // + // SELECT + // inv_collection.id, + // count(inv_collection_error.inv_collection_id) + // FROM ( + // inv_collection + // LEFT OUTER JOIN + // inv_collection_error + // ON ( + // inv_collection_error.inv_collection_id = inv_collection.id + // ) + // ) WHERE ( + // inv_collection.id = ANY( <> ) + // ) + // GROUP BY inv_collection.id + // ORDER BY inv_collection.time_started ASC + // + // This looks a lot scarier than it is. The goal is to produce a + // two-column table that looks like this: + // + // collection_id1 count of errors from collection_id1 + // collection_id2 count of errors from collection_id2 + // collection_id3 count of errors from collection_id3 + // ... + // + let candidates: Vec<(Uuid, i64)> = inv_collection2 + .left_outer_join(db::schema::inv_collection_error::table) + .filter( + inv_collection2 + .field(db::schema::inv_collection::id) + .eq_any(subquery), + ) + .group_by(inv_collection2.field(db::schema::inv_collection::id)) + .select(( + inv_collection2.field(db::schema::inv_collection::id), + diesel::dsl::count( + db::schema::inv_collection_error::inv_collection_id + .nullable(), + ), + )) + .order_by( + inv_collection2 + .field(db::schema::inv_collection::time_started) + .asc(), + ) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .internal_context("listing oldest collections")?; + + if u32::try_from(candidates.len()).unwrap() <= nkeep { + debug!( + &opctx.log, + "inventory_prune_one: nothing eligible for removal (too few)"; + "candidates" => ?candidates, + ); + return Ok(None); + } + + // We've now got up to "nkeep + 1" oldest collections, starting with the + // very oldest. 
We can get rid of the oldest one unless it's the only + // complete one. Another way to think about it: find the _last_ + // complete one. Remove it from the list of candidates. Now mark the + // first item in the remaining list for deletion. + let last_completed_idx = candidates + .iter() + .enumerate() + .rev() + .find(|(_i, (_collection_id, nerrors))| *nerrors == 0); + let candidate = match last_completed_idx { + Some((i, _)) if i == 0 => candidates.iter().skip(1).next(), + _ => candidates.iter().next(), + } + .map(|(collection_id, _nerrors)| *collection_id); + if let Some(c) = candidate { + debug!( + &opctx.log, + "inventory_prune_one: eligible for removal"; + "collection_id" => c.to_string(), + "candidates" => ?candidates, + ); + } else { + debug!( + &opctx.log, + "inventory_prune_one: nothing eligible for removal"; + "candidates" => ?candidates, + ); + } + Ok(candidate) + } + + /// Removes an inventory collection from the database + /// + /// The caller of this (non-pub) function is responsible for authz. + async fn inventory_delete_collection( + &self, + opctx: &OpContext, + collection_id: Uuid, + ) -> Result<(), Error> { + // As with inserting a whole collection, we remove it in one big + // transaction for simplicity. Similar considerations apply. We could + // break it up if these transactions become too big. But we'd need a + // way to stop other clients from discovering a collection after we + // start removing it and we'd also need to make sure we didn't leak a + // collection if we crash while deleting it. + let conn = self.pool_connection_authorized(opctx).await?; + let (ncollections, nsps, nrots, ncabooses, nerrors) = conn + .transaction_async(|conn| async move { + // Remove the record describing the collection itself. + let ncollections = { + use db::schema::inv_collection::dsl; + diesel::delete( + dsl::inv_collection.filter(dsl::id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for service processors. + let nsps = { + use db::schema::inv_service_processor::dsl; + diesel::delete( + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for roots of trust. + let nrots = { + use db::schema::inv_root_of_trust::dsl; + diesel::delete( + dsl::inv_root_of_trust + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for cabooses found. + let ncabooses = { + use db::schema::inv_caboose::dsl; + diesel::delete( + dsl::inv_caboose + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + // Remove rows for errors encountered. 
+ let nerrors = { + use db::schema::inv_collection_error::dsl; + diesel::delete( + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(collection_id)), + ) + .execute_async(&conn) + .await?; + }; + + Ok((ncollections, nsps, nrots, ncabooses, nerrors)) + }) + .await + .map_err(|error| match error { + TransactionError::CustomError(e) => e, + TransactionError::Database(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + } + })?; + + info!(&opctx.log, "removed inventory collection"; + "collection_id" => collection_id.to_string(), + "ncollections" => ncollections, + "nsps" => nsps, + "nrots" => nrots, + "ncabooses" => ncabooses, + "nerrors" => nerrors, + ); + + Ok(()) + } +} + +/// Extra interfaces that are not intended (and potentially unsafe) for use in +/// Nexus, but useful for testing and `omdb` +pub trait DataStoreInventoryTest: Send + Sync { + /// List all collections + /// + /// This does not paginate. + fn inventory_collections(&self) -> BoxFuture>>; + + /// Make a best effort to read the given collection while limiting queries + /// to `limit` results. Returns as much as it was able to get. The + /// returned bool indicates whether the returned collection might be + /// incomplete because the limit was reached. + fn inventory_collection_read_best_effort( + &self, + id: Uuid, + limit: NonZeroU32, + ) -> BoxFuture>; + + /// Attempt to read the given collection while limiting queries to `limit` + /// records + fn inventory_collection_read_all_or_nothing( + &self, + id: Uuid, + limit: NonZeroU32, + ) -> BoxFuture> { + async move { + let (collection, limit_reached) = + self.inventory_collection_read_best_effort(id, limit).await?; + anyhow::ensure!( + !limit_reached, + "hit limit of {} records while loading collection", + limit + ); + Ok(collection) + } + .boxed() + } +} + +impl DataStoreInventoryTest for DataStore { + fn inventory_collections(&self) -> BoxFuture>> { + async { + let conn = self + .pool_connection_for_tests() + .await + .context("getting connectoin")?; + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) + .await + .context("failed to allow table scan")?; + + use db::schema::inv_collection::dsl; + dsl::inv_collection + .select(dsl::id) + .order_by(dsl::time_started) + .load_async(&conn) + .await + .context("failed to list collections") + }) + .await + } + .boxed() + } + + // This function could move into the datastore if it proves helpful. We'd + // need to work out how to report the usual type of Error. For now we don't + // need it so we limit its scope to the test suite. 
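+    //
+    // The reads below are each bounded: the error, SP, RoT, baseboard,
+    // inv_caboose, and sw_caboose queries all carry a LIMIT of `limit`, and
+    // the returned flag is set to true if any of them comes back with exactly
+    // `limit` rows, since the collection may then be incomplete.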
+ fn inventory_collection_read_best_effort( + &self, + id: Uuid, + limit: NonZeroU32, + ) -> BoxFuture> { + async move { + let conn = &self + .pool_connection_for_tests() + .await + .context("getting connection")?; + let sql_limit = i64::from(u32::from(limit)); + let usize_limit = usize::try_from(u32::from(limit)).unwrap(); + let mut limit_reached = false; + let (time_started, time_done, collector) = { + use db::schema::inv_collection::dsl; + + let collections = dsl::inv_collection + .filter(dsl::id.eq(id)) + .limit(2) + .select(InvCollection::as_select()) + .load_async(&**conn) + .await + .context("loading collection")?; + anyhow::ensure!(collections.len() == 1); + let collection = collections.into_iter().next().unwrap(); + ( + collection.time_started, + collection.time_done, + collection.collector, + ) + }; + + let errors: Vec = { + use db::schema::inv_collection_error::dsl; + dsl::inv_collection_error + .filter(dsl::inv_collection_id.eq(id)) + .order_by(dsl::idx) + .limit(sql_limit) + .select(InvCollectionError::as_select()) + .load_async(&**conn) + .await + .context("loading collection errors")? + .into_iter() + .map(|e| e.message) + .collect() + }; + limit_reached = limit_reached || errors.len() == usize_limit; + + let sps: BTreeMap<_, _> = { + use db::schema::inv_service_processor::dsl; + dsl::inv_service_processor + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvServiceProcessor::as_select()) + .load_async(&**conn) + .await + .context("loading service processors")? + .into_iter() + .map(|sp_row| { + let baseboard_id = sp_row.hw_baseboard_id; + ( + baseboard_id, + nexus_types::inventory::ServiceProcessor::from( + sp_row, + ), + ) + }) + .collect() + }; + limit_reached = limit_reached || sps.len() == usize_limit; + + let rots: BTreeMap<_, _> = { + use db::schema::inv_root_of_trust::dsl; + dsl::inv_root_of_trust + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvRootOfTrust::as_select()) + .load_async(&**conn) + .await + .context("loading roots of trust")? + .into_iter() + .map(|rot_row| { + let baseboard_id = rot_row.hw_baseboard_id; + ( + baseboard_id, + nexus_types::inventory::RotState::from(rot_row), + ) + }) + .collect() + }; + limit_reached = limit_reached || rots.len() == usize_limit; + + // Collect the unique baseboard ids referenced by SPs and RoTs. + let baseboard_id_ids: BTreeSet<_> = + sps.keys().chain(rots.keys()).cloned().collect(); + // Fetch the corresponding baseboard records. + let baseboards_by_id: BTreeMap<_, _> = { + use db::schema::hw_baseboard_id::dsl; + dsl::hw_baseboard_id + .filter(dsl::id.eq_any(baseboard_id_ids)) + .limit(sql_limit) + .select(HwBaseboardId::as_select()) + .load_async(&**conn) + .await + .context("loading baseboards")? + .into_iter() + .map(|bb| { + ( + bb.id, + Arc::new( + nexus_types::inventory::BaseboardId::from(bb), + ), + ) + }) + .collect() + }; + limit_reached = + limit_reached || baseboards_by_id.len() == usize_limit; + + // Having those, we can replace the keys in the maps above with + // references to the actual baseboard rather than the uuid. 
+ let sps = sps + .into_iter() + .map(|(id, sp)| { + baseboards_by_id + .get(&id) + .map(|bb| (bb.clone(), sp)) + .ok_or_else(|| { + anyhow!( + "missing baseboard that we should have fetched" + ) + }) + }) + .collect::, _>>()?; + let rots = + rots.into_iter() + .map(|(id, rot)| { + baseboards_by_id + .get(&id) + .map(|bb| (bb.clone(), rot)) + .ok_or_else(|| { + anyhow!("missing baseboard that we should have fetched") + }) + }) + .collect::, _>>()?; + + // Fetch records of cabooses found. + let inv_caboose_rows = { + use db::schema::inv_caboose::dsl; + dsl::inv_caboose + .filter(dsl::inv_collection_id.eq(id)) + .limit(sql_limit) + .select(InvCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading inv_cabooses")? + }; + limit_reached = + limit_reached || inv_caboose_rows.len() == usize_limit; + + // Collect the unique sw_caboose_ids for those cabooses. + let sw_caboose_ids: BTreeSet<_> = inv_caboose_rows + .iter() + .map(|inv_caboose| inv_caboose.sw_caboose_id) + .collect(); + // Fetch the corresponing records. + let cabooses_by_id: BTreeMap<_, _> = { + use db::schema::sw_caboose::dsl; + dsl::sw_caboose + .filter(dsl::id.eq_any(sw_caboose_ids)) + .limit(sql_limit) + .select(SwCaboose::as_select()) + .load_async(&**conn) + .await + .context("loading sw_cabooses")? + .into_iter() + .map(|sw_caboose_row| { + ( + sw_caboose_row.id, + Arc::new(nexus_types::inventory::Caboose::from( + sw_caboose_row, + )), + ) + }) + .collect() + }; + limit_reached = + limit_reached || cabooses_by_id.len() == usize_limit; + + // Assemble the lists of cabooses found. + let mut cabooses_found = BTreeMap::new(); + for c in inv_caboose_rows { + let by_baseboard = cabooses_found + .entry(nexus_types::inventory::CabooseWhich::from(c.which)) + .or_insert_with(BTreeMap::new); + let Some(bb) = baseboards_by_id.get(&c.hw_baseboard_id) else { + bail!( + "unknown baseboard found in inv_caboose: {}", + c.hw_baseboard_id + ); + }; + let Some(sw_caboose) = cabooses_by_id.get(&c.sw_caboose_id) + else { + bail!( + "unknown caboose found in inv_caboose: {}", + c.sw_caboose_id + ); + }; + + let previous = by_baseboard.insert( + bb.clone(), + nexus_types::inventory::CabooseFound { + time_collected: c.time_collected, + source: c.source, + caboose: sw_caboose.clone(), + }, + ); + anyhow::ensure!( + previous.is_none(), + "duplicate caboose found: {:?} baseboard {:?}", + c.which, + c.hw_baseboard_id + ); + } + + Ok(( + Collection { + id, + errors, + time_started, + time_done, + collector, + baseboards: baseboards_by_id.values().cloned().collect(), + cabooses: cabooses_by_id.values().cloned().collect(), + sps, + rots, + cabooses_found, + }, + limit_reached, + )) + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use crate::db::datastore::datastore_test; + use crate::db::datastore::inventory::DataStoreInventoryTest; + use crate::db::datastore::DataStore; + use crate::db::datastore::DataStoreConnection; + use crate::db::schema; + use anyhow::Context; + use async_bb8_diesel::AsyncConnection; + use async_bb8_diesel::AsyncRunQueryDsl; + use async_bb8_diesel::AsyncSimpleConnection; + use diesel::QueryDsl; + use gateway_client::types::SpType; + use nexus_inventory::examples::representative; + use nexus_inventory::examples::Representative; + use nexus_test_utils::db::test_setup_database; + use nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL; + use nexus_types::inventory::CabooseWhich; + use nexus_types::inventory::Collection; + use omicron_test_utils::dev; + use std::num::NonZeroU32; + use uuid::Uuid; + + async fn 
read_collection( + datastore: &DataStore, + id: Uuid, + ) -> anyhow::Result { + let limit = NonZeroU32::new(1000).unwrap(); + datastore.inventory_collection_read_all_or_nothing(id, limit).await + } + + async fn count_baseboards_cabooses( + conn: &DataStoreConnection<'_>, + ) -> anyhow::Result<(usize, usize)> { + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + let bb_count = schema::hw_baseboard_id::dsl::hw_baseboard_id + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .context("failed to count baseboards")?; + let caboose_count = schema::sw_caboose::dsl::sw_caboose + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .context("failed to count cabooses")?; + let bb_count_usize = usize::try_from(bb_count) + .context("failed to convert baseboard count to usize")?; + let caboose_count_usize = usize::try_from(caboose_count) + .context("failed to convert caboose count to usize")?; + Ok((bb_count_usize, caboose_count_usize)) + }) + .await + } + + /// Tests inserting several collections, reading them back, and making sure + /// they look the same. + #[tokio::test] + async fn test_inventory_insert() { + // Setup + let logctx = dev::test_setup_log("inventory_insert"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create an empty collection and write it to the database. + let builder = nexus_inventory::CollectionBuilder::new("test"); + let collection1 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection1) + .await + .expect("failed to insert collection"); + + // Read it back. + let conn = datastore.pool_connection_for_tests().await.unwrap(); + let collection_read = read_collection(&datastore, collection1.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection1, collection_read); + + // There ought to be no baseboards or cabooses in the databases from + // that collection. + assert_eq!(collection1.baseboards.len(), 0); + assert_eq!(collection1.cabooses.len(), 0); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection1.baseboards.len(), nbaseboards); + assert_eq!(collection1.cabooses.len(), ncabooses); + + // Now insert a more complex collection, write it to the database, and + // read it back. + let Representative { builder, .. } = representative(); + let collection2 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection2) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection2.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection2, collection_read); + // Verify that we have exactly the set of cabooses and baseboards in the + // databases that came from this first non-empty collection. + assert_ne!(collection2.baseboards.len(), collection1.baseboards.len()); + assert_ne!(collection2.cabooses.len(), collection1.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection2.baseboards.len(), nbaseboards); + assert_eq!(collection2.cabooses.len(), ncabooses); + + // Now insert an equivalent collection again. Verify the distinct + // baseboards and cabooses again. This is important: the insertion + // process should re-use the baseboards and cabooses from the previous + // collection. + let Representative { builder, .. 
} = representative(); + let collection3 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection3) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection3.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection3, collection_read); + // Verify that we have the same number of cabooses and baseboards, since + // those didn't change. + assert_eq!(collection3.baseboards.len(), collection2.baseboards.len()); + assert_eq!(collection3.cabooses.len(), collection2.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection3.baseboards.len(), nbaseboards); + assert_eq!(collection3.cabooses.len(), ncabooses); + + // Now insert a collection that's almost equivalent, but has an extra + // couple of baseboards and caboose. Verify that we re-use the existing + // ones, but still insert the new ones. + let Representative { mut builder, .. } = representative(); + builder.found_sp_state( + "test suite", + SpType::Switch, + 1, + nexus_inventory::examples::sp_state("2"), + ); + let bb = builder + .found_sp_state( + "test suite", + SpType::Power, + 1, + nexus_inventory::examples::sp_state("3"), + ) + .unwrap(); + builder + .found_caboose( + &bb, + CabooseWhich::SpSlot0, + "dummy", + nexus_inventory::examples::caboose("dummy"), + ) + .unwrap(); + let collection4 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection4) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection4.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection4, collection_read); + // Verify the number of baseboards and collections again. + assert_eq!( + collection4.baseboards.len(), + collection3.baseboards.len() + 2 + ); + assert_eq!( + collection4.cabooses.len(), + collection3.baseboards.len() + 1 + ); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection4.baseboards.len(), nbaseboards); + assert_eq!(collection4.cabooses.len(), ncabooses); + + // This time, go back to our earlier collection. This logically removes + // some baseboards. They should still be present in the database, but + // not in the collection. + let Representative { builder, .. } = representative(); + let collection5 = builder.build(); + datastore + .inventory_insert_collection(&opctx, &collection5) + .await + .expect("failed to insert collection"); + let collection_read = read_collection(&datastore, collection5.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection5, collection_read); + assert_eq!(collection5.baseboards.len(), collection3.baseboards.len()); + assert_eq!(collection5.cabooses.len(), collection3.cabooses.len()); + assert_ne!(collection5.baseboards.len(), collection4.baseboards.len()); + assert_ne!(collection5.cabooses.len(), collection4.cabooses.len()); + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_eq!(collection4.baseboards.len(), nbaseboards); + assert_eq!(collection4.cabooses.len(), ncabooses); + + // Try to insert the same collection again and make sure it fails. 
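        // (The insert below reuses collection5's id, so it should be rejected
        // by a uniqueness constraint on the collection table, presumably the
        // primary key on the id column; the assertion only checks the
        // database's error text, not a typed error.)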
+ let error = datastore + .inventory_insert_collection(&opctx, &collection5) + .await + .expect_err("unexpectedly succeeded in inserting collection"); + assert!(format!("{:#}", error) + .contains("duplicate key value violates unique constraint")); + + // Now that we've inserted a bunch of collections, we can test pruning. + // + // The datastore should start by pruning the oldest collection, unless + // it's the only collection with no errors. The oldest one is + // `collection1`, which _is_ the only one with no errors. So we should + // get back `collection2`. + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[ + collection1.id, + collection2.id, + collection3.id, + collection4.id, + collection5.id, + ] + ); + println!( + "all collections: {:?}\n", + &[ + collection1.id, + collection2.id, + collection3.id, + collection4.id, + collection5.id, + ] + ); + datastore + .inventory_prune_collections(&opctx, 4) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection3.id, collection4.id, collection5.id,] + ); + // Again, we should skip over collection1 and delete the next oldest: + // collection3. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id,] + ); + // At this point, if we're keeping 3, we don't need to prune anything. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id,] + ); + + // If we then insert an empty collection (which has no errors), + // collection1 becomes pruneable. + let builder = nexus_inventory::CollectionBuilder::new("test"); + let collection6 = builder.build(); + println!( + "collection 6: {} ({:?})", + collection6.id, collection6.time_started + ); + datastore + .inventory_insert_collection(&opctx, &collection6) + .await + .expect("failed to insert collection"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection1.id, collection4.id, collection5.id, collection6.id,] + ); + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection4.id, collection5.id, collection6.id,] + ); + // Again, at this point, we should not prune anything. + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection4.id, collection5.id, collection6.id,] + ); + + // If we insert another collection with errors, then prune, we should + // end up pruning collection 4. + let Representative { builder, .. } = representative(); + let collection7 = builder.build(); + println!( + "collection 7: {} ({:?})", + collection7.id, collection7.time_started + ); + datastore + .inventory_insert_collection(&opctx, &collection7) + .await + .expect("failed to insert collection"); + datastore + .inventory_prune_collections(&opctx, 3) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection5.id, collection6.id, collection7.id,] + ); + + // If we try to fetch a pruned collection, we should get nothing. 
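        // One way to summarize the pruning behavior exercised above:
        // repeatedly drop the oldest collection, never dropping the newest
        // error-free one, until at most `nkeep` remain. The helper below is
        // only an illustrative, hypothetical sketch of that rule; it is not
        // the datastore's actual implementation, which works against the
        // database.
        #[allow(dead_code)]
        fn pruning_order(
            mut collections: Vec<(Uuid, chrono::DateTime<chrono::Utc>, bool)>,
            nkeep: usize,
        ) -> Vec<Uuid> {
            // Sort oldest-first.  The bool records whether the collection had
            // any errors.
            collections.sort_by_key(|(_, time_started, _)| *time_started);
            // The newest collection with no errors is never pruned.
            let protected = collections
                .iter()
                .rev()
                .find(|(_, _, has_errors)| !*has_errors)
                .map(|(id, _, _)| *id);
            let mut pruned = Vec::new();
            while collections.len() > nkeep {
                let Some(i) = collections
                    .iter()
                    .position(|(id, _, _)| Some(*id) != protected)
                else {
                    break;
                };
                pruned.push(collections.remove(i).0);
            }
            pruned
        }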
+ let _ = read_collection(&datastore, collection4.id) + .await + .expect_err("unexpectedly read pruned collection"); + + // But we should still be able to fetch the collections that do exist. + let collection_read = + read_collection(&datastore, collection5.id).await.unwrap(); + assert_eq!(collection5, collection_read); + let collection_read = + read_collection(&datastore, collection6.id).await.unwrap(); + assert_eq!(collection6, collection_read); + let collection_read = + read_collection(&datastore, collection7.id).await.unwrap(); + assert_eq!(collection7, collection_read); + + // We should prune more than one collection, if needed. We'll wind up + // with just collection6 because that's the latest one with no errors. + datastore + .inventory_prune_collections(&opctx, 1) + .await + .expect("failed to prune collections"); + assert_eq!( + datastore.inventory_collections().await.unwrap(), + &[collection6.id,] + ); + + // Remove the remaining collection and make sure the inventory tables + // are empty (i.e., we got everything). + datastore + .inventory_delete_collection(&opctx, collection6.id) + .await + .expect("failed to delete collection"); + assert_eq!(datastore.inventory_collections().await.unwrap(), &[]); + + conn.transaction_async(|conn| async move { + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + let count = schema::inv_collection::dsl::inv_collection + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_collection_error::dsl::inv_collection_error + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = + schema::inv_service_processor::dsl::inv_service_processor + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_root_of_trust::dsl::inv_root_of_trust + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + let count = schema::inv_caboose::dsl::inv_caboose + .select(diesel::dsl::count_star()) + .first_async::(&conn) + .await + .unwrap(); + assert_eq!(0, count); + Ok::<(), anyhow::Error>(()) + }) + .await + .expect("failed to check that tables were empty"); + + // We currently keep the baseboard ids and sw_cabooses around. + let (nbaseboards, ncabooses) = + count_baseboards_cabooses(&conn).await.unwrap(); + assert_ne!(nbaseboards, 0); + assert_ne!(ncabooses, 0); + + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 2dc1e69a6f..91373f6875 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -61,6 +61,7 @@ mod external_ip; mod identity_provider; mod image; mod instance; +mod inventory; mod ip_pool; mod network_interface; mod oximeter; @@ -96,6 +97,7 @@ pub use db_metadata::{ }; pub use dns::DnsVersionUpdateBuilder; pub use instance::InstanceAndActiveVmm; +pub use inventory::DataStoreInventoryTest; pub use rack::RackInit; pub use silo::Discoverability; pub use switch_port::SwitchPortSettingsCombinedResult; @@ -138,6 +140,9 @@ impl RunnableQuery for T where { } +pub type DataStoreConnection<'a> = + bb8::PooledConnection<'a, ConnectionManager>; + pub struct DataStore { pool: Arc, virtual_provisioning_collection_producer: crate::provisioning::Producer, @@ -205,21 +210,13 @@ impl DataStore { .unwrap(); } - async fn pool_authorized( - &self, - opctx: &OpContext, - ) -> Result<&bb8::Pool>, Error> { - opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; - Ok(self.pool.pool()) - } - /// Returns a connection to a connection from the database connection pool. pub(super) async fn pool_connection_authorized( &self, opctx: &OpContext, - ) -> Result>, Error> - { - let pool = self.pool_authorized(opctx).await?; + ) -> Result { + opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; + let pool = self.pool.pool(); let connection = pool.get().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; @@ -233,8 +230,7 @@ impl DataStore { /// "pool_connection_authorized". pub(super) async fn pool_connection_unauthorized( &self, - ) -> Result>, Error> - { + ) -> Result { let connection = self.pool.pool().get().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; @@ -245,8 +241,7 @@ impl DataStore { #[doc(hidden)] pub async fn pool_connection_for_tests( &self, - ) -> Result>, Error> - { + ) -> Result { self.pool_connection_unauthorized().await } diff --git a/nexus/db-queries/src/db/pool.rs b/nexus/db-queries/src/db/pool.rs index 73c95f4e91..249852d832 100644 --- a/nexus/db-queries/src/db/pool.rs +++ b/nexus/db-queries/src/db/pool.rs @@ -45,6 +45,8 @@ pub struct Pool { impl Pool { pub fn new(log: &slog::Logger, db_config: &DbConfig) -> Self { + // Make sure diesel-dtrace's USDT probes are enabled. + usdt::register_probes().expect("Failed to register USDT DTrace probes"); Self::new_builder(log, db_config, bb8::Builder::new()) } diff --git a/nexus/db-queries/tests/output/authz-roles.out b/nexus/db-queries/tests/output/authz-roles.out index 72031c567e..963f00f7e8 100644 --- a/nexus/db-queries/tests/output/authz-roles.out +++ b/nexus/db-queries/tests/output/authz-roles.out @@ -68,6 +68,20 @@ resource: authz::DeviceAuthRequestList silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ unauthenticated ! ! ! ! ! ! ! ! +resource: authz::Inventory + + USER Q R LC RP M MP CC D + fleet-admin ✘ ✔ ✘ ✔ ✔ ✔ ✘ ✔ + fleet-collaborator ✘ ✔ ✘ ✔ ✘ ✘ ✘ ✘ + fleet-viewer ✘ ✔ ✘ ✔ ✘ ✘ ✘ ✘ + silo1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-admin ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-collaborator ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ + unauthenticated ! ! ! ! ! ! ! ! 
+ resource: authz::IpPoolList USER Q R LC RP M MP CC D diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 1a9afbc6bd..efc9aa9c27 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -92,6 +92,13 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 5 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml new file mode 100644 index 0000000000..965ff3f02a --- /dev/null +++ b/nexus/inventory/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "nexus-inventory" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-messages.workspace = true +nexus-types.workspace = true +slog.workspace = true +strum.workspace = true +uuid.workspace = true +omicron-workspace-hack.workspace = true + +[dev-dependencies] +expectorate.workspace = true +gateway-test-utils.workspace = true +regex.workspace = true +tokio.workspace = true diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs new file mode 100644 index 0000000000..ad008ee4df --- /dev/null +++ b/nexus/inventory/src/builder.rs @@ -0,0 +1,786 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interface for building inventory [`Collection`] dynamically +//! +//! This separates the concerns of _collection_ (literally just fetching data +//! from sources like MGS) from assembling a representation of what was +//! collected. + +use anyhow::anyhow; +use chrono::DateTime; +use chrono::Utc; +use gateway_client::types::SpComponentCaboose; +use gateway_client::types::SpState; +use gateway_client::types::SpType; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::Caboose; +use nexus_types::inventory::CabooseFound; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; +use nexus_types::inventory::RotState; +use nexus_types::inventory::ServiceProcessor; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use uuid::Uuid; + +/// Build an inventory [`Collection`] +/// +/// This interface is oriented around the interfaces used by an actual +/// collector. Where possible, it accepts types directly provided by the data +/// sources (e.g., `gateway_client`). +#[derive(Debug)] +pub struct CollectionBuilder { + // For field documentation, see the corresponding fields in `Collection`. 
+ errors: Vec, + time_started: DateTime, + collector: String, + baseboards: BTreeSet>, + cabooses: BTreeSet>, + sps: BTreeMap, ServiceProcessor>, + rots: BTreeMap, RotState>, + cabooses_found: + BTreeMap, CabooseFound>>, +} + +impl CollectionBuilder { + /// Start building a new `Collection` + /// + /// `collector` is an arbitrary string describing the agent that collected + /// this data. It's generally a Nexus instance uuid but it can be anything. + /// It's just for debugging. + pub fn new(collector: &str) -> Self { + CollectionBuilder { + errors: vec![], + time_started: now(), + collector: collector.to_owned(), + baseboards: BTreeSet::new(), + cabooses: BTreeSet::new(), + sps: BTreeMap::new(), + rots: BTreeMap::new(), + cabooses_found: BTreeMap::new(), + } + } + + /// Assemble a complete `Collection` representation + pub fn build(self) -> Collection { + Collection { + id: Uuid::new_v4(), + errors: self + .errors + .into_iter() + .map(|e| format!("{:#}", e)) + .collect(), + time_started: self.time_started, + time_done: now(), + collector: self.collector, + baseboards: self.baseboards, + cabooses: self.cabooses, + sps: self.sps, + rots: self.rots, + cabooses_found: self.cabooses_found, + } + } + + /// Record service processor state `sp_state` reported by MGS + /// + /// `sp_type` and `slot` identify which SP this was. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_sp_state( + &mut self, + source: &str, + sp_type: SpType, + slot: u32, + sp_state: SpState, + ) -> Option> { + // Much ado about very little: MGS reports that "slot" is a u32, though + // in practice this seems very unlikely to be bigger than a u8. (How + // many slots can there be within one rack?) The database only supports + // signed integers, so if we assumed this really could span the range of + // a u32, we'd need to store it in an i64. Instead, assume here that we + // can stick it into a u16 (which still seems generous). This will + // allow us to store it into an Int32 in the database. + let Ok(sp_slot) = u16::try_from(slot) else { + self.found_error(anyhow!( + "MGS {:?}: SP {:?} slot {}: slot number did not fit into u16", + source, + sp_type, + slot + )); + return None; + }; + + // Normalize the baseboard id: i.e., if we've seen this baseboard + // before, use the same baseboard id record. Otherwise, make a new one. + let baseboard = Self::normalize_item( + &mut self.baseboards, + BaseboardId { + serial_number: sp_state.serial_number, + part_number: sp_state.model, + }, + ); + + // Separate the SP state into the SP-specific state and the RoT state, + // if any. 
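        // (Note: the entry()/or_insert_with() calls below mean that the first
        // report for a given baseboard wins.  If a second MGS reports the
        // same SP or RoT, the previously recorded state is kept rather than
        // overwritten.)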
+ let now = now(); + let _ = self.sps.entry(baseboard.clone()).or_insert_with(|| { + ServiceProcessor { + time_collected: now, + source: source.to_owned(), + + sp_type, + sp_slot, + + baseboard_revision: sp_state.revision, + hubris_archive: sp_state.hubris_archive_id, + power_state: sp_state.power_state, + } + }); + + match sp_state.rot { + gateway_client::types::RotState::Enabled { + active, + pending_persistent_boot_preference, + persistent_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + transient_boot_preference, + } => { + let _ = + self.rots.entry(baseboard.clone()).or_insert_with(|| { + RotState { + time_collected: now, + source: source.to_owned(), + active_slot: active, + persistent_boot_preference, + pending_persistent_boot_preference, + transient_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + } + }); + } + gateway_client::types::RotState::CommunicationFailed { + message, + } => { + self.found_error(anyhow!( + "MGS {:?}: reading RoT state for {:?}: {}", + source, + baseboard, + message + )); + } + } + + Some(baseboard) + } + + /// Returns true if we already found the caboose for `which` for baseboard + /// `baseboard` + /// + /// This is used to avoid requesting it multiple times (from multiple MGS + /// instances). + pub fn found_caboose_already( + &self, + baseboard: &BaseboardId, + which: CabooseWhich, + ) -> bool { + self.cabooses_found + .get(&which) + .map(|map| map.contains_key(baseboard)) + .unwrap_or(false) + } + + /// Record the given caboose information found for the given baseboard + /// + /// The baseboard must previously have been reported using + /// `found_sp_state()`. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_caboose( + &mut self, + baseboard: &BaseboardId, + which: CabooseWhich, + source: &str, + caboose: SpComponentCaboose, + ) -> Result<(), anyhow::Error> { + // Normalize the caboose contents: i.e., if we've seen this exact + // caboose contents before, use the same record from before. Otherwise, + // make a new one. + let sw_caboose = + Self::normalize_item(&mut self.cabooses, Caboose::from(caboose)); + let (baseboard, _) = + self.sps.get_key_value(baseboard).ok_or_else(|| { + anyhow!( + "reporting caboose for unknown baseboard: {:?} ({:?})", + baseboard, + sw_caboose + ) + })?; + let by_id = + self.cabooses_found.entry(which).or_insert_with(|| BTreeMap::new()); + if let Some(previous) = by_id.insert( + baseboard.clone(), + CabooseFound { + time_collected: now(), + source: source.to_owned(), + caboose: sw_caboose.clone(), + }, + ) { + let error = if *previous.caboose == *sw_caboose { + anyhow!("reported multiple times (same value)",) + } else { + anyhow!( + "reported caboose multiple times (previously {:?}, \ + now {:?})", + previous, + sw_caboose + ) + }; + Err(error.context(format!( + "baseboard {:?} caboose {:?}", + baseboard, which + ))) + } else { + Ok(()) + } + } + + /// Helper function for normalizing items + /// + /// If `item` (or its equivalent) is not already in `items`, insert it. + /// Either way, return the item from `items`. (This will either be `item` + /// itself or whatever was already in `items`.) 
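    // (Concretely: if two components report byte-for-byte identical caboose
    // contents, the second normalize_item() call returns a clone of the Arc
    // stored by the first, so `self.cabooses` ends up with one shared entry.
    // The "common caboose" checks in `test_basic` below rely on this.)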
+ fn normalize_item( + items: &mut BTreeSet>, + item: T, + ) -> Arc { + match items.get(&item) { + Some(found_item) => found_item.clone(), + None => { + let new_item = Arc::new(item); + items.insert(new_item.clone()); + new_item + } + } + } + + /// Record a collection error + /// + /// This is used for operational errors encountered during the collection + /// process (e.g., a down MGS instance). It's not intended for mis-uses of + /// this API, which are conveyed instead through returned errors (and should + /// probably cause the caller to stop collection altogether). + pub fn found_error(&mut self, error: anyhow::Error) { + self.errors.push(error); + } +} + +/// Returns the current time, truncated to the previous microsecond. +/// +/// This exists because the database doesn't store nanosecond-precision, so if +/// we store nanosecond-precision timestamps, then DateTime conversion is lossy +/// when round-tripping through the database. That's rather inconvenient. +fn now() -> DateTime { + let ts = Utc::now(); + let nanosecs = ts.timestamp_subsec_nanos(); + let micros = ts.timestamp_subsec_micros(); + let only_nanos = nanosecs - micros * 1000; + ts - std::time::Duration::from_nanos(u64::from(only_nanos)) +} + +#[cfg(test)] +mod test { + use super::now; + use super::CollectionBuilder; + use crate::examples::representative; + use crate::examples::sp_state; + use crate::examples::Representative; + use gateway_client::types::PowerState; + use gateway_client::types::RotSlot; + use gateway_client::types::RotState; + use gateway_client::types::SpComponentCaboose; + use gateway_client::types::SpState; + use gateway_client::types::SpType; + use nexus_types::inventory::BaseboardId; + use nexus_types::inventory::Caboose; + use nexus_types::inventory::CabooseWhich; + + // Verify the contents of an empty collection. + #[test] + fn test_empty() { + let time_before = now(); + let builder = CollectionBuilder::new("test_empty"); + let collection = builder.build(); + let time_after = now(); + + assert!(collection.errors.is_empty()); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "test_empty"); + assert!(collection.baseboards.is_empty()); + assert!(collection.cabooses.is_empty()); + assert!(collection.sps.is_empty()); + assert!(collection.rots.is_empty()); + assert!(collection.cabooses_found.is_empty()); + } + + // Simple test of a single, fairly typical collection that contains just + // about all kinds of valid data. That includes exercising: + // + // - all three baseboard types (switch, sled, PSC) + // - various valid values for all fields (sources, slot numbers, power + // states, baseboard revisions, cabooses, etc.) + // - some empty slots + // - some missing cabooses + // - some cabooses common to multiple baseboards; others not + // - serial number reused across different model numbers + // + // This test is admittedly pretty tedious and maybe not worthwhile but it's + // a useful quick check. 
+ #[test] + fn test_basic() { + let time_before = now(); + let Representative { + builder, + sleds: [sled1_bb, sled2_bb, sled3_bb], + switch, + psc, + } = representative(); + let collection = builder.build(); + let time_after = now(); + println!("{:#?}", collection); + assert!(time_before <= collection.time_started); + assert!(collection.time_started <= collection.time_done); + assert!(collection.time_done <= time_after); + assert_eq!(collection.collector, "example"); + + // Verify the one error that ought to have been produced for the SP with + // no RoT information. + assert_eq!( + collection.errors.iter().map(|e| e.to_string()).collect::>(), + ["MGS \"fake MGS 1\": reading RoT state for BaseboardId \ + { part_number: \"model1\", serial_number: \"s2\" }: test suite \ + injected error"] + ); + + // Verify the baseboard ids found. + let expected_baseboards = + &[&sled1_bb, &sled2_bb, &sled3_bb, &switch, &psc]; + for bb in expected_baseboards { + assert!(collection.baseboards.contains(*bb)); + } + assert_eq!(collection.baseboards.len(), expected_baseboards.len()); + + // Verify the stuff that's easy to verify for all SPs: timestamps. + assert_eq!(collection.sps.len(), collection.baseboards.len()); + for (bb, sp) in collection.sps.iter() { + assert!(collection.time_started <= sp.time_collected); + assert!(sp.time_collected <= collection.time_done); + + if let Some(rot) = collection.rots.get(bb) { + assert_eq!(rot.source, sp.source); + assert_eq!(rot.time_collected, sp.time_collected); + } + + for which in [CabooseWhich::SpSlot0, CabooseWhich::SpSlot1] { + let caboose = collection.caboose_for(which, bb); + if let Some(c) = caboose { + assert!(collection.time_started <= c.time_collected); + assert!(c.time_collected <= collection.time_done); + assert!(collection.cabooses.contains(&c.caboose)); + } + } + } + + // Verify the common caboose. + let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch]; + let common_caboose = Caboose { + board: String::from("board_1"), + git_commit: String::from("git_commit_1"), + name: String::from("name_1"), + version: String::from("version_1"), + }; + for bb in &common_caboose_baseboards { + let _ = collection.sps.get(*bb).unwrap(); + let c0 = collection.caboose_for(CabooseWhich::SpSlot0, bb).unwrap(); + let c1 = collection.caboose_for(CabooseWhich::SpSlot1, bb).unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + + let _ = collection.rots.get(*bb).unwrap(); + let c0 = + collection.caboose_for(CabooseWhich::RotSlotA, bb).unwrap(); + let c1 = + collection.caboose_for(CabooseWhich::RotSlotB, bb).unwrap(); + assert_eq!(c0.source, "test suite"); + assert_eq!(*c0.caboose, common_caboose); + assert_eq!(c1.source, "test suite"); + assert_eq!(*c1.caboose, common_caboose); + } + assert!(collection.cabooses.contains(&common_caboose)); + + // Verify the specific, different data for the healthy SPs and RoTs that + // we reported. 
+ // sled1 + let sp = collection.sps.get(&sled1_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 3); + assert_eq!(sp.baseboard_revision, 0); + assert_eq!(sp.hubris_archive, "hubris1"); + assert_eq!(sp.power_state, PowerState::A0); + let rot = collection.rots.get(&sled1_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::A); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest1" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest1" + ); + assert_eq!(rot.transient_boot_preference, None); + + // sled2 + let sp = collection.sps.get(&sled2_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 4); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris2"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&sled2_bb).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, Some(RotSlot::A)); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest2" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest2" + ); + assert_eq!(rot.transient_boot_preference, Some(RotSlot::B)); + + // switch + let sp = collection.sps.get(&switch).unwrap(); + assert_eq!(sp.source, "fake MGS 2"); + assert_eq!(sp.sp_type, SpType::Switch); + assert_eq!(sp.sp_slot, 0); + assert_eq!(sp.baseboard_revision, 2); + assert_eq!(sp.hubris_archive, "hubris3"); + assert_eq!(sp.power_state, PowerState::A1); + let rot = collection.rots.get(&switch).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest3" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest3" + ); + assert_eq!(rot.transient_boot_preference, None); + + // PSC + let sp = collection.sps.get(&psc).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Power); + assert_eq!(sp.sp_slot, 1); + assert_eq!(sp.baseboard_revision, 3); + assert_eq!(sp.hubris_archive, "hubris4"); + assert_eq!(sp.power_state, PowerState::A2); + let rot = collection.rots.get(&psc).unwrap(); + assert_eq!(rot.active_slot, RotSlot::B); + assert_eq!(rot.pending_persistent_boot_preference, None); + assert_eq!(rot.persistent_boot_preference, RotSlot::A); + assert_eq!( + rot.slot_a_sha3_256_digest.as_ref().unwrap(), + "slotAdigest4" + ); + assert_eq!( + rot.slot_b_sha3_256_digest.as_ref().unwrap(), + "slotBdigest4" + ); + assert_eq!(rot.transient_boot_preference, None); + + // The PSC has four different cabooses! 
+ let c = &collection + .caboose_for(CabooseWhich::SpSlot0, &psc) + .unwrap() + .caboose; + assert_eq!(c.board, "board_psc_sp_0"); + assert!(collection.cabooses.contains(c)); + let c = &collection + .caboose_for(CabooseWhich::SpSlot1, &psc) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "board_psc_sp_1"); + let c = &collection + .caboose_for(CabooseWhich::RotSlotA, &psc) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "board_psc_rot_a"); + let c = &collection + .caboose_for(CabooseWhich::RotSlotB, &psc) + .unwrap() + .caboose; + assert!(collection.cabooses.contains(c)); + assert_eq!(c.board, "board_psc_rot_b"); + + // Verify the reported SP state for sled3, which did not have a healthy + // RoT, nor any cabooses. + let sp = collection.sps.get(&sled3_bb).unwrap(); + assert_eq!(sp.source, "fake MGS 1"); + assert_eq!(sp.sp_type, SpType::Sled); + assert_eq!(sp.sp_slot, 5); + assert_eq!(sp.baseboard_revision, 1); + assert_eq!(sp.hubris_archive, "hubris5"); + assert_eq!(sp.power_state, PowerState::A2); + assert!(collection + .caboose_for(CabooseWhich::SpSlot0, &sled3_bb) + .is_none()); + assert!(collection + .caboose_for(CabooseWhich::SpSlot1, &sled3_bb) + .is_none()); + assert!(!collection.rots.contains_key(&sled3_bb)); + + // There shouldn't be any other RoTs. + assert_eq!(collection.sps.len(), collection.rots.len() + 1); + + // There should be five cabooses: the four used for the PSC (see above), + // plus the common one. + assert_eq!(collection.cabooses.len(), 5); + } + + // Exercises all the failure cases that shouldn't happen in real systems. + // Despite all of these failures, we should get a valid collection at the + // end. + #[test] + fn test_problems() { + let mut builder = CollectionBuilder::new("test_problems"); + + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // report the same SP again with the same contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report the same SP again with different contents + let sled1_bb_dup = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: None, + slot_b_sha3_256_digest: None, + transient_boot_preference: 
None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + assert_eq!(sled1_bb, sled1_bb_dup); + + // report an SP with an impossible slot number + let sled2_sp = builder.found_sp_state( + "fake MGS 1", + SpType::Sled, + u32::from(u16::MAX) + 1, + sp_state("1"), + ); + assert_eq!(sled2_sp, None); + + // report SP caboose for an unknown baseboard + let bogus_baseboard = BaseboardId { + part_number: String::from("p1"), + serial_number: String::from("bogus"), + }; + let caboose1 = SpComponentCaboose { + board: String::from("board1"), + git_commit: String::from("git_commit1"), + name: String::from("name1"), + version: String::from("version1"), + }; + assert!(!builder + .found_caboose_already(&bogus_baseboard, CabooseWhich::SpSlot0)); + let error = builder + .found_caboose( + &bogus_baseboard, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + error.to_string(), + "reporting caboose for unknown baseboard: \ + BaseboardId { part_number: \"p1\", serial_number: \"bogus\" } \ + (Caboose { board: \"board1\", git_commit: \"git_commit1\", \ + name: \"name1\", version: \"version1\" })" + ); + assert!(!builder + .found_caboose_already(&bogus_baseboard, CabooseWhich::SpSlot0)); + + // report RoT caboose for an unknown baseboard + let error2 = builder + .found_caboose( + &bogus_baseboard, + CabooseWhich::RotSlotA, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!(error.to_string(), error2.to_string(),); + + // report the same caboose twice with the same contents + let _ = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap(); + let error = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + caboose1.clone(), + ) + .unwrap_err(); + assert_eq!( + format!("{:#}", error), + "baseboard BaseboardId { part_number: \"model1\", \ + serial_number: \"s1\" } caboose SpSlot0: reported multiple \ + times (same value)" + ); + // report the same caboose again with different contents + let error = builder + .found_caboose( + &sled1_bb, + CabooseWhich::SpSlot0, + "dummy", + SpComponentCaboose { + board: String::from("board2"), + git_commit: String::from("git_commit2"), + name: String::from("name2"), + version: String::from("version2"), + }, + ) + .unwrap_err(); + let message = format!("{:#}", error); + println!("found error: {}", message); + assert!(message.contains( + "caboose SpSlot0: reported caboose multiple times (previously" + )); + assert!(message.contains(", now ")); + + // We should still get a valid collection. + let collection = builder.build(); + println!("{:#?}", collection); + assert_eq!(collection.collector, "test_problems"); + + // We should still have the one sled and its SP slot0 caboose. + assert!(collection.baseboards.contains(&sled1_bb)); + let _ = collection.sps.get(&sled1_bb).unwrap(); + let caboose = + collection.caboose_for(CabooseWhich::SpSlot0, &sled1_bb).unwrap(); + assert_eq!(caboose.caboose.board, "board2"); + assert!(collection.cabooses.contains(&caboose.caboose)); + assert!(collection + .caboose_for(CabooseWhich::SpSlot1, &sled1_bb) + .is_none()); + let _ = collection.rots.get(&sled1_bb).unwrap(); + assert!(collection + .caboose_for(CabooseWhich::RotSlotA, &sled1_bb) + .is_none()); + assert!(collection + .caboose_for(CabooseWhich::RotSlotB, &sled1_bb) + .is_none()); + + // We should see an error. 
+ assert_eq!( + collection + .errors + .iter() + .map(|e| format!("{:#}", e)) + .collect::>(), + vec![ + "MGS \"fake MGS 1\": SP Sled slot 65536: \ + slot number did not fit into u16" + ] + ); + } +} diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs new file mode 100644 index 0000000000..d40b09d2be --- /dev/null +++ b/nexus/inventory/src/collector.rs @@ -0,0 +1,389 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collection of inventory from Omicron components + +use crate::builder::CollectionBuilder; +use anyhow::Context; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; +use slog::{debug, error}; +use std::sync::Arc; +use strum::IntoEnumIterator; + +pub struct Collector { + log: slog::Logger, + mgs_clients: Vec>, + in_progress: CollectionBuilder, +} + +impl Collector { + pub fn new( + creator: &str, + mgs_clients: &[Arc], + log: slog::Logger, + ) -> Self { + Collector { + log, + mgs_clients: mgs_clients.to_vec(), + in_progress: CollectionBuilder::new(creator), + } + } + + /// Begin the process of collecting a complete hardware/software inventory + /// of the rack + /// + /// The collection process makes a bunch of requests to a bunch of + /// components. This can take a while and produce any number of errors. + /// Such errors generally don't cause this function to fail. Rather, the + /// returned `Collection` keeps track of these errors. + pub async fn collect_all(mut self) -> Result { + // We're about to do a bunch of asynchronous operations. With a + // combination of async, futures, and some cleverness, we could do much + // of this in parallel. But this code path is not remotely + // latency-sensitive. And there's real risk of overloading our + // downstream services. So we just do one step at a time. This also + // keeps the code simpler. + + debug!(&self.log, "begin collection"); + + // When we add stages to collect from other components (e.g., sled + // agents), those will go here. + self.collect_all_mgs().await; + + debug!(&self.log, "finished collection"); + + Ok(self.in_progress.build()) + } + + /// Collect inventory from all MGS instances + async fn collect_all_mgs(&mut self) { + let clients = self.mgs_clients.clone(); + for client in &clients { + self.collect_one_mgs(&client).await; + } + } + + async fn collect_one_mgs(&mut self, client: &gateway_client::Client) { + debug!(&self.log, "begin collection from MGS"; + "mgs_url" => client.baseurl() + ); + + // First, see which SPs MGS can see via Ignition. + let ignition_result = client.ignition_list().await.with_context(|| { + format!("MGS {:?}: listing ignition targets", client.baseurl()) + }); + + // Select only the SPs that appear powered on. + // + // This choice is debatable. It's conceivable that an SP could be + // functioning but not visible to ignition. In that case, we'd be + // better off trying to ask MGS about it even though ignition reports it + // powered off. But in practice, if ignition can't see it, it's much + // more likely that there's just nothing plugged in. And in that case, + // if we try to ask MGS about it, we have to wait for MGS to time out + // its attempt to reach it (currently several seconds). This choice + // enables inventory to complete much faster, at the expense of not + // being able to identify this particular condition. 
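        // (In the match below, targets that ignition reports as absent
        // (`SpIgnition::No`) and targets that are present but powered off are
        // both skipped; only `Yes { power: true, .. }` targets are queried
        // further.)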
+ let sps = match ignition_result { + Err(error) => { + self.in_progress.found_error(error); + return; + } + + Ok(targets) => { + targets.into_inner().into_iter().filter_map(|sp_ignition| { + match sp_ignition.details { + gateway_client::types::SpIgnition::No => None, + gateway_client::types::SpIgnition::Yes { + power: false, + .. + } => None, + gateway_client::types::SpIgnition::Yes { + power: true, + .. + } => Some(sp_ignition.id), + } + }) + } + }; + + // For each SP that ignition reports up, fetch the state and caboose + // information. + for sp in sps { + // First, fetch the state of the SP. If that fails, report the + // error but continue. + let result = + client.sp_get(sp.type_, sp.slot).await.with_context(|| { + format!( + "MGS {:?}: fetching state of SP {:?}", + client.baseurl(), + sp + ) + }); + let sp_state = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + + // Record the state that we found. + let Some(baseboard_id) = self.in_progress.found_sp_state( + client.baseurl(), + sp.type_, + sp.slot, + sp_state, + ) else { + // We failed to parse this SP for some reason. The error was + // reported already. Move on. + continue; + }; + + // For each kind of caboose that we care about, if it hasn't been + // fetched already, fetch it and record it. Generally, we'd only + // get here for the first MGS client. Assuming that one succeeds, + // the other(s) will skip this loop. + for which in CabooseWhich::iter() { + if self.in_progress.found_caboose_already(&baseboard_id, which) + { + continue; + } + + let (component, slot) = match which { + CabooseWhich::SpSlot0 => ("sp", 0), + CabooseWhich::SpSlot1 => ("sp", 1), + CabooseWhich::RotSlotA => ("rot", 0), + CabooseWhich::RotSlotB => ("rot", 1), + }; + + let result = client + .sp_component_caboose_get( + sp.type_, sp.slot, component, slot, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {:?}: caboose {:?}", + client.baseurl(), + sp, + which + ) + }); + let caboose = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + if let Err(error) = self.in_progress.found_caboose( + &baseboard_id, + which, + client.baseurl(), + caboose, + ) { + error!( + &self.log, + "error reporting caboose: {:?} {:?} {:?}: {:#}", + baseboard_id, + which, + client.baseurl(), + error + ); + } + } + } + } +} + +#[cfg(test)] +mod test { + use super::Collector; + use gateway_messages::SpPort; + use nexus_types::inventory::Collection; + use std::fmt::Write; + use std::sync::Arc; + + fn dump_collection(collection: &Collection) -> String { + // Construct a stable, human-readable summary of the Collection + // contents. We could use a `Debug` impl for this, but that's not quite + // right: when debugging, for example, we want fields like the ids, but + // these change each time and we don't want to include them here. + // `Serialize` has the same problem -- the set of fields to include + // depends on what the serialization is for. It's easy enough to just + // print what we want here. 
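        // (The summary built here is what the expectorate-based tests below
        // compare against tests/output/collector_basic.txt and
        // tests/output/collector_errors.txt.)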
+ let mut s = String::new(); + write!(&mut s, "baseboards:\n").unwrap(); + for b in &collection.baseboards { + write!( + &mut s, + " part {:?} serial {:?}\n", + b.part_number, b.serial_number + ) + .unwrap(); + } + + write!(&mut s, "\ncabooses:\n").unwrap(); + for c in &collection.cabooses { + write!( + &mut s, + " board {:?} name {:?} version {:?} git_commit {:?}\n", + c.board, c.name, c.version, c.git_commit, + ) + .unwrap(); + } + + // All we really need to check here is that we're reporting the right + // SPs, RoTs, and cabooses. The actual SP data, RoT data, and caboose + // data comes straight from MGS. And proper handling of that data is + // tested in the builder. + write!(&mut s, "\nSPs:\n").unwrap(); + for (bb, _) in &collection.sps { + write!( + &mut s, + " baseboard part {:?} serial {:?}\n", + bb.part_number, bb.serial_number, + ) + .unwrap(); + } + + write!(&mut s, "\nRoTs:\n").unwrap(); + for (bb, _) in &collection.rots { + write!( + &mut s, + " baseboard part {:?} serial {:?}\n", + bb.part_number, bb.serial_number, + ) + .unwrap(); + } + + write!(&mut s, "\ncabooses found:\n").unwrap(); + for (kind, bb_to_found) in &collection.cabooses_found { + for (bb, found) in bb_to_found { + write!( + &mut s, + " {:?} baseboard part {:?} serial {:?}: board {:?}\n", + kind, bb.part_number, bb.serial_number, found.caboose.board, + ) + .unwrap(); + } + } + + write!(&mut s, "\nerrors:\n").unwrap(); + for e in &collection.errors { + // Some error strings have OS error numbers in them. We want to + // ignore those, particularly for CI, which runs these tests on + // multiple OSes. + let message = regex::Regex::new(r"os error \d+") + .unwrap() + .replace_all(&e, "os error <>"); + write!(&mut s, "error: {}\n", message).unwrap(); + } + + s + } + + #[tokio::test] + async fn test_basic() { + // Set up the stock MGS test setup which includes a couple of fake SPs. + // Then run a collection against it. + let gwtestctx = + gateway_test_utils::setup::test_setup("test_basic", SpPort::One) + .await; + let log = &gwtestctx.logctx.log; + let mgs_url = format!("http://{}/", gwtestctx.client.bind_address); + let mgs_client = + Arc::new(gateway_client::Client::new(&mgs_url, log.clone())); + let collector = + Collector::new("test-suite", &[mgs_client], log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert!(collection.errors.is_empty()); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_basic.txt", &s); + + gwtestctx.teardown().await; + } + + #[tokio::test] + async fn test_multi_mgs() { + // This is the same as the basic test, but we set up two different MGS + // instances and point the collector at both. We should get the same + // result. 
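        // (The expected output can be shared with test_basic because the
        // builder de-duplicates what the second MGS reports: found_sp_state()
        // keeps the first record for each baseboard, and the collector skips
        // cabooses it has already seen via found_caboose_already().)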
+ let gwtestctx1 = gateway_test_utils::setup::test_setup( + "test_multi_mgs_1", + SpPort::One, + ) + .await; + let gwtestctx2 = gateway_test_utils::setup::test_setup( + "test_multi_mgs_2", + SpPort::Two, + ) + .await; + let log = &gwtestctx1.logctx.log; + let mgs_clients = [&gwtestctx1, &gwtestctx2] + .into_iter() + .map(|g| { + let url = format!("http://{}/", g.client.bind_address); + let client = gateway_client::Client::new(&url, log.clone()); + Arc::new(client) + }) + .collect::>(); + let collector = Collector::new("test-suite", &mgs_clients, log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert!(collection.errors.is_empty()); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_basic.txt", &s); + + gwtestctx1.teardown().await; + gwtestctx2.teardown().await; + } + + #[tokio::test] + async fn test_multi_mgs_failure() { + // This is similar to the multi-MGS test, but we don't actually set up + // the second MGS. To the collector, it should look offline or + // otherwise non-functional. + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_multi_mgs_2", + SpPort::Two, + ) + .await; + let log = &gwtestctx.logctx.log; + let real_client = { + let url = format!("http://{}/", gwtestctx.client.bind_address); + let client = gateway_client::Client::new(&url, log.clone()); + Arc::new(client) + }; + let bad_client = { + // This IP range is guaranteed by RFC 6666 to discard traffic. + let url = "http://[100::1]:12345"; + let client = gateway_client::Client::new(url, log.clone()); + Arc::new(client) + }; + let mgs_clients = &[bad_client, real_client]; + let collector = Collector::new("test-suite", mgs_clients, log.clone()); + let collection = collector + .collect_all() + .await + .expect("failed to carry out collection"); + assert_eq!(collection.collector, "test-suite"); + + let s = dump_collection(&collection); + expectorate::assert_contents("tests/output/collector_errors.txt", &s); + + gwtestctx.teardown().await; + } +} diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs new file mode 100644 index 0000000000..52aca397bb --- /dev/null +++ b/nexus/inventory/src/examples.rs @@ -0,0 +1,254 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Example collections used for testing + +use crate::CollectionBuilder; +use gateway_client::types::PowerState; +use gateway_client::types::RotSlot; +use gateway_client::types::RotState; +use gateway_client::types::SpComponentCaboose; +use gateway_client::types::SpState; +use gateway_client::types::SpType; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::CabooseWhich; +use std::sync::Arc; +use strum::IntoEnumIterator; + +/// Returns an example Collection used for testing +/// +/// This collection is intended to cover a variety of possible inventory data, +/// including: +/// +/// - all three baseboard types (switch, sled, PSC) +/// - various valid values for all fields (sources, slot numbers, power +/// states, baseboard revisions, cabooses, etc.) 
+/// - some empty slots +/// - some missing cabooses +/// - some cabooses common to multiple baseboards; others not +/// - serial number reused across different model numbers +pub fn representative() -> Representative { + let mut builder = CollectionBuilder::new("example"); + + // an ordinary, working sled + let sled1_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 3, + SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("hubris1"), + model: String::from("model1"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest1")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest1")), + transient_boot_preference: None, + }, + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // another ordinary sled with different values for ordinary fields + let sled2_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Sled, + 4, + SpState { + base_mac_address: [1; 6], + hubris_archive_id: String::from("hubris2"), + model: String::from("model2"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: Some(RotSlot::A), + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest2")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest2")), + transient_boot_preference: Some(RotSlot::B), + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a switch + let switch1_bb = builder + .found_sp_state( + "fake MGS 2", + SpType::Switch, + 0, + SpState { + base_mac_address: [2; 6], + hubris_archive_id: String::from("hubris3"), + model: String::from("model3"), + power_state: PowerState::A1, + revision: 2, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest3")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest3")), + transient_boot_preference: None, + }, + // same serial number, which is okay because it's a + // different model number + serial_number: String::from("s1"), + }, + ) + .unwrap(); + + // a PSC + let psc_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Power, + 1, + SpState { + base_mac_address: [3; 6], + hubris_archive_id: String::from("hubris4"), + model: String::from("model4"), + power_state: PowerState::A2, + revision: 3, + rot: RotState::Enabled { + active: RotSlot::B, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest4")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest4")), + transient_boot_preference: None, + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // a sled with no RoT state or other optional fields + let sled3_bb = builder + .found_sp_state( + "fake MGS 1", + SpType::Sled, + 5, + SpState { + base_mac_address: [4; 6], + hubris_archive_id: String::from("hubris5"), + model: String::from("model1"), + power_state: PowerState::A2, + revision: 1, + rot: RotState::CommunicationFailed { + message: String::from("test suite injected error"), + }, + serial_number: String::from("s2"), + }, + ) + .unwrap(); + + // Report some cabooses. 
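    // (The shared caboose("1") used below is what produces the
    // board_1/name_1/version_1 "common caboose" that the builder's test_basic
    // asserts is shared across sled1, sled2, and the switch.)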
+ + // We'll use the same cabooses for most of these components, although + // that's not possible in a real system. We deliberately construct a + // new value each time to make sure the builder correctly normalizes it. + let common_caboose_baseboards = [&sled1_bb, &sled2_bb, &switch1_bb]; + for bb in &common_caboose_baseboards { + for which in CabooseWhich::iter() { + assert!(!builder.found_caboose_already(bb, which)); + let _ = builder + .found_caboose(bb, which, "test suite", caboose("1")) + .unwrap(); + assert!(builder.found_caboose_already(bb, which)); + } + } + + // For the PSC, use different cabooses for both slots of both the SP and + // RoT, just to exercise that we correctly keep track of different + // cabooses. + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::SpSlot0, + "test suite", + caboose("psc_sp_0"), + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::SpSlot1, + "test suite", + caboose("psc_sp_1"), + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::RotSlotA, + "test suite", + caboose("psc_rot_a"), + ) + .unwrap(); + let _ = builder + .found_caboose( + &psc_bb, + CabooseWhich::RotSlotB, + "test suite", + caboose("psc_rot_b"), + ) + .unwrap(); + + // We deliberately provide no cabooses for sled3. + + Representative { + builder, + sleds: [sled1_bb, sled2_bb, sled3_bb], + switch: switch1_bb, + psc: psc_bb, + } +} + +pub struct Representative { + pub builder: CollectionBuilder, + pub sleds: [Arc; 3], + pub switch: Arc, + pub psc: Arc, +} + +/// Returns an SP state that can be used to populate a collection for testing +pub fn sp_state(unique: &str) -> SpState { + SpState { + base_mac_address: [0; 6], + hubris_archive_id: format!("hubris{}", unique), + model: format!("model{}", unique), + power_state: PowerState::A2, + revision: 0, + rot: RotState::Enabled { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_sha3_256_digest: Some(String::from("slotAdigest1")), + slot_b_sha3_256_digest: Some(String::from("slotBdigest1")), + transient_boot_preference: None, + }, + serial_number: format!("serial{}", unique), + } +} + +pub fn caboose(unique: &str) -> SpComponentCaboose { + SpComponentCaboose { + board: format!("board_{}", unique), + git_commit: format!("git_commit_{}", unique), + name: format!("name_{}", unique), + version: format!("version_{}", unique), + } +} diff --git a/nexus/inventory/src/lib.rs b/nexus/inventory/src/lib.rs new file mode 100644 index 0000000000..3a5f60b387 --- /dev/null +++ b/nexus/inventory/src/lib.rs @@ -0,0 +1,27 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Omicron component inventory +//! +//! This module provides [`Collector`], an interface for collecting a complete +//! hardware/software inventory in a running Omicron deployment +//! +//! This is really just the collection part. For separation of concerns, this +//! module doesn't know anything about storing these collections into the +//! database. That's provided by the datastore. The types associated with +//! collections are in `nexus_types::inventory` so they can be shared with other +//! parts of Nexus (like the datastore). +//! +//! This module lives inside Nexus but it has few dependencies on other parts of +//! Nexus. It could be incorporated into other components. (The corresponding +//! 
types in `nexus_types` might have to move, too) + +mod builder; +mod collector; +pub mod examples; + +// only exposed for test code to construct collections +pub use builder::CollectionBuilder; + +pub use collector::Collector; diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt new file mode 100644 index 0000000000..4a3bf62d63 --- /dev/null +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -0,0 +1,43 @@ +baseboards: + part "FAKE_SIM_GIMLET" serial "SimGimlet00" + part "FAKE_SIM_GIMLET" serial "SimGimlet01" + part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses: + board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" + board "SimSidecarRot" name "SimSidecar" version "0.0.1" git_commit "eeeeeeee" + board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" + +SPs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +RoTs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses found: + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + +errors: diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt new file mode 100644 index 0000000000..f231cc7d97 --- /dev/null +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -0,0 +1,44 @@ +baseboards: + part "FAKE_SIM_GIMLET" serial "SimGimlet00" + part "FAKE_SIM_GIMLET" serial "SimGimlet01" + part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses: + board "SimGimletRot" name "SimGimlet" version "0.0.1" git_commit "eeeeeeee" + board "SimGimletSp" name "SimGimlet" version "0.0.1" git_commit "ffffffff" + board "SimSidecarRot" name "SimSidecar" 
version "0.0.1" git_commit "eeeeeeee" + board "SimSidecarSp" name "SimSidecar" version "0.0.1" git_commit "ffffffff" + +SPs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +RoTs: + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00" + baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0" + baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1" + +cabooses found: + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot0 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarSp" + SpSlot1 baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarSp" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotA baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet00": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_GIMLET" serial "SimGimlet01": board "SimGimletRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar0": board "SimSidecarRot" + RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" + +errors: +error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error trying to connect: tcp connect error: Network is unreachable (os error <>): tcp connect error: Network is unreachable (os error <>): Network is unreachable (os error <>) diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index 3fcf0483a5..7b05eab61b 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -177,7 +177,7 @@ pub struct Driver { /// /// This is returned by [`Driver::register()`] to identify the corresponding /// background task. It's then accepted by functions like -/// [`Driver::activate()`] and [`Driver::status()`] to identify the task. +/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. #[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] pub struct TaskHandle(String); @@ -277,8 +277,8 @@ impl Driver { /// Enumerate all registered background tasks /// /// This is aimed at callers that want to get the status of all background - /// tasks. You'd call [`Driver::status()`] with each of the items produced - /// by the iterator. + /// tasks. You'd call [`Driver::task_status()`] with each of the items + /// produced by the iterator. 
pub fn tasks(&self) -> impl Iterator { self.tasks.keys() } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index aa949bbc9f..b000dd9bda 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -9,6 +9,7 @@ use super::dns_config; use super::dns_propagation; use super::dns_servers; use super::external_endpoints; +use super::inventory_collection; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -16,6 +17,7 @@ use omicron_common::nexus_config::BackgroundTaskConfig; use omicron_common::nexus_config::DnsTasksConfig; use std::collections::BTreeMap; use std::sync::Arc; +use uuid::Uuid; /// Describes ongoing background tasks and provides interfaces for working with /// them @@ -42,6 +44,9 @@ pub struct BackgroundTasks { pub external_endpoints: tokio::sync::watch::Receiver< Option, >, + + /// task handle for the task that collects inventory + pub task_inventory_collection: common::TaskHandle, } impl BackgroundTasks { @@ -50,6 +55,8 @@ impl BackgroundTasks { opctx: &OpContext, datastore: Arc, config: &BackgroundTaskConfig, + nexus_id: Uuid, + resolver: internal_dns::resolver::Resolver, ) -> BackgroundTasks { let mut driver = common::Driver::new(); @@ -70,8 +77,9 @@ impl BackgroundTasks { // Background task: External endpoints list watcher let (task_external_endpoints, external_endpoints) = { - let watcher = - external_endpoints::ExternalEndpointsWatcher::new(datastore); + let watcher = external_endpoints::ExternalEndpointsWatcher::new( + datastore.clone(), + ); let watcher_channel = watcher.watcher(); let task = driver.register( String::from("external_endpoints"), @@ -88,6 +96,30 @@ impl BackgroundTasks { (task, watcher_channel) }; + // Background task: inventory collector + let task_inventory_collection = { + let collector = inventory_collection::InventoryCollector::new( + datastore, + resolver, + &nexus_id.to_string(), + config.inventory.nkeep, + config.inventory.disable, + ); + let task = driver.register( + String::from("inventory_collection"), + String::from( + "collects hardware and software inventory data from the \ + whole system", + ), + config.inventory.period_secs, + Box::new(collector), + opctx.child(BTreeMap::new()), + vec![], + ); + + task + }; + BackgroundTasks { driver, task_internal_dns_config, @@ -96,6 +128,7 @@ impl BackgroundTasks { task_external_dns_servers, task_external_endpoints, external_endpoints, + task_inventory_collection, } } diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/inventory_collection.rs new file mode 100644 index 0000000000..f095b094db --- /dev/null +++ b/nexus/src/app/background/inventory_collection.rs @@ -0,0 +1,243 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Background task for reading inventory for the rack + +use super::common::BackgroundTask; +use anyhow::ensure; +use anyhow::Context; +use futures::future::BoxFuture; +use futures::FutureExt; +use internal_dns::ServiceName; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::inventory::Collection; +use serde_json::json; +use std::sync::Arc; + +/// Background task that reads inventory for the rack +pub struct InventoryCollector { + datastore: Arc, + resolver: internal_dns::resolver::Resolver, + creator: String, + nkeep: u32, + disable: bool, +} + +impl InventoryCollector { + pub fn new( + datastore: Arc, + resolver: internal_dns::resolver::Resolver, + creator: &str, + nkeep: u32, + disable: bool, + ) -> InventoryCollector { + InventoryCollector { + datastore, + resolver, + creator: creator.to_owned(), + nkeep, + disable, + } + } +} + +impl BackgroundTask for InventoryCollector { + fn activate<'a, 'b, 'c>( + &'a mut self, + opctx: &'b OpContext, + ) -> BoxFuture<'c, serde_json::Value> + where + 'a: 'c, + 'b: 'c, + { + async { + match inventory_activate( + opctx, + &self.datastore, + &self.resolver, + &self.creator, + self.nkeep, + self.disable, + ) + .await + .context("failed to collect inventory") + { + Err(error) => { + let message = format!("{:#}", error); + warn!(opctx.log, "inventory collection failed"; + "error" => message.clone()); + json!({ "error": message }) + } + Ok(collection) => { + debug!(opctx.log, "inventory collection complete"; + "collection_id" => collection.id.to_string(), + "time_started" => collection.time_started.to_string(), + ); + json!({ + "collection_id": collection.id.to_string(), + "time_started": collection.time_started.to_string(), + "time_done": collection.time_done.to_string() + }) + } + } + } + .boxed() + } +} + +async fn inventory_activate( + opctx: &OpContext, + datastore: &DataStore, + resolver: &internal_dns::resolver::Resolver, + creator: &str, + nkeep: u32, + disabled: bool, +) -> Result { + // If we're disabled, don't do anything. (This switch is only intended for + // unforeseen production emergencies.) + ensure!(!disabled, "disabled by explicit configuration"); + + // Prune old collections. We do this first, here, to ensure that we never + // develop an unbounded backlog of collections. (If this process were done + // by a separate task, it would be possible for the backlog to grow + // unbounded if that task were simply slower than the collection process, + // let alone if there were some kind of extended operational issue + // blocking deletion.) + datastore + .inventory_prune_collections(opctx, nkeep) + .await + .context("pruning old collections")?; + + // Find MGS clients. + let mgs_clients = resolver + .lookup_all_socket_v6(ServiceName::ManagementGatewayService) + .await + .context("looking up MGS addresses")? + .into_iter() + .map(|sockaddr| { + let url = format!("http://{}", sockaddr); + let log = opctx.log.new(o!("gateway_url" => url.clone())); + Arc::new(gateway_client::Client::new(&url, log)) + }) + .collect::>(); + + // Run a collection. + let inventory = nexus_inventory::Collector::new( + creator, + &mgs_clients, + opctx.log.clone(), + ); + let collection = + inventory.collect_all().await.context("collecting inventory")?; + + // Write it to the database. 
+ datastore + .inventory_insert_collection(opctx, &collection) + .await + .context("saving inventory to database")?; + + Ok(collection) +} + +#[cfg(test)] +mod test { + use crate::app::background::common::BackgroundTask; + use crate::app::background::inventory_collection::InventoryCollector; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::datastore::DataStoreInventoryTest; + use nexus_test_utils_macros::nexus_test; + use omicron_test_utils::dev::poll; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + // Test that each activation creates a new collection and that we prune old + // collections, too. + #[nexus_test(server = crate::Server)] + async fn test_basic(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Nexus starts the very background task that we're also testing + // manually here. As a result, we should find a collection in the + // database before too long. Wait for it so that after it appears, we + // can assume the rest of the collections came from the instance that + // we're testing. + let mut last_collections = + poll::wait_for_condition::<_, anyhow::Error, _, _>( + || async { + let collections = datastore + .inventory_collections() + .await + .map_err(poll::CondCheckError::Failed)?; + if collections.is_empty() { + Err(poll::CondCheckError::NotYet) + } else { + Ok(collections) + } + }, + &std::time::Duration::from_millis(50), + &std::time::Duration::from_secs(15), + ) + .await + .expect("background task did not populate initial collection"); + + let resolver = internal_dns::resolver::Resolver::new_from_addrs( + cptestctx.logctx.log.clone(), + &[cptestctx.internal_dns.dns_server.local_address()], + ) + .unwrap(); + + // Now we'll create our own copy of the background task and activate it + // a bunch and make sure that it always creates a new collection and + // does not allow a backlog to accumulate. + let nkeep = 3; + let mut task = InventoryCollector::new( + datastore.clone(), + resolver.clone(), + "me", + nkeep, + false, + ); + let nkeep = usize::try_from(nkeep).unwrap(); + for i in 0..10 { + let _ = task.activate(&opctx).await; + let collections = datastore.inventory_collections().await.unwrap(); + println!( + "iter {}: last = {:?}, current = {:?}", + i, last_collections, collections + ); + + let expected_from_last: Vec<_> = if last_collections.len() <= nkeep + { + last_collections + } else { + last_collections.into_iter().skip(1).collect() + }; + let expected_from_current: Vec<_> = + collections.iter().rev().skip(1).rev().cloned().collect(); + assert_eq!(expected_from_last, expected_from_current); + assert_eq!(collections.len(), std::cmp::min(i + 2, nkeep + 1)); + last_collections = collections; + } + + // Create a disabled task and make sure that does nothing. 
+ let mut task = InventoryCollector::new( + datastore.clone(), + resolver, + "disabled", + 3, + true, + ); + let previous = datastore.inventory_collections().await.unwrap(); + let _ = task.activate(&opctx).await; + let latest = datastore.inventory_collections().await.unwrap(); + assert_eq!(previous, latest); + } +} diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 9ba0780246..e1f474b41a 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -10,6 +10,7 @@ mod dns_propagation; mod dns_servers; mod external_endpoints; mod init; +mod inventory_collection; mod status; pub use common::Driver; diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 7db93a158a..ef8132451a 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -349,6 +349,8 @@ impl Nexus { &background_ctx, Arc::clone(&db_datastore), &config.pkg.background_tasks, + config.deployment.id, + resolver.clone(), ); let external_resolver = { diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 7697d34ecd..bed690f839 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -237,6 +237,7 @@ impl super::Nexus { &self.background_tasks.task_external_dns_config, &self.background_tasks.task_external_dns_servers, &self.background_tasks.task_external_endpoints, + &self.background_tasks.task_inventory_collection, ] { self.background_tasks.activate(task); } diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 8cd25582be..56cee27b37 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -14,6 +14,8 @@ crucible-agent-client.workspace = true dns-server.workspace = true dns-service-client.workspace = true dropshot.workspace = true +gateway-messages.workspace = true +gateway-test-utils.workspace = true headers.workspace = true http.workspace = true hyper.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 701a6e8ba9..647232031d 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -14,6 +14,7 @@ use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::HandlerTaskMode; +use gateway_test_utils::setup::GatewayTestContext; use nexus_test_interface::NexusServer; use nexus_types::external_api::params::UserId; use nexus_types::internal_api::params::Certificate; @@ -86,6 +87,7 @@ pub struct ControlPlaneTestContext { pub sled_agent: sim::Server, pub oximeter: Oximeter, pub producer: ProducerServer, + pub gateway: GatewayTestContext, pub dendrite: HashMap, pub mgd: HashMap, pub external_dns_zone_name: String, @@ -107,6 +109,7 @@ impl ControlPlaneTestContext { self.sled_agent.http_server.close().await.unwrap(); self.oximeter.close().await.unwrap(); self.producer.close().await.unwrap(); + self.gateway.teardown().await; for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -226,6 +229,7 @@ impl RackInitRequestBuilder { pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub config: &'a mut omicron_common::nexus_config::Config, + test_name: &'a str, rack_init_builder: RackInitRequestBuilder, pub start_time: chrono::DateTime, @@ -241,6 +245,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub sled_agent: Option, pub oximeter: Option, pub producer: Option, + pub gateway: Option, pub dendrite: HashMap, pub mgd: HashMap, @@ -259,7 +264,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { impl<'a, N: NexusServer> 
ControlPlaneTestContextBuilder<'a, N> { pub fn new( - test_name: &str, + test_name: &'a str, config: &'a mut omicron_common::nexus_config::Config, ) -> Self { let start_time = chrono::Utc::now(); @@ -267,6 +272,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { Self { config, + test_name, rack_init_builder: RackInitRequestBuilder::new(), start_time, logctx, @@ -279,6 +285,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { sled_agent: None, oximeter: None, producer: None, + gateway: None, dendrite: HashMap::new(), mgd: HashMap::new(), nexus_internal: None, @@ -377,6 +384,37 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { .set_port(port); } + pub async fn start_gateway(&mut self) { + // For now, this MGS is not configured to match up in any way with + // either the simulated sled agent or the Dendrite instances. It's + // useful for testing stuff unrelated to that. But at some point we + // will probably want the reported data to match up better. + debug!(&self.logctx.log, "Starting Management Gateway"); + let gateway = gateway_test_utils::setup::test_setup( + self.test_name, + gateway_messages::SpPort::One, + ) + .await; + let fake_mgs_zone_id = Uuid::new_v4(); + let SocketAddr::V6(v6addr) = gateway.client.bind_address else { + panic!("MGS unexpectedly listening on IPv4?"); + }; + let zone = self + .rack_init_builder + .internal_dns_config + .host_zone(fake_mgs_zone_id, *v6addr.ip()) + .expect("Failed to add DNS for MGS zone"); + self.rack_init_builder + .internal_dns_config + .service_backend_zone( + internal_dns::ServiceName::ManagementGatewayService, + &zone, + v6addr.port(), + ) + .expect("Failed to add DNS for MGS service"); + self.gateway = Some(gateway); + } + pub async fn start_dendrite(&mut self, switch_location: SwitchLocation) { let log = &self.logctx.log; debug!(log, "Starting Dendrite for {switch_location}"); @@ -796,6 +834,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { oximeter: self.oximeter.unwrap(), producer: self.producer.unwrap(), logctx: self.logctx, + gateway: self.gateway.unwrap(), dendrite: self.dendrite, mgd: self.mgd, external_dns_zone_name: self.external_dns_zone_name.unwrap(), @@ -825,6 +864,9 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { if let Some(producer) = self.producer { producer.close().await.unwrap(); } + if let Some(gateway) = self.gateway { + gateway.teardown().await; + } for (_, mut dendrite) in self.dendrite { dendrite.cleanup().await.unwrap(); } @@ -919,6 +961,7 @@ async fn setup_with_config_impl( ) -> ControlPlaneTestContext { builder.start_crdb_impl(populate).await; builder.start_clickhouse().await; + builder.start_gateway().await; builder.start_dendrite(SwitchLocation::Switch0).await; builder.start_dendrite(SwitchLocation::Switch1).await; builder.start_mgd(SwitchLocation::Switch0).await; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 09f13e55c7..54f7e03eef 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -90,8 +90,15 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). 
+inventory.period_secs = 600
+# Maximum number of past collections to keep in the database
+inventory.nkeep = 3
+# Disable inventory collection altogether (for emergencies)
+inventory.disable = false
 
 [default_region_allocation_strategy]
 # we only have one sled in the test environment, so we need to use the
 # `Random` strategy, instead of `RandomWithDistinctSleds`
-type = "random"
\ No newline at end of file
+type = "random"
diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml
index c499714c31..5722b065cf 100644
--- a/nexus/types/Cargo.toml
+++ b/nexus/types/Cargo.toml
@@ -23,6 +23,7 @@ uuid.workspace = true
 
 api_identity.workspace = true
 dns-service-client.workspace = true
+gateway-client.workspace = true
 omicron-common.workspace = true
 omicron-passwords.workspace = true
 omicron-workspace-hack.workspace = true
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs
new file mode 100644
index 0000000000..112eec3a65
--- /dev/null
+++ b/nexus/types/src/inventory.rs
@@ -0,0 +1,179 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Types representing collection of hardware/software inventory
+//!
+//! This lives in nexus/types because it's used by both nexus/db-model and
+//! nexus/inventory. (It could as well just live in nexus/db-model, but
+//! nexus/inventory does not currently know about nexus/db-model and it's
+//! convenient to separate these concerns.)
+
+use chrono::DateTime;
+use chrono::Utc;
+pub use gateway_client::types::PowerState;
+pub use gateway_client::types::RotSlot;
+pub use gateway_client::types::SpType;
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
+use std::sync::Arc;
+use strum::EnumIter;
+use uuid::Uuid;
+
+/// Results of collecting hardware/software inventory from various Omicron
+/// components
+///
+/// This type is structured so that it's both easy to collect and easy to insert
+/// into the database. This means items that are represented with separate
+/// database tables (like service processors and roots of trust) are represented
+/// with separate records, even though they might come from the same source
+/// (in this case, a single MGS request).
+///
+/// We make heavy use of maps, sets, and Arcs here because some of these objects
+/// are pointed-to by many other objects in the same Collection. This approach
+/// ensures clear ownership. It also reflects how things will wind up in the
+/// database.
+///
+/// See the documentation in the database schema for more background.
+#[derive(Debug, Eq, PartialEq)]
+pub struct Collection {
+    /// unique identifier for this collection
+    pub id: Uuid,
+    /// errors encountered during collection
+    pub errors: Vec<String>,
+    /// time the collection started
+    pub time_started: DateTime<Utc>,
+    /// time the collection ended
+    pub time_done: DateTime<Utc>,
+    /// name of the agent doing the collecting (generally, this Nexus's uuid)
+    pub collector: String,
+
+    /// unique baseboard ids that were found in this collection
+    ///
+    /// In practice, these will be inserted into the `hw_baseboard_id` table.
+    pub baseboards: BTreeSet<Arc<BaseboardId>>,
+    /// unique caboose contents that were found in this collection
+    ///
+    /// In practice, these will be inserted into the `sw_caboose` table.
+    pub cabooses: BTreeSet<Arc<Caboose>>,
+
+    /// all service processors, keyed by baseboard id
+    ///
+    /// In practice, these will be inserted into the `inv_service_processor`
+    /// table.
+    pub sps: BTreeMap<Arc<BaseboardId>, ServiceProcessor>,
+    /// all roots of trust, keyed by baseboard id
+    ///
+    /// In practice, these will be inserted into the `inv_root_of_trust` table.
+    pub rots: BTreeMap<Arc<BaseboardId>, RotState>,
+    /// all caboose contents found, keyed first by the kind of caboose
+    /// (`CabooseWhich`), then the baseboard id of the sled where they were
+    /// found
+    ///
+    /// In practice, these will be inserted into the `inv_caboose` table.
+    pub cabooses_found:
+        BTreeMap<CabooseWhich, BTreeMap<Arc<BaseboardId>, CabooseFound>>,
+}
+
+impl Collection {
+    pub fn caboose_for(
+        &self,
+        which: CabooseWhich,
+        baseboard_id: &BaseboardId,
+    ) -> Option<&CabooseFound> {
+        self.cabooses_found
+            .get(&which)
+            .and_then(|by_bb| by_bb.get(baseboard_id))
+    }
+}
+
+/// A unique baseboard id found during a collection
+///
+/// Baseboard ids are the keys used to link up information from disparate
+/// sources (like a service processor and a sled agent).
+///
+/// These are normalized in the database. Each distinct baseboard id is
+/// assigned a uuid and shared across the many possible collections that
+/// reference it.
+///
+/// Usually, the part number and serial number are combined with a revision
+/// number. We do not include that here. If we ever did find a baseboard with
+/// the same part number and serial number but a new revision number, we'd want
+/// to treat that as the same baseboard as one with a different revision number.
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
+pub struct BaseboardId {
+    /// Oxide Part Number
+    pub part_number: String,
+    /// Serial number (unique for a given part number)
+    pub serial_number: String,
+}
+
+/// Caboose contents found during a collection
+///
+/// These are normalized in the database. Each distinct `Caboose` is assigned a
+/// uuid and shared across many possible collections that reference it.
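+// An illustrative lookup against a populated `Collection` (sketch only; the
+// `collection` and `baseboard_id` bindings here are assumed, not defined in
+// this module):
+//
+//     if let Some(found) =
+//         collection.caboose_for(CabooseWhich::SpSlot0, &baseboard_id)
+//     {
+//         println!("SP slot 0 caboose board: {}", found.caboose.board);
+//     }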
+#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct Caboose { + pub board: String, + pub git_commit: String, + pub name: String, + pub version: String, +} + +impl From for Caboose { + fn from(c: gateway_client::types::SpComponentCaboose) -> Self { + Caboose { + board: c.board, + git_commit: c.git_commit, + name: c.name, + version: c.version, + } + } +} + +/// Indicates that a particular `Caboose` was found (at a particular time from a +/// particular source, but these are only for debugging) +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct CabooseFound { + pub time_collected: DateTime, + pub source: String, + pub caboose: Arc, +} + +/// Describes a service processor found during collection +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct ServiceProcessor { + pub time_collected: DateTime, + pub source: String, + + pub sp_type: SpType, + pub sp_slot: u16, + + pub baseboard_revision: u32, + pub hubris_archive: String, + pub power_state: PowerState, +} + +/// Describes the root of trust state found (from a service processor) during +/// collection +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct RotState { + pub time_collected: DateTime, + pub source: String, + + pub active_slot: RotSlot, + pub persistent_boot_preference: RotSlot, + pub pending_persistent_boot_preference: Option, + pub transient_boot_preference: Option, + pub slot_a_sha3_256_digest: Option, + pub slot_b_sha3_256_digest: Option, +} + +/// Describes which caboose this is (which component, which slot) +#[derive(Clone, Copy, Debug, EnumIter, PartialEq, Eq, PartialOrd, Ord)] +pub enum CabooseWhich { + SpSlot0, + SpSlot1, + RotSlotA, + RotSlotB, +} diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 3f864b0f17..a48c4d3b00 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -32,3 +32,4 @@ pub mod external_api; pub mod identity; pub mod internal_api; +pub mod inventory; diff --git a/openapi/gateway.json b/openapi/gateway.json index 67cc2bd634..97cb7994aa 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -2385,14 +2385,14 @@ "type": "string" }, "version": { - "nullable": true, "type": "string" } }, "required": [ "board", "git_commit", - "name" + "name", + "version" ] }, "SpComponentDetails": { diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 75db82e8e1..a75c965ad8 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -2517,14 +2517,14 @@ "type": "string" }, "version": { - "nullable": true, "type": "string" } }, "required": [ "board", "git_commit", - "name" + "name", + "version" ] }, "SpComponentInfo": { diff --git a/schema/crdb/9.0.0/up01.sql b/schema/crdb/9.0.0/up01.sql new file mode 100644 index 0000000000..88439c433b --- /dev/null +++ b/schema/crdb/9.0.0/up01.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id ( + id UUID PRIMARY KEY, + part_number TEXT NOT NULL, + serial_number TEXT NOT NULL +); diff --git a/schema/crdb/9.0.0/up02.sql b/schema/crdb/9.0.0/up02.sql new file mode 100644 index 0000000000..d98f896fb0 --- /dev/null +++ b/schema/crdb/9.0.0/up02.sql @@ -0,0 +1,2 @@ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props + ON omicron.public.hw_baseboard_id (part_number, serial_number); diff --git a/schema/crdb/9.0.0/up03.sql b/schema/crdb/9.0.0/up03.sql new file mode 100644 index 0000000000..3bd036be7e --- /dev/null +++ b/schema/crdb/9.0.0/up03.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM ( + 'A0', + 'A1', 
+ 'A2' +); diff --git a/schema/crdb/9.0.0/up04.sql b/schema/crdb/9.0.0/up04.sql new file mode 100644 index 0000000000..1590ec4e88 --- /dev/null +++ b/schema/crdb/9.0.0/up04.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.hw_rot_slot AS ENUM ( + 'A', + 'B' +); diff --git a/schema/crdb/9.0.0/up05.sql b/schema/crdb/9.0.0/up05.sql new file mode 100644 index 0000000000..1042282fb0 --- /dev/null +++ b/schema/crdb/9.0.0/up05.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS omicron.public.sw_caboose ( + id UUID PRIMARY KEY, + board TEXT NOT NULL, + git_commit TEXT NOT NULL, + name TEXT NOT NULL, + -- The MGS response that provides this field indicates that it can be NULL. + -- But that's only to support old software that we no longer support. + version TEXT NOT NULL +); diff --git a/schema/crdb/9.0.0/up06.sql b/schema/crdb/9.0.0/up06.sql new file mode 100644 index 0000000000..aa614fa2fb --- /dev/null +++ b/schema/crdb/9.0.0/up06.sql @@ -0,0 +1,2 @@ +CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties + on omicron.public.sw_caboose (board, git_commit, name, version); diff --git a/schema/crdb/9.0.0/up07.sql b/schema/crdb/9.0.0/up07.sql new file mode 100644 index 0000000000..945f5a44c8 --- /dev/null +++ b/schema/crdb/9.0.0/up07.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS inv_collection ( + id UUID PRIMARY KEY, + time_started TIMESTAMPTZ NOT NULL, + time_done TIMESTAMPTZ NOT NULL, + collector TEXT NOT NULL +); diff --git a/schema/crdb/9.0.0/up08.sql b/schema/crdb/9.0.0/up08.sql new file mode 100644 index 0000000000..1abeb9203f --- /dev/null +++ b/schema/crdb/9.0.0/up08.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS inv_collection_by_time_started + ON omicron.public.inv_collection (time_started); diff --git a/schema/crdb/9.0.0/up09.sql b/schema/crdb/9.0.0/up09.sql new file mode 100644 index 0000000000..770c771775 --- /dev/null +++ b/schema/crdb/9.0.0/up09.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection_error ( + inv_collection_id UUID NOT NULL, + idx INT4 NOT NULL, + message TEXT +); diff --git a/schema/crdb/9.0.0/up10.sql b/schema/crdb/9.0.0/up10.sql new file mode 100644 index 0000000000..57665ee468 --- /dev/null +++ b/schema/crdb/9.0.0/up10.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS errors_by_collection + ON omicron.public.inv_collection_error (inv_collection_id, idx); diff --git a/schema/crdb/9.0.0/up11.sql b/schema/crdb/9.0.0/up11.sql new file mode 100644 index 0000000000..40da69af5b --- /dev/null +++ b/schema/crdb/9.0.0/up11.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.sp_type AS ENUM ( + 'sled', + 'switch', + 'power' +); diff --git a/schema/crdb/9.0.0/up12.sql b/schema/crdb/9.0.0/up12.sql new file mode 100644 index 0000000000..9089ac93ba --- /dev/null +++ b/schema/crdb/9.0.0/up12.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + baseboard_revision INT8 NOT NULL, + hubris_archive_id TEXT NOT NULL, + power_state omicron.public.hw_power_state NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); diff --git a/schema/crdb/9.0.0/up13.sql b/schema/crdb/9.0.0/up13.sql new file mode 100644 index 0000000000..241c5d9e80 --- /dev/null +++ b/schema/crdb/9.0.0/up13.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( + inv_collection_id UUID NOT NULL, + 
hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + + slot_active omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_transient omicron.public.hw_rot_slot, + slot_boot_pref_persistent omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_persistent_pending omicron.public.hw_rot_slot, + slot_a_sha3_256 TEXT, + slot_b_sha3_256 TEXT, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); diff --git a/schema/crdb/9.0.0/up14.sql b/schema/crdb/9.0.0/up14.sql new file mode 100644 index 0000000000..6725d35acf --- /dev/null +++ b/schema/crdb/9.0.0/up14.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( + 'sp_slot_0', + 'sp_slot_1', + 'rot_slot_A', + 'rot_slot_B' +); diff --git a/schema/crdb/9.0.0/up15.sql b/schema/crdb/9.0.0/up15.sql new file mode 100644 index 0000000000..48a68d167a --- /dev/null +++ b/schema/crdb/9.0.0/up15.sql @@ -0,0 +1,11 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + + which omicron.public.caboose_which NOT NULL, + sw_caboose_id UUID NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 0fdaf5083c..da842cbfeb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2514,6 +2514,222 @@ CREATE TABLE IF NOT EXISTS omicron.public.bootstore_keys ( generation INT8 NOT NULL ); +/* + * Hardware/software inventory + * + * See RFD 433 for details. Here are the highlights. + * + * Omicron periodically collects hardware/software inventory data from the + * running system and stores it into the database. Each discrete set of data is + * called a **collection**. Each collection contains lots of different kinds of + * data, so there are many tables here. For clarity, these tables are prefixed + * with: + * + * `inv_*` (examples: `inv_collection`, `inv_service_processor`) + * + * Describes the complete set of hardware and software in the system. + * Rows in these tables are immutable, but they describe mutable facts + * about hardware and software (e.g., the slot that a disk is in). When + * these facts change (e.g., a disk moves between slots), a new set of + * records is written. + * + * All rows in the `inv_*` tables point back to a particular collection. They + * represent the state observed at some particular time. Generally, if two + * observations came from two different places, they're not put into the same + * row of the same table. For example, caboose information comes from the SP, + * but it doesn't go into the `inv_service_processor` table. It goes in a + * separate `inv_caboose` table. This is debatable but it preserves a clearer + * record of exactly what information came from where, since the separate record + * has its own "source" and "time_collected". + * + * Information about service processors and roots of trust are joined with + * information reported by sled agents via the baseboard id. + * + * Hardware and software identifiers are normalized for the usual database + * design reasons. 
This means instead of storing hardware and software + * identifiers directly in the `inv_*` tables, these tables instead store + * foreign keys into one of these groups of tables, whose names are also + * prefixed for clarity: + * + * `hw_*` (example: `hw_baseboard_id`) + * + * Maps hardware-provided identifiers to UUIDs that are used as foreign + * keys in the rest of the schema. (Avoids embedding these identifiers + * into all the other tables.) + * + * `sw_*` (example: `sw_caboose`) + * + * Maps software-provided identifiers to UUIDs that are used as foreign + * keys in the rest of the schema. (Avoids embedding these identifiers + * into all the other tables.) + * + * Records in these tables are shared across potentially many collections. To + * see why this is useful, consider that `sw_caboose` records contain several + * long identifiers (e.g., git commit, SHA sums) and in practice, most of the + * time, we expect that all components of a given type will have the exact same + * cabooses. Rather than store the caboose contents in each + * `inv_service_processor` row (for example), often replicating the exact same + * contents for each SP for each collection, these rows just have pointers into + * the `sw_caboose` table that stores this data once. (This also makes it much + * easier to determine that these components _do_ have the same cabooses.) + * + * On PC systems (i.e., non-Oxide hardware), most of these tables will be empty + * because we do not support hardware inventory on these systems. + * + * Again, see RFD 433 for more on all this. + */ + +/* + * baseboard ids: this table assigns uuids to distinct part/serial values + * + * Usually we include the baseboard revision number when we reference the part + * number and serial number. The revision number is deliberately left out here. + * If we happened to see the same baseboard part number and serial number with + * different revisions, that's the same baseboard. + */ +CREATE TABLE IF NOT EXISTS omicron.public.hw_baseboard_id ( + id UUID PRIMARY KEY, + part_number TEXT NOT NULL, + serial_number TEXT NOT NULL +); +CREATE UNIQUE INDEX IF NOT EXISTS lookup_baseboard_id_by_props + ON omicron.public.hw_baseboard_id (part_number, serial_number); + +/* power states reportable by the SP */ +CREATE TYPE IF NOT EXISTS omicron.public.hw_power_state AS ENUM ( + 'A0', + 'A1', + 'A2' +); + +/* root of trust firmware slots */ +CREATE TYPE IF NOT EXISTS omicron.public.hw_rot_slot AS ENUM ( + 'A', + 'B' +); + +/* cabooses: this table assigns unique ids to distinct caboose contents */ +CREATE TABLE IF NOT EXISTS omicron.public.sw_caboose ( + id UUID PRIMARY KEY, + board TEXT NOT NULL, + git_commit TEXT NOT NULL, + name TEXT NOT NULL, + -- The MGS response that provides this field indicates that it can be NULL. + -- But that's only to support old software that we no longer support. 
+ version TEXT NOT NULL +); +CREATE UNIQUE INDEX IF NOT EXISTS caboose_properties + on omicron.public.sw_caboose (board, git_commit, name, version); + +/* Inventory Collections */ + +-- list of all collections +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection ( + id UUID PRIMARY KEY, + time_started TIMESTAMPTZ NOT NULL, + time_done TIMESTAMPTZ NOT NULL, + collector TEXT NOT NULL +); +-- Supports finding latest collection (to use) or the oldest collection (to +-- clean up) +CREATE INDEX IF NOT EXISTS inv_collection_by_time_started + ON omicron.public.inv_collection (time_started); + +-- list of errors generated during a collection +CREATE TABLE IF NOT EXISTS omicron.public.inv_collection_error ( + inv_collection_id UUID NOT NULL, + idx INT4 NOT NULL, + message TEXT +); +CREATE INDEX IF NOT EXISTS errors_by_collection + ON omicron.public.inv_collection_error (inv_collection_id, idx); + +/* what kind of slot MGS reported a device in */ +CREATE TYPE IF NOT EXISTS omicron.public.sp_type AS ENUM ( + 'sled', + 'switch', + 'power' +); + +-- observations from and about service processors +-- also see `inv_root_of_trust` +CREATE TABLE IF NOT EXISTS omicron.public.inv_service_processor ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + -- identity of this device according to MGS + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- Data from MGS "Get SP Info" API. See MGS API documentation. + baseboard_revision INT8 NOT NULL, + hubris_archive_id TEXT NOT NULL, + power_state omicron.public.hw_power_state NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); + +-- root of trust information reported by SP +-- There's usually one row here for each row in inv_service_processor, but not +-- necessarily. 
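+-- As an illustration only (not part of the schema itself), SP and RoT
+-- observations from one collection can be joined back together across
+-- `inv_service_processor` and `inv_root_of_trust` (defined just below) via the
+-- shared baseboard id and resolved to a part/serial:
+--
+--     SELECT bb.part_number, bb.serial_number, sp.power_state, rot.slot_active
+--     FROM omicron.public.inv_service_processor sp
+--     JOIN omicron.public.inv_root_of_trust rot
+--       ON rot.inv_collection_id = sp.inv_collection_id
+--      AND rot.hw_baseboard_id = sp.hw_baseboard_id
+--     JOIN omicron.public.hw_baseboard_id bb ON bb.id = sp.hw_baseboard_id
+--     WHERE sp.inv_collection_id = '<some collection id>';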
+CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + slot_active omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_transient omicron.public.hw_rot_slot, -- nullable + slot_boot_pref_persistent omicron.public.hw_rot_slot NOT NULL, + slot_boot_pref_persistent_pending omicron.public.hw_rot_slot, -- nullable + slot_a_sha3_256 TEXT, -- nullable + slot_b_sha3_256 TEXT, -- nullable + + PRIMARY KEY (inv_collection_id, hw_baseboard_id) +); + +CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( + 'sp_slot_0', + 'sp_slot_1', + 'rot_slot_A', + 'rot_slot_B' +); + +-- cabooses found +CREATE TABLE IF NOT EXISTS omicron.public.inv_caboose ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + which omicron.public.caboose_which NOT NULL, + sw_caboose_id UUID NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id, which) +); + +/*******************************************************************/ + /* * The `sled_instance` view's definition needs to be modified in a separate * transaction from the transaction that created it. @@ -2522,6 +2738,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.bootstore_keys ( COMMIT; BEGIN; +/*******************************************************************/ + /* * Metadata for the schema itself. This version number isn't great, as there's * nothing to ensure it gets bumped when it should be, but it's a start. @@ -2620,7 +2838,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '8.0.0', NULL) + ( TRUE, NOW(), NOW(), '9.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 2dfee81d02..cae1f650c9 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -38,8 +38,15 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 3 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds # seed is omitted so a new seed will be chosen with every allocation. 
-type = "random_with_distinct_sleds" \ No newline at end of file +type = "random_with_distinct_sleds" diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index aff0a8a25f..be8683be54 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -38,8 +38,15 @@ dns_external.max_concurrent_server_updates = 5 # certificates it will take _other_ Nexus instances to notice and stop serving # them (on a sunny day). external_endpoints.period_secs = 60 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 3 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. # seed is omitted so a new seed will be chosen with every allocation. -type = "random" \ No newline at end of file +type = "random" diff --git a/wicket/src/state/inventory.rs b/wicket/src/state/inventory.rs index 3a561167b1..23a0e244cf 100644 --- a/wicket/src/state/inventory.rs +++ b/wicket/src/state/inventory.rs @@ -147,7 +147,7 @@ pub enum Component { } fn version_or_unknown(caboose: Option<&SpComponentCaboose>) -> String { - caboose.and_then(|c| c.version.as_deref()).unwrap_or("UNKNOWN").to_string() + caboose.map(|c| c.version.as_str()).unwrap_or("UNKNOWN").to_string() } impl Component { diff --git a/wicket/src/ui/panes/overview.rs b/wicket/src/ui/panes/overview.rs index 7de0171e41..e8cf50bb32 100644 --- a/wicket/src/ui/panes/overview.rs +++ b/wicket/src/ui/panes/overview.rs @@ -885,7 +885,6 @@ fn append_caboose( } = caboose; let label_style = style::text_label(); let ok_style = style::text_success(); - let bad_style = style::text_failure(); spans.push( vec![ @@ -905,9 +904,5 @@ fn append_caboose( ); let mut version_spans = vec![prefix.clone(), Span::styled("Version: ", label_style)]; - if let Some(v) = version.as_ref() { - version_spans.push(Span::styled(v.clone(), ok_style)); - } else { - version_spans.push(Span::styled("Unknown", bad_style)); - } + version_spans.push(Span::styled(version, ok_style)); } diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index 18b692703c..bd8e187fe9 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -839,25 +839,21 @@ impl UpdateDriver { let message = format!( "SP board {}, version {} (git commit {})", - caboose.board, - caboose.version.as_deref().unwrap_or("unknown"), - caboose.git_commit + caboose.board, caboose.version, caboose.git_commit ); - match caboose.version.map(|v| v.parse::()) { - Some(Ok(version)) => { + match caboose.version.parse::() { + Ok(version) => { StepSuccess::new((sp_artifact, Some(version))) .with_message(message) .into() } - Some(Err(err)) => StepWarning::new( + Err(err) => StepWarning::new( (sp_artifact, None), format!( "{message} (failed to parse SP version: {err})" ), ) .into(), - None => StepWarning::new((sp_artifact, None), message) - .into(), } }, ) @@ -1769,8 +1765,7 @@ impl UpdateContext { let message = format!( "RoT slot {active_slot_name} version {} (git commit {})", - caboose.version.as_deref().unwrap_or("unknown"), - caboose.git_commit + caboose.version, caboose.git_commit ); let make_result = |active_version| RotInterrogation { @@ -1779,16 +1774,15 @@ impl UpdateContext { active_version, }; - 
match caboose.version.map(|v| v.parse::()) { - Some(Ok(version)) => StepSuccess::new(make_result(Some(version))) + match caboose.version.parse::() { + Ok(version) => StepSuccess::new(make_result(Some(version))) .with_message(message) .into(), - Some(Err(err)) => StepWarning::new( + Err(err) => StepWarning::new( make_result(None), format!("{message} (failed to parse RoT version: {err})"), ) .into(), - None => StepWarning::new(make_result(None), message).into(), } } From 9a507142fab243c0651847f6e4af1794cfe2c972 Mon Sep 17 00:00:00 2001 From: Rain Date: Wed, 1 Nov 2023 15:39:11 -0700 Subject: [PATCH 11/14] [tufaceous] make fake SP/RoT names the same as sp-sim (#4408) I'd imagine that the two are used together most often. --- tufaceous-lib/src/assemble/manifest.rs | 7 ++++--- wicketd/tests/integration_tests/updates.rs | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tufaceous-lib/src/assemble/manifest.rs b/tufaceous-lib/src/assemble/manifest.rs index 409c85808c..437b84e7b0 100644 --- a/tufaceous-lib/src/assemble/manifest.rs +++ b/tufaceous-lib/src/assemble/manifest.rs @@ -261,9 +261,10 @@ impl<'a> FakeDataAttributes<'a> { | KnownArtifactKind::Trampoline | KnownArtifactKind::ControlPlane => return make_filler_text(size), - // hubris artifacts: build a fake archive - KnownArtifactKind::GimletSp => "fake-gimlet-sp", - KnownArtifactKind::GimletRot => "fake-gimlet-rot", + // hubris artifacts: build a fake archive (SimGimletSp and + // SimGimletRot are used by sp-sim) + KnownArtifactKind::GimletSp => "SimGimletSp", + KnownArtifactKind::GimletRot => "SimGimletRot", KnownArtifactKind::PscSp => "fake-psc-sp", KnownArtifactKind::PscRot => "fake-psc-rot", KnownArtifactKind::SwitchSp => "fake-sidecar-sp", diff --git a/wicketd/tests/integration_tests/updates.rs b/wicketd/tests/integration_tests/updates.rs index a9be9d4747..aa145a0f16 100644 --- a/wicketd/tests/integration_tests/updates.rs +++ b/wicketd/tests/integration_tests/updates.rs @@ -169,7 +169,7 @@ async fn test_updates() { StepEventKind::ExecutionFailed { failed_step, .. } => { // TODO: obviously we shouldn't stop here, get past more of the // update process in this test. - assert_eq!(failed_step.info.component, UpdateComponent::Sp); + assert_eq!(failed_step.info.component, UpdateComponent::Rot); } other => { panic!("unexpected terminal event kind: {other:?}"); From 81751dd59aeb9799b1cfb273eeb43b43b713202a Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 2 Nov 2023 04:28:24 +0000 Subject: [PATCH 12/14] chore(deps): update taiki-e/install-action digest to 11dea51 (#4412) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`1286723` -> `11dea51`](https://togithub.com/taiki-e/install-action/compare/1286723...11dea51) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). 
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 102e0dca87..9d5f7444de 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -22,7 +22,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@1286723668b881a97f5cae2ef322c6b43efa610c # v2 + uses: taiki-e/install-action@11dea51b35bc2bfa42820716c6cabb14fd4c3266 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 63ca0f9d9603ee486241be1d541cdcc5101e9dc2 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:14:01 -0700 Subject: [PATCH 13/14] chore(deps): update rust crate toml to 0.8.6 (#4413) --- Cargo.lock | 64 +++++++++++++++++++-------------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 +-- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 92dd15044a..cf46fbb148 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -879,7 +879,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3f9629bc6c4388ea699781dc988c2b99766d7679b151c81990b4fa1208fafd3" dependencies = [ "serde", - "toml 0.8.0", + "toml 0.8.6", ] [[package]] @@ -1439,7 +1439,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "toml 0.8.0", + "toml 0.8.6", "tracing", "usdt", "uuid", @@ -1497,7 +1497,7 @@ dependencies = [ "tempfile", "thiserror", "tokio-rustls", - "toml 0.8.0", + "toml 0.8.6", "twox-hash", "uuid", "vergen", @@ -1767,7 +1767,7 @@ dependencies = [ "slog", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", ] [[package]] @@ -2040,7 +2040,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", "trust-dns-client", "trust-dns-proto", "trust-dns-resolver", @@ -2124,7 +2124,7 @@ dependencies = [ "serde", "serde_json", "slog", - "toml 0.7.8", + "toml 0.8.6", "uuid", ] @@ -2305,7 +2305,7 @@ dependencies = [ "russh-keys", "serde_json", "tokio", - "toml 0.7.8", + "toml 0.8.6", "trust-dns-resolver", "uuid", ] @@ -3400,7 +3400,7 @@ dependencies = [ "smf", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", "uuid", "zone", ] @@ -3518,7 +3518,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "toml 0.7.8", + "toml 0.8.6", "tufaceous-lib", "update-engine", "uuid", @@ -4143,7 +4143,7 @@ dependencies = [ "slog", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", ] [[package]] @@ -4437,7 +4437,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.7.8", + "toml 0.8.6", "usdt", "uuid", ] @@ -4911,7 +4911,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.7.8", + "toml 0.8.6", "uuid", ] @@ -4967,7 +4967,7 @@ dependencies = [ "serde", "serde_derive", "thiserror", - "toml 0.7.8", + "toml 0.8.6", ] [[package]] @@ -5001,7 +5001,7 @@ dependencies = [ "subprocess", "tokio", "tokio-postgres", - "toml 0.7.8", + "toml 0.8.6", ] [[package]] @@ -5045,7 +5045,7 @@ dependencies = [ "tokio-stream", "tokio-tungstenite 0.18.0", "tokio-util", - "toml 0.7.8", + "toml 0.8.6", "uuid", ] @@ -5156,7 +5156,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.7.8", + "toml 0.8.6", "tough", "trust-dns-resolver", "usdt", @@ -5238,7 +5238,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", "topological-sort", 
"walkdir", ] @@ -5358,7 +5358,7 @@ dependencies = [ "tofino", "tokio", "tokio-tungstenite 0.18.0", - "toml 0.7.8", + "toml 0.8.6", "usdt", "uuid", "zeroize", @@ -5812,7 +5812,7 @@ dependencies = [ "subprocess", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", "uuid", ] @@ -7902,9 +7902,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96426c9936fd7a0124915f9185ea1d20aa9445cc9821142f0a73bc9207a2e186" +checksum = "12022b835073e5b11e90a14f86838ceb1c8fb0325b72416845c487ac0fa95e80" dependencies = [ "serde", ] @@ -8431,7 +8431,7 @@ dependencies = [ "sprockets-rot", "thiserror", "tokio", - "toml 0.7.8", + "toml 0.8.6", ] [[package]] @@ -9192,21 +9192,21 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.0" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c226a7bba6d859b63c92c4b4fe69c5b6b72d0cb897dbc8e6012298e6154cb56e" +checksum = "8ff9e3abce27ee2c9a37f9ad37238c1bdd4e789c84ba37df76aa4d528f5072cc" dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.20.0", + "toml_edit 0.20.7", ] [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" dependencies = [ "serde", ] @@ -9226,9 +9226,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.20.0" +version = "0.20.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ff63e60a958cefbb518ae1fd6566af80d9d4be430a33f3723dfc47d1d411d95" +checksum = "70f427fce4d84c72b5b732388bf4a9f4531b53f74e2887e3ecb2481f68f66d81" dependencies = [ "indexmap 2.0.0", "serde", @@ -9483,7 +9483,7 @@ dependencies = [ "sha2", "slog", "tar", - "toml 0.7.8", + "toml 0.8.6", "tough", "url", "zip", @@ -10103,7 +10103,7 @@ dependencies = [ "textwrap 0.16.0", "tokio", "tokio-util", - "toml 0.7.8", + "toml 0.8.6", "toml_edit 0.19.15", "tui-tree-widget", "unicode-width", @@ -10211,7 +10211,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml 0.7.8", + "toml 0.8.6", "tough", "trust-dns-resolver", "tufaceous", diff --git a/Cargo.toml b/Cargo.toml index ab56e052e7..d0fac037e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -362,7 +362,7 @@ tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1 tokio-stream = "0.1.14" tokio-tungstenite = "0.18" tokio-util = "0.7.10" -toml = "0.7.8" +toml = "0.8.6" toml_edit = "0.19.15" topological-sort = "0.2.2" tough = { version = "0.12", features = [ "http" ] } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 72854ed29a..06ee709440 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -246,7 +246,7 @@ hyper-rustls = { version = "0.24.2" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } -toml_datetime = { version = "0.6.3", default-features = false, features = ["serde"] } +toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit = { version = "0.19.15", features = ["serde"] } [target.x86_64-unknown-illumos.build-dependencies] @@ -255,7 +255,7 @@ hyper-rustls = { version = "0.24.2" } mio = { version = "0.8.8", features = ["net", 
"os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } -toml_datetime = { version = "0.6.3", default-features = false, features = ["serde"] } +toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit = { version = "0.19.15", features = ["serde"] } ### END HAKARI SECTION From dbf01fddbfc9c9b836c173f70ed80340c9230d09 Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 2 Nov 2023 15:35:26 -0700 Subject: [PATCH 14/14] [update-engine] record more info about parent steps that reached a terminal state (#4417) Record information about any parent steps that reached a terminal state. We need this as part of the line display output to ensure correct ordering. Since we may make a bunch of copies of `CompletionInfo`, `FailureInfo` and `AbortInfo`, wrap them in an `Arc`. --- update-engine/src/buffer.rs | 227 ++++++++++++++++++++++++---------- wicket/src/ui/panes/update.rs | 53 ++++---- 2 files changed, 191 insertions(+), 89 deletions(-) diff --git a/update-engine/src/buffer.rs b/update-engine/src/buffer.rs index 2426814444..a9e04c1d12 100644 --- a/update-engine/src/buffer.rs +++ b/update-engine/src/buffer.rs @@ -7,6 +7,7 @@ use std::{ collections::{HashMap, VecDeque}, fmt, + sync::Arc, time::Duration, }; @@ -554,10 +555,14 @@ impl EventStore { info: CompletionInfo, root_event_index: RootEventIndex, ) { + let info = Arc::new(info); if let Some(value) = self.map.get_mut(&root_key) { // Completion status only applies to the root key. Nodes reachable // from this node are still marked as complete, but without status. - value.mark_completed(Some(info), root_event_index); + value.mark_completed( + CompletionReason::StepCompleted(info.clone()), + root_event_index, + ); } // Mark anything reachable from this node as completed. @@ -567,7 +572,13 @@ impl EventStore { if let EventTreeNode::Step(key) = key { if key != root_key { if let Some(value) = self.map.get_mut(&key) { - value.mark_completed(None, root_event_index); + value.mark_completed( + CompletionReason::ParentCompleted { + parent_step: root_key, + parent_info: info.clone(), + }, + root_event_index, + ); } } } @@ -580,9 +591,13 @@ impl EventStore { info: CompletionInfo, root_event_index: RootEventIndex, ) { + let info = Arc::new(info); if let Some(value) = self.map.get_mut(&root_key) { // Completion status only applies to the root key. - value.mark_completed(Some(info), root_event_index); + value.mark_completed( + CompletionReason::StepCompleted(info.clone()), + root_event_index, + ); } let mut dfs = DfsPostOrder::new( @@ -593,7 +608,27 @@ impl EventStore { if let EventTreeNode::Step(key) = key { if key != root_key { if let Some(value) = self.map.get_mut(&key) { - value.mark_completed(None, root_event_index); + // There's two kinds of nodes reachable from + // EventTreeNode::Root that could be marked as + // completed: subsequent steps within the same + // execution, and steps in child executions. 
+ if key.execution_id == root_key.execution_id { + value.mark_completed( + CompletionReason::SubsequentStarted { + later_step: root_key, + root_total_elapsed: info.root_total_elapsed, + }, + root_event_index, + ); + } else { + value.mark_completed( + CompletionReason::ParentCompleted { + parent_step: root_key, + parent_info: info.clone(), + }, + root_event_index, + ); + } } } } @@ -606,7 +641,8 @@ impl EventStore { info: FailureInfo, root_event_index: RootEventIndex, ) { - self.mark_step_failed_impl(root_key, root_event_index, |value, kind| { + let info = Arc::new(info); + self.mark_step_failed_impl(root_key, |value, kind| { match kind { MarkStepFailedImplKind::Root => { value.mark_failed( @@ -616,11 +652,14 @@ impl EventStore { } MarkStepFailedImplKind::Descendant => { value.mark_failed( - FailureReason::ParentFailed { parent_step: root_key }, + FailureReason::ParentFailed { + parent_step: root_key, + parent_info: info.clone(), + }, root_event_index, ); } - MarkStepFailedImplKind::Future => { + MarkStepFailedImplKind::Subsequent => { value.mark_will_not_be_run( WillNotBeRunReason::PreviousStepFailed { step: root_key, @@ -628,6 +667,15 @@ impl EventStore { root_event_index, ); } + MarkStepFailedImplKind::PreviousCompleted => { + value.mark_completed( + CompletionReason::SubsequentStarted { + later_step: root_key, + root_total_elapsed: info.root_total_elapsed, + }, + root_event_index, + ); + } }; }) } @@ -638,42 +686,48 @@ impl EventStore { info: AbortInfo, root_event_index: RootEventIndex, ) { - self.mark_step_failed_impl( - root_key, - root_event_index, - |value, kind| { - match kind { - MarkStepFailedImplKind::Root => { - value.mark_aborted( - AbortReason::StepAborted(info.clone()), - root_event_index, - ); - } - MarkStepFailedImplKind::Descendant => { - value.mark_aborted( - AbortReason::ParentAborted { - parent_step: root_key, - }, - root_event_index, - ); - } - MarkStepFailedImplKind::Future => { - value.mark_will_not_be_run( - WillNotBeRunReason::PreviousStepAborted { - step: root_key, - }, - root_event_index, - ); - } - }; - }, - ); + let info = Arc::new(info); + self.mark_step_failed_impl(root_key, |value, kind| { + match kind { + MarkStepFailedImplKind::Root => { + value.mark_aborted( + AbortReason::StepAborted(info.clone()), + root_event_index, + ); + } + MarkStepFailedImplKind::Descendant => { + value.mark_aborted( + AbortReason::ParentAborted { + parent_step: root_key, + parent_info: info.clone(), + }, + root_event_index, + ); + } + MarkStepFailedImplKind::Subsequent => { + value.mark_will_not_be_run( + WillNotBeRunReason::PreviousStepAborted { + step: root_key, + }, + root_event_index, + ); + } + MarkStepFailedImplKind::PreviousCompleted => { + value.mark_completed( + CompletionReason::SubsequentStarted { + later_step: root_key, + root_total_elapsed: info.root_total_elapsed, + }, + root_event_index, + ); + } + }; + }); } fn mark_step_failed_impl( &mut self, root_key: StepKey, - root_event_index: RootEventIndex, mut cb: impl FnMut(&mut EventBufferStepData, MarkStepFailedImplKind), ) { if let Some(value) = self.map.get_mut(&root_key) { @@ -686,7 +740,7 @@ impl EventStore { for index in 0..root_key.index { let key = StepKey { execution_id: root_key.execution_id, index }; if let Some(value) = self.map.get_mut(&key) { - value.mark_completed(None, root_event_index); + (cb)(value, MarkStepFailedImplKind::PreviousCompleted); } } @@ -713,7 +767,7 @@ impl EventStore { while let Some(key) = dfs.next(&self.event_tree) { if let EventTreeNode::Step(key) = key { if let Some(value) = 
self.map.get_mut(&key) { - (cb)(value, MarkStepFailedImplKind::Future); + (cb)(value, MarkStepFailedImplKind::Subsequent); } } } @@ -723,7 +777,8 @@ impl EventStore { enum MarkStepFailedImplKind { Root, Descendant, - Future, + Subsequent, + PreviousCompleted, } /// Actions taken by a recursion step. @@ -965,12 +1020,12 @@ impl EventBufferStepData { fn mark_completed( &mut self, - status: Option, + reason: CompletionReason, root_event_index: RootEventIndex, ) { match self.step_status { StepStatus::NotStarted | StepStatus::Running { .. } => { - self.step_status = StepStatus::Completed { info: status }; + self.step_status = StepStatus::Completed { reason }; self.update_root_event_index(root_event_index); } StepStatus::Completed { .. } @@ -1011,7 +1066,7 @@ impl EventBufferStepData { match &mut self.step_status { StepStatus::NotStarted => { match reason { - AbortReason::ParentAborted { parent_step } => { + AbortReason::ParentAborted { parent_step, .. } => { // A parent was aborted and this step hasn't been // started. self.step_status = StepStatus::WillNotBeRun { @@ -1116,10 +1171,8 @@ pub enum StepStatus { /// The step has completed execution. Completed { - /// Completion information. - /// - /// This might be unavailable in some cases. - info: Option, + /// The reason for completion. + reason: CompletionReason, }, /// The step has failed. @@ -1179,6 +1232,43 @@ impl StepStatus { } } +#[derive(Clone, Debug)] +pub enum CompletionReason { + /// This step completed. + StepCompleted(Arc), + /// A later step within the same execution was started and we don't have + /// information regarding this step. + SubsequentStarted { + /// The later step that was started. + later_step: StepKey, + + /// The root total elapsed time at the moment the later step was started. + root_total_elapsed: Duration, + }, + /// A parent step within the same execution completed and we don't have + /// information regarding this step. + ParentCompleted { + /// The parent step that completed. + parent_step: StepKey, + + /// Completion info associated with the parent step. + parent_info: Arc, + }, +} + +impl CompletionReason { + /// Returns the [`CompletionInfo`] for this step, if this is the + /// [`Self::StepCompleted`] variant. + pub fn step_completed_info(&self) -> Option<&Arc> { + match self { + Self::StepCompleted(info) => Some(info), + Self::SubsequentStarted { .. } | Self::ParentCompleted { .. } => { + None + } + } + } +} + #[derive(Clone, Debug)] pub struct CompletionInfo { pub attempt: usize, @@ -1192,17 +1282,21 @@ pub struct CompletionInfo { #[derive(Clone, Debug)] pub enum FailureReason { /// This step failed. - StepFailed(FailureInfo), + StepFailed(Arc), /// A parent step failed. ParentFailed { /// The parent step that failed. parent_step: StepKey, + + /// Failure info associated with the parent step. + parent_info: Arc, }, } impl FailureReason { - /// Returns the [`FailureInfo`] if present. - pub fn info(&self) -> Option<&FailureInfo> { + /// Returns the [`FailureInfo`] for this step, if this is the + /// [`Self::StepFailed`] variant. + pub fn step_failed_info(&self) -> Option<&Arc> { match self { Self::StepFailed(info) => Some(info), Self::ParentFailed { .. } => None, @@ -1224,17 +1318,21 @@ pub struct FailureInfo { #[derive(Clone, Debug)] pub enum AbortReason { /// This step was aborted. - StepAborted(AbortInfo), + StepAborted(Arc), /// A parent step was aborted. ParentAborted { /// The parent step key that was aborted. parent_step: StepKey, + + /// Abort info associated with the parent step. 
+ parent_info: Arc, }, } impl AbortReason { - /// Returns the [`AbortInfo`] if present. - pub fn info(&self) -> Option<&AbortInfo> { + /// Returns the [`AbortInfo`] for this step, if this is the + /// [`Self::StepAborted`] variant. + pub fn step_aborted_info(&self) -> Option<&Arc> { match self { Self::StepAborted(info) => Some(info), Self::ParentAborted { .. } => None, @@ -1311,14 +1409,15 @@ impl ExecutionSummary { StepStatus::Running { .. } => { execution_status = ExecutionStatus::Running { step_key }; } - StepStatus::Completed { info } => { - let (root_total_elapsed, leaf_total_elapsed) = match info { - Some(info) => ( - Some(info.root_total_elapsed), - Some(info.leaf_total_elapsed), - ), - None => (None, None), - }; + StepStatus::Completed { reason } => { + let (root_total_elapsed, leaf_total_elapsed) = + match reason.step_completed_info() { + Some(info) => ( + Some(info.root_total_elapsed), + Some(info.leaf_total_elapsed), + ), + None => (None, None), + }; let terminal_status = ExecutionTerminalInfo { kind: TerminalKind::Completed, @@ -1331,7 +1430,7 @@ impl ExecutionSummary { } StepStatus::Failed { reason } => { let (root_total_elapsed, leaf_total_elapsed) = - match reason.info() { + match reason.step_failed_info() { Some(info) => ( Some(info.root_total_elapsed), Some(info.leaf_total_elapsed), @@ -1350,7 +1449,7 @@ impl ExecutionSummary { } StepStatus::Aborted { reason, .. } => { let (root_total_elapsed, leaf_total_elapsed) = - match reason.info() { + match reason.step_aborted_info() { Some(info) => ( Some(info.root_total_elapsed), Some(info.leaf_total_elapsed), diff --git a/wicket/src/ui/panes/update.rs b/wicket/src/ui/panes/update.rs index 2819b3ddda..d76c6c3b49 100644 --- a/wicket/src/ui/panes/update.rs +++ b/wicket/src/ui/panes/update.rs @@ -29,8 +29,8 @@ use ratatui::widgets::{ use slog::{info, o, Logger}; use tui_tree_widget::{Tree, TreeItem, TreeState}; use update_engine::{ - AbortReason, ExecutionStatus, FailureReason, StepKey, TerminalKind, - WillNotBeRunReason, + AbortReason, CompletionReason, ExecutionStatus, FailureReason, StepKey, + TerminalKind, WillNotBeRunReason, }; use wicket_common::update_events::{ EventBuffer, EventReport, ProgressEvent, StepOutcome, StepStatus, @@ -282,7 +282,9 @@ impl UpdatePane { // TODO: show previous attempts } - StepStatus::Completed { info: Some(info) } => { + StepStatus::Completed { + reason: CompletionReason::StepCompleted(info), + } => { let mut spans = vec![Span::styled("Status: ", style::selected())]; @@ -333,9 +335,9 @@ impl UpdatePane { push_text_lines(&message, prefix, &mut body.lines); } } - StepStatus::Completed { info: None } => { - // No information is available, so all we can do is say that - // this step is completed. + StepStatus::Completed { reason: _ } => { + // No information about this step is available, so all we can do + // is say that this step is completed. body.lines.push(Line::from(vec![ Span::styled("Status: ", style::selected()), Span::styled("Completed", style::successful_update_bold()), @@ -383,7 +385,7 @@ impl UpdatePane { } } StepStatus::Failed { - reason: FailureReason::ParentFailed { parent_step }, + reason: FailureReason::ParentFailed { parent_step, .. }, } => { let mut spans = vec![ Span::styled("Status: ", style::selected()), @@ -442,7 +444,7 @@ impl UpdatePane { } } StepStatus::Aborted { - reason: AbortReason::ParentAborted { parent_step }, + reason: AbortReason::ParentAborted { parent_step, .. 
}, last_progress, } => { let mut spans = vec![ @@ -2017,24 +2019,25 @@ impl ComponentUpdateListState { } style::selected() } - StepStatus::Completed { info } => { - let (character, style) = if let Some(info) = info { - match info.outcome { - StepOutcome::Success { .. } => { - ('✔', style::successful_update()) - } - StepOutcome::Warning { .. } => { - ('⚠', style::warning_update()) - } - StepOutcome::Skipped { .. } => { - ('*', style::successful_update()) + StepStatus::Completed { reason } => { + let (character, style) = + if let Some(info) = reason.step_completed_info() { + match info.outcome { + StepOutcome::Success { .. } => { + ('✔', style::successful_update()) + } + StepOutcome::Warning { .. } => { + ('⚠', style::warning_update()) + } + StepOutcome::Skipped { .. } => { + ('*', style::successful_update()) + } } - } - } else { - // No information available for this step -- just mark - // it successful. - ('✔', style::successful_update()) - }; + } else { + // No information available for this step -- just mark + // it successful. + ('✔', style::successful_update()) + }; item_spans.push(Span::styled( format!("{:>5} ", character), style,
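
Note on PATCH 14/14 above: the commit wraps CompletionInfo, FailureInfo, and AbortInfo in an Arc so that every descendant or subsequent step marked terminal can point at the same info without copying it. The standalone sketch below illustrates that sharing pattern only; StepKey, CompletionInfo, CompletionReason, and main here are simplified stand-ins chosen for illustration, not the actual update-engine API.

// Sketch of the Arc-sharing pattern: one allocation of completion info,
// shared by the root step and every step implied complete by it.
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct StepKey {
    execution_id: u32,
    index: usize,
}

#[derive(Debug)]
struct CompletionInfo {
    root_total_elapsed: Duration,
}

#[derive(Debug)]
enum CompletionReason {
    // This step itself reported completion.
    StepCompleted(Arc<CompletionInfo>),
    // A parent step completed; this step is only known to be done indirectly.
    ParentCompleted { parent_step: StepKey, parent_info: Arc<CompletionInfo> },
}

fn main() {
    let root = StepKey { execution_id: 0, index: 2 };
    let info =
        Arc::new(CompletionInfo { root_total_elapsed: Duration::from_secs(42) });

    let mut statuses: HashMap<StepKey, CompletionReason> = HashMap::new();
    // The root step keeps the full completion info.
    statuses.insert(root, CompletionReason::StepCompleted(info.clone()));

    // Steps of a nested (child) execution share the same Arc, so marking many
    // of them completed costs one pointer clone each rather than a deep copy.
    for index in 0..2 {
        let child = StepKey { execution_id: 1, index };
        statuses.insert(
            child,
            CompletionReason::ParentCompleted {
                parent_step: root,
                parent_info: info.clone(),
            },
        );
    }

    // A line-oriented display can order output by the elapsed time carried in
    // the shared info, even for steps that never reported on their own.
    for (key, reason) in &statuses {
        match reason {
            CompletionReason::StepCompleted(info) => {
                println!("{key:?}: completed after {:?}", info.root_total_elapsed)
            }
            CompletionReason::ParentCompleted { parent_step, parent_info } => {
                println!(
                    "{key:?}: implied complete by {parent_step:?} at {:?}",
                    parent_info.root_total_elapsed
                )
            }
        }
    }
}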