diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs
index d971545087..fc9bb9cc2f 100644
--- a/dev-tools/omdb/src/bin/omdb/db.rs
+++ b/dev-tools/omdb/src/bin/omdb/db.rs
@@ -50,6 +50,7 @@ use indicatif::ProgressDrawTarget;
 use indicatif::ProgressStyle;
 use internal_dns_types::names::ServiceName;
 use ipnetwork::IpNetwork;
+use itertools::Itertools;
 use nexus_config::PostgresConfigWithUrl;
 use nexus_db_model::Dataset;
 use nexus_db_model::Disk;
@@ -5189,6 +5190,7 @@ async fn cmd_db_inventory_collections_show(
     let nerrors = inv_collection_print_errors(&collection).await?;
     inv_collection_print_devices(&collection, &long_string_formatter).await?;
     inv_collection_print_sleds(&collection);
+    inv_collection_print_keeper_membership(&collection);
 
     if nerrors > 0 {
         eprintln!(
@@ -5514,6 +5516,24 @@ fn inv_collection_print_sleds(collection: &Collection) {
     }
 }
 
+fn inv_collection_print_keeper_membership(collection: &Collection) {
+    println!("\nKEEPER MEMBERSHIP");
+    for k in &collection.clickhouse_keeper_cluster_membership {
+        println!("\n queried keeper: {}", k.queried_keeper);
+        println!(
+            " leader_committed_log_index: {}",
+            k.leader_committed_log_index
+        );
+
+        let s = k.raft_config.iter().join(", ");
+        println!(" raft config: {s}");
+    }
+    if collection.clickhouse_keeper_cluster_membership.is_empty() {
+        println!("No membership retrieved.");
+    }
+    println!("");
+}
+
 #[derive(Debug)]
 struct LongStringFormatter {
     show_long_strings: bool,
diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs
index 77d8056709..d32d00ed8f 100644
--- a/nexus/inventory/src/collector.rs
+++ b/nexus/inventory/src/collector.rs
@@ -359,10 +359,16 @@ impl<'a> Collector<'a> {
     /// Collect inventory from about keepers from all `ClickhouseAdminKeeper`
     /// clients
     async fn collect_all_keepers(&mut self) {
+        debug!(self.log, "begin collecting all keepers";
+            "nkeeper_admin_clients" => self.keeper_admin_clients.len());
+
         for client in &self.keeper_admin_clients {
             Self::collect_one_keeper(&client, &self.log, &mut self.in_progress)
                 .await;
         }
+
+        debug!(self.log, "end collecting all keepers";
+            "nkeeper_admin_clients" => self.keeper_admin_clients.len());
     }
 
     /// Collect inventory about one keeper from one `ClickhouseAdminKeeper`
@@ -384,9 +390,14 @@ impl<'a> Collector<'a> {
                 in_progress.found_error(InventoryError::from(error));
             }
             Ok(membership) => {
-                in_progress.found_clickhouse_keeper_cluster_membership(
-                    membership.into_inner(),
+                let membership = membership.into_inner();
+                debug!(log, "found keeper membership";
+                    "keeper_admin_url" => client.baseurl(),
+                    "leader_committed_log_index" =>
+                        membership.leader_committed_log_index
                 );
+                in_progress
+                    .found_clickhouse_keeper_cluster_membership(membership);
             }
         }
     }
diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs
index c4271c58d8..0b361b2014 100644
--- a/nexus/src/app/background/tasks/inventory_collection.rs
+++ b/nexus/src/app/background/tasks/inventory_collection.rs
@@ -9,6 +9,7 @@ use anyhow::ensure;
 use anyhow::Context;
 use futures::future::BoxFuture;
 use futures::FutureExt;
+use internal_dns_resolver::ResolveError;
 use internal_dns_types::names::ServiceName;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
@@ -132,19 +133,55 @@ async fn inventory_activate(
         })
         .collect::<Vec<_>>();
 
-    // Find clickhouse-admin-keeper clients
-    let keeper_admin_clients = resolver
-        .lookup_socket_v6(ServiceName::ClickhouseAdminKeeper)
+    // Find clickhouse-admin-keeper servers if there are any.
+    let keeper_admin_clients = match resolver
+        .lookup_all_socket_v6(ServiceName::ClickhouseAdminKeeper)
         .await
-        .context("looking up ClickhouseAdminKeeper addresses")
-        .into_iter()
-        .map(|sockaddr| {
-            let url = format!("http://{}", sockaddr);
-            let log =
-                opctx.log.new(o!("clickhouse_admin_keeper_url" => url.clone()));
-            clickhouse_admin_keeper_client::Client::new(&url, log)
-        })
-        .collect::<Vec<_>>();
+    {
+        Ok(sockaddrs) => sockaddrs
+            .into_iter()
+            .map(|sockaddr| {
+                let url = format!("http://{}", sockaddr);
+                let log = opctx
+                    .log
+                    .new(o!("clickhouse_admin_keeper_url" => url.clone()));
+                clickhouse_admin_keeper_client::Client::new(&url, log)
+            })
+            .collect::<Vec<_>>(),
+        Err(err) => match err {
+            // When DNS resolution fails because no clickhouse-keeper-admin
+            // servers have been found, we allow this and move on. This is
+            // because multi-node clickhouse may not be enabled, and therefore
+            // there will not be any clickhouse-keeper-admin servers to find.
+            //
+            // In the long term, we expect multi-node clickhouse to always
+            // be enabled, and therefore we may want to bubble up any error
+            // we find, including `NotFound`. However, since we must enable
+            // multi-node clickhouse via reconfigurator, and not RSS, we may
+            // find ourselves with a small gap early on where the names don't
+            // yet exist. This would block the rest of inventory collection if
+            // we early return. We may be able to resolve this problem at rack
+            // handoff time, but it's worth considering whether we want to error
+            // here in case a gap remains.
+            //
+            // See https://github.com/oxidecomputer/omicron/issues/7005
+            ResolveError::NotFound(_) | ResolveError::NotFoundByString(_) => {
+                vec![]
+            }
+            ResolveError::Resolve(hickory_err)
+                if matches!(
+                    hickory_err.kind(),
+                    hickory_resolver::error::ResolveErrorKind::NoRecordsFound { .. }
+                ) =>
+            {
+                vec![]
+            }
+            _ => {
+                return Err(err)
+                    .context("looking up clickhouse-admin-keeper addresses");
+            }
+        },
+    };
 
     // Create an enumerator to find sled agents.
     let sled_enum = DbSledAgentEnumerator { opctx, datastore };
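
Note: as a rough illustration only (the keeper ID, log index, and raft config values below are made up, not captured from a running system), the new inv_collection_print_keeper_membership section added to `omdb db inventory collections show` above would render each reported keeper along these lines, following the format strings in the patch:

KEEPER MEMBERSHIP

 queried keeper: 1
 leader_committed_log_index: 1234
 raft config: 1, 2, 3

When the collection contains no keeper membership records (for example, when multi-node clickhouse is not enabled), the section prints "No membership retrieved." instead.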