From 1cb31d268c5ed1fcc812775516005c3ba213ef7e Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Mon, 3 Jun 2024 10:58:23 -0400
Subject: [PATCH 01/19] add skeleton of RPW to collect CRDB node IDs

---
 .../app/background/crdb_node_id_collector.rs | 108 ++++++++++++++++++
 nexus/src/app/background/init.rs             |  20 ++++
 nexus/src/app/background/mod.rs              |   1 +
 3 files changed, 129 insertions(+)
 create mode 100644 nexus/src/app/background/crdb_node_id_collector.rs

diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs
new file mode 100644
index 0000000000..08ec7d1da0
--- /dev/null
+++ b/nexus/src/app/background/crdb_node_id_collector.rs
@@ -0,0 +1,108 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Background task for collecting Cockroach node IDs for running CRDB zones
+//!
+//! Cockroach assigns a node ID when the node is initially started and joins the
+//! cluster. The node IDs are 1-up counters that are never reused. Cluster
+//! management operations (e.g., decommissioning nodes) are keyed off of the
+//! node ID. However, because a node ID isn't assigned until the node has
+//! started and joined the cluster, there is a gap between when Omicron
+//! creates a CRDB zone (and picks an Omicron zone ID for it) and when that zone
+//! gets a CRDB node ID. This RPW exists to backfill the Omicron
+//! zone ID <-> CRDB node ID mapping for Cockroach zones.
+//!
+//! This isn't foolproof. If a Cockroach node fails to start, it won't have a
+//! node ID and therefore this RPW won't be able to make an assignment. If a
+//! Cockroach node succeeds in starting and gets a node ID but then fails in an
+//! unrecoverable way before this RPW has collected its node ID, that will also
+//! result in a missing assignment. Consumers of the Omicron zone ID <-> CRDB
+//! node ID mapping don't have a way of distinguishing these two failure modes
+//! from this RPW alone, and will need to gather other information (e.g., asking
+//! CRDB for the status of all nodes and looking for orphans, perhaps) to determine
+//! whether a zone without a known node ID ever existed.
+
+use super::common::BackgroundTask;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use nexus_auth::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::deployment::blueprint_zone_type;
+use nexus_types::deployment::Blueprint;
+use nexus_types::deployment::BlueprintTarget;
+use nexus_types::deployment::BlueprintZoneFilter;
+use nexus_types::deployment::BlueprintZoneType;
+use serde_json::json;
+use std::sync::Arc;
+use tokio::sync::watch;
+
+pub struct CockroachNodeIdCollector {
+    datastore: Arc<DataStore>,
+    rx_blueprint: watch::Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
+}
+
+impl CockroachNodeIdCollector {
+    pub fn new(
+        datastore: Arc<DataStore>,
+        rx_blueprint: watch::Receiver<
+            Option<Arc<(BlueprintTarget, Blueprint)>>,
+        >,
+    ) -> Self {
+        Self { datastore, rx_blueprint }
+    }
+
+    /// Implementation for `BackgroundTask::activate` for
+    /// `CockroachNodeIdCollector`, added here to produce better compile errors.
+    ///
+    /// The presence of `boxed()` in `BackgroundTask::activate` has caused some
+    /// confusion with compilation errors in the past. So separate this method
+    /// out.
+    async fn activate_impl<'a>(
+        &mut self,
+        opctx: &OpContext,
+    ) -> serde_json::Value {
+        // Get the latest blueprint, cloning to prevent holding a read lock
+        // on the watch.
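+        // (`borrow_and_update`, unlike `borrow`, also marks this value as
+        // seen, so the watch channel won't report it as a new change later.)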
+ let update = self.rx_blueprint.borrow_and_update().clone(); + + let Some((_bp_target, blueprint)) = update.as_deref() else { + warn!( + &opctx.log, "Blueprint execution: skipped"; + "reason" => "no blueprint", + ); + return json!({"error": "no blueprint" }); + }; + + // We can only actively collect from zones that should be running; if + // there are CRDB zones in other states that still need their node ID + // collected, we have to wait until they're running. + for (_sled_id, zone) in + blueprint.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + { + let BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { address, .. }, + ) = &zone.zone_type + else { + continue; + }; + + // TODO 1 continue if we already know the node ID for this zone + _ = &self.datastore; + + // TODO 2 collect and insert the node ID for this zone + _ = address; + } + + json!({}) + } +} + +impl BackgroundTask for CockroachNodeIdCollector { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + self.activate_impl(opctx).boxed() + } +} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index a87c53860d..ae9387c1e8 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -9,6 +9,7 @@ use super::bfd; use super::blueprint_execution; use super::blueprint_load; use super::common; +use super::crdb_node_id_collector; use super::dns_config; use super::dns_propagation; use super::dns_servers; @@ -86,6 +87,9 @@ pub struct BackgroundTasks { /// task handle for blueprint execution background task pub task_blueprint_executor: common::TaskHandle, + /// task handle for collecting CockroachDB node IDs + pub task_crdb_node_id_collector: common::TaskHandle, + /// task handle for the service zone nat tracker pub task_service_zone_nat_tracker: common::TaskHandle, @@ -263,6 +267,21 @@ impl BackgroundTasks { config.blueprints.period_secs_execute, Box::new(blueprint_executor), opctx.child(BTreeMap::new()), + vec![Box::new(rx_blueprint.clone())], + ); + + // Background task: CockroachDB node ID collector + let crdb_node_id_collector = + crdb_node_id_collector::CockroachNodeIdCollector::new( + datastore.clone(), + rx_blueprint.clone(), + ); + let task_crdb_node_id_collector = driver.register( + String::from("crdb_node_id_collector"), + String::from("Collects node IDs of running CockroachDB zones"), + config.blueprints.period_secs_execute, // TODO-john FIXME + Box::new(crdb_node_id_collector), + opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint)], ); @@ -438,6 +457,7 @@ impl BackgroundTasks { task_phantom_disks, task_blueprint_loader, task_blueprint_executor, + task_crdb_node_id_collector, task_service_zone_nat_tracker, task_switch_port_settings_manager, task_v2p_manager, diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 6de9e6f4d3..60f911e98d 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -27,5 +27,6 @@ mod status; mod sync_service_zone_nat; mod sync_switch_configuration; mod v2p_mappings; +mod crdb_node_id_collector; pub use init::BackgroundTasks; From 710e50cce14b38c704a5f16cba692f6b88e451f5 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 4 Jun 2024 11:27:41 -0400 Subject: [PATCH 02/19] add table and queries for mapping crdb zone <-> node IDs --- nexus/db-model/src/cockroachdb_node_id.rs | 16 ++ nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 7 + .../src/db/datastore/cockroachdb_node_id.rs | 155 
++++++++++++++++++
 nexus/db-queries/src/db/datastore/mod.rs      |   1 +
 .../src/app/background/blueprint_execution.rs |   7 +-
 schema/crdb/dbinit.sql                        |  18 ++
 7 files changed, 203 insertions(+), 3 deletions(-)
 create mode 100644 nexus/db-model/src/cockroachdb_node_id.rs
 create mode 100644 nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs

diff --git a/nexus/db-model/src/cockroachdb_node_id.rs b/nexus/db-model/src/cockroachdb_node_id.rs
new file mode 100644
index 0000000000..1179b36f0b
--- /dev/null
+++ b/nexus/db-model/src/cockroachdb_node_id.rs
@@ -0,0 +1,16 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Types for mapping CockroachDB Omicron zone IDs to internal-to-CRDB node IDs
+
+use crate::schema::cockroachdb_zone_id_to_node_id;
+use crate::typed_uuid::DbTypedUuid;
+use omicron_uuid_kinds::OmicronZoneKind;
+
+#[derive(Queryable, Insertable, Clone, Debug, Selectable)]
+#[diesel(table_name = cockroachdb_zone_id_to_node_id)]
+pub struct CockroachZoneIdToNodeId {
+    pub omicron_zone_id: DbTypedUuid<OmicronZoneKind>,
+    pub crdb_node_id: String,
+}
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index 51fd0f6c9e..c38583f2e2 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -17,6 +17,7 @@ mod block_size;
 mod bootstore;
 mod bytecount;
 mod certificate;
+mod cockroachdb_node_id;
 mod collection;
 mod console_session;
 mod dataset;
@@ -125,6 +126,7 @@ pub use block_size::*;
 pub use bootstore::*;
 pub use bytecount::*;
 pub use certificate::*;
+pub use cockroachdb_node_id::*;
 pub use collection::*;
 pub use console_session::*;
 pub use dataset::*;
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index 8a00ce6e37..eb9ffe6b6f 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -1615,6 +1615,13 @@ table! {
     }
 }
 
+table! {
+    cockroachdb_zone_id_to_node_id (omicron_zone_id) {
+        omicron_zone_id -> Uuid,
+        crdb_node_id -> Text,
+    }
+}
+
 table! {
     bootstore_keys (key, generation) {
         key -> Text,
diff --git a/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs b/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs
new file mode 100644
index 0000000000..d1feaa988b
--- /dev/null
+++ b/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs
@@ -0,0 +1,155 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Datastore methods involving CockroachDB node IDs.
+
+use super::DataStore;
+use crate::db;
+use crate::db::error::public_error_from_diesel;
+use crate::db::error::ErrorHandler;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use diesel::ExpressionMethods;
+use diesel::OptionalExtension;
+use diesel::QueryDsl;
+use nexus_auth::authz;
+use nexus_auth::context::OpContext;
+use nexus_db_model::to_db_typed_uuid;
+use nexus_db_model::CockroachZoneIdToNodeId;
+use omicron_common::api::external::Error;
+use omicron_common::api::external::LookupResult;
+use omicron_uuid_kinds::OmicronZoneUuid;
+
+impl DataStore {
+    /// Get the CockroachDB node ID of a given Omicron zone ID.
+    ///
+    /// Returns `Ok(None)` if no node ID is known. This can occur if the
+    /// requested zone ID isn't a CockroachDB zone, or if it is a CockroachDB
+    /// zone but the background task responsible for collecting its node ID has
+    /// not yet successfully done so.
+    pub async fn cockroachdb_node_id(
+        &self,
+        opctx: &OpContext,
+        omicron_zone_id: OmicronZoneUuid,
+    ) -> LookupResult<Option<String>> {
+        use db::schema::cockroachdb_zone_id_to_node_id::dsl;
+
+        opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        dsl::cockroachdb_zone_id_to_node_id
+            .select(dsl::crdb_node_id)
+            .filter(dsl::omicron_zone_id.eq(to_db_typed_uuid(omicron_zone_id)))
+            .first_async(&*conn)
+            .await
+            .optional()
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Record the CockroachDB node ID of a given Omicron zone ID.
+    ///
+    /// This function must only be called with valid CockroachDB zone IDs. It
+    /// will return an error if the given `omicron_zone_id` already has an entry
+    /// in this table that does not exactly match `crdb_node_id`.
+    pub async fn set_cockroachdb_node_id(
+        &self,
+        opctx: &OpContext,
+        omicron_zone_id: OmicronZoneUuid,
+        crdb_node_id: String,
+    ) -> Result<(), Error> {
+        use db::schema::cockroachdb_zone_id_to_node_id::dsl;
+
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        let row = CockroachZoneIdToNodeId {
+            omicron_zone_id: omicron_zone_id.into(),
+            crdb_node_id,
+        };
+
+        let _nrows = diesel::insert_into(dsl::cockroachdb_zone_id_to_node_id)
+            .values(row)
+            .on_conflict((dsl::omicron_zone_id, dsl::crdb_node_id))
+            .do_nothing()
+            .execute_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::db::datastore::test_utils::datastore_test;
+    use nexus_test_utils::db::test_setup_database;
+    use omicron_test_utils::dev;
+
+    #[tokio::test]
+    async fn test_cockroachdb_node_id() {
+        let logctx =
+            dev::test_setup_log("test_cockroachdb_node_id");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        // Make up a CRDB zone id.
+        let crdb_zone_id = OmicronZoneUuid::new_v4();
+
+        // We shouldn't have a mapping for it yet.
+        let node_id = datastore
+            .cockroachdb_node_id(&opctx, crdb_zone_id)
+            .await
+            .expect("looked up node ID");
+        assert_eq!(node_id, None);
+
+        // We can assign a mapping.
+        let fake_node_id = "test-node";
+        datastore
+            .set_cockroachdb_node_id(
+                &opctx,
+                crdb_zone_id,
+                fake_node_id.to_string(),
+            )
+            .await
+            .expect("set node ID");
+
+        // We can look up the mapping we created.
+        let node_id = datastore
+            .cockroachdb_node_id(&opctx, crdb_zone_id)
+            .await
+            .expect("looked up node ID");
+        assert_eq!(node_id.as_deref(), Some(fake_node_id));
+
+        // We can't assign a different node ID to this same zone.
+        let different_node_id = "test-node-2";
+        datastore
+            .set_cockroachdb_node_id(
+                &opctx,
+                crdb_zone_id,
+                different_node_id.to_string(),
+            )
+            .await
+            .expect_err("failed to set node ID");
+
+        // We can reassign the same node ID (i.e., setting is idempotent).
+        datastore
+            .set_cockroachdb_node_id(
+                &opctx,
+                crdb_zone_id,
+                fake_node_id.to_string(),
+            )
+            .await
+            .expect("set node ID is idempotent");
+
+        // The mapping should not have changed.
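+        // (the lookup below must still return the original node ID, not a
+        // duplicate or an overwritten value)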
+ let node_id = datastore + .cockroachdb_node_id(&opctx, crdb_zone_id) + .await + .expect("looked up node ID"); + assert_eq!(node_id.as_deref(), Some(fake_node_id)); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 9ec3575860..436e9c2d09 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -54,6 +54,7 @@ mod bfd; mod bgp; mod bootstore; mod certificate; +mod cockroachdb_node_id; mod cockroachdb_settings; mod console_session; mod dataset; diff --git a/nexus/src/app/background/blueprint_execution.rs b/nexus/src/app/background/blueprint_execution.rs index 69725acf1d..b01d1213de 100644 --- a/nexus/src/app/background/blueprint_execution.rs +++ b/nexus/src/app/background/blueprint_execution.rs @@ -54,9 +54,10 @@ impl BlueprintExecutor { let update = self.rx_blueprint.borrow_and_update().clone(); let Some(update) = update else { - warn!(&opctx.log, - "Blueprint execution: skipped"; - "reason" => "no blueprint"); + warn!( + &opctx.log, "Blueprint execution: skipped"; + "reason" => "no blueprint", + ); return json!({"error": "no blueprint" }); }; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b759f86f1b..c31e32b028 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3461,6 +3461,24 @@ CREATE TABLE IF NOT EXISTS omicron.public.bp_omicron_zone_nic ( PRIMARY KEY (blueprint_id, id) ); +-- Mapping of Omicron zone ID to CockroachDB node ID. This isn't directly used +-- by the blueprint tables above, but is used by the more general Reconfigurator +-- system along with them (e.g., to decommission expunged CRDB nodes). +CREATE TABLE IF NOT EXISTS omicron.public.cockroachdb_zone_id_to_node_id ( + omicron_zone_id UUID NOT NULL PRIMARY KEY, + crdb_node_id TEXT NOT NULL +); + +-- We already effectively have a unique crdb_node_id for each omicron_zone_id +-- because it's the primary key; however, an insert query that uses `ON CONFLICT +-- DO NOTHING` for idempotency requires a UNIQUE constraint on the pair. 
+CREATE UNIQUE INDEX IF NOT EXISTS cockroachdb_zone_id_to_node_id_unique_pair ON
+    omicron.public.cockroachdb_zone_id_to_node_id (
+        omicron_zone_id,
+        crdb_node_id
+    );
+
+
 /*******************************************************************/
 
 /*
From 9935e87678a572ad457d91ca1d5b1aefb7d0701b Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Tue, 4 Jun 2024 11:28:02 -0400
Subject: [PATCH 03/19] cargo fmt

---
 nexus/src/app/background/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs
index 60f911e98d..7d1fc43d69 100644
--- a/nexus/src/app/background/mod.rs
+++ b/nexus/src/app/background/mod.rs
@@ -9,6 +9,7 @@ mod bfd;
 mod blueprint_execution;
 mod blueprint_load;
 mod common;
+mod crdb_node_id_collector;
 mod dns_config;
 mod dns_propagation;
 mod dns_servers;
@@ -27,6 +28,5 @@ mod status;
 mod sync_service_zone_nat;
 mod sync_switch_configuration;
 mod v2p_mappings;
-mod crdb_node_id_collector;
 
 pub use init::BackgroundTasks;

From b8f69f27794cf47ef959f20b5b93d62febaa0446 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Tue, 4 Jun 2024 14:09:33 -0400
Subject: [PATCH 04/19] add API to get local node ID from cockroach-admin
 server

---
 Cargo.lock                              |   2 +
 cockroach-admin/Cargo.toml              |   2 +
 cockroach-admin/src/cockroach_cli.rs    |   4 +
 cockroach-admin/src/context.rs          | 118 ++++++++++++++++++++++++
 cockroach-admin/src/http_entrypoints.rs |  26 ++++++
 cockroach-admin/src/lib.rs              |   5 +-
 6 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index 8b73743ef3..fa1f17b007 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5254,6 +5254,7 @@ dependencies = [
  "omicron-rpaths",
  "omicron-test-utils",
  "omicron-workspace-hack",
+ "once_cell",
  "pq-sys",
  "schemars",
  "serde",
@@ -5263,6 +5264,7 @@ dependencies = [
  "slog-error-chain",
  "thiserror",
  "tokio",
+ "tokio-postgres",
  "toml 0.8.13",
  "url",
 ]
diff --git a/cockroach-admin/Cargo.toml b/cockroach-admin/Cargo.toml
index e0c02493c2..84cfbbc21b 100644
--- a/cockroach-admin/Cargo.toml
+++ b/cockroach-admin/Cargo.toml
@@ -17,6 +17,7 @@ dropshot.workspace = true
 http.workspace = true
 illumos-utils.workspace = true
 omicron-common.workspace = true
+once_cell.workspace = true
 # See omicron-rpaths for more about the "pq-sys" dependency.
 pq-sys = "*"
 schemars.workspace = true
@@ -27,6 +28,7 @@ slog-error-chain.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-postgres.workspace = true
 toml.workspace = true
 omicron-workspace-hack.workspace = true
 
diff --git a/cockroach-admin/src/cockroach_cli.rs b/cockroach-admin/src/cockroach_cli.rs
index 5b3958546f..00478b81a1 100644
--- a/cockroach-admin/src/cockroach_cli.rs
+++ b/cockroach-admin/src/cockroach_cli.rs
@@ -75,6 +75,10 @@ impl CockroachCli {
         Self { path_to_cockroach_binary, cockroach_address }
     }
 
+    pub fn cockroach_address(&self) -> SocketAddrV6 {
+        self.cockroach_address
+    }
+
     pub async fn node_status(
         &self,
     ) -> Result<Vec<NodeStatus>, CockroachCliError> {
diff --git a/cockroach-admin/src/context.rs b/cockroach-admin/src/context.rs
index b3f39f463a..a81c54e16c 100644
--- a/cockroach-admin/src/context.rs
+++ b/cockroach-admin/src/context.rs
@@ -3,7 +3,125 @@
 use crate::CockroachCli;
+use anyhow::Context;
+use dropshot::HttpError;
+use slog::Logger;
+use slog_error_chain::InlineErrorChain;
+use tokio::sync::OnceCell;
 
 pub struct ServerContext {
     pub cockroach_cli: CockroachCli,
+    // Cockroach node IDs never change; we defer contacting our local node to
+    // ask for its ID until we need to, but once we have it we don't need to ask
+    // again.
+    node_id: OnceCell<String>,
+    log: Logger,
+}
+
+impl ServerContext {
+    pub fn new(cockroach_cli: CockroachCli, log: Logger) -> Self {
+        Self { cockroach_cli, node_id: OnceCell::new(), log }
+    }
+
+    pub async fn node_id(&self) -> Result<&str, HttpError> {
+        match self
+            .node_id
+            .get_or_try_init(|| self.read_node_id_from_cockroach())
+            .await
+        {
+            Ok(id) => Ok(id.as_str()),
+            Err(err) => {
+                let message = format!(
+                    "failed to read node ID from local cockroach instance: \
+                     {err:#}",
+                );
+                Err(HttpError {
+                    status_code: http::StatusCode::SERVICE_UNAVAILABLE,
+                    error_code: None,
+                    external_message: message.clone(),
+                    internal_message: message,
+                })
+            }
+        }
+    }
+
+    async fn read_node_id_from_cockroach(&self) -> anyhow::Result<String> {
+        // TODO-cleanup This connection string is duplicated in Nexus - maybe we
+        // should centralize it? I'm not sure where we could put it -
+        // omicron_common, perhaps?
+        let connect_url = format!(
+            "postgresql://root@{}/omicron?sslmode=disable",
+            self.cockroach_cli.cockroach_address()
+        );
+        let (client, connection) =
+            tokio_postgres::connect(&connect_url, tokio_postgres::NoTls)
+                .await
+                .with_context(|| {
+                    format!("failed to connect to {connect_url}")
+                })?;
+
+        let log = self.log.clone();
+        tokio::spawn(async move {
+            if let Err(e) = connection.await {
+                slog::warn!(
+                    log, "connection error reading node ID";
+                    "err" => InlineErrorChain::new(&e),
+                );
+            }
+        });
+
+        // This uses an undocumented internal function - not awesome, but we're
+        // told this is "unlikely to change for some time".
+        // https://github.com/cockroachdb/cockroach/issues/124988 requests that
+        // this be documented (or an alternative be documented / provided).
+        let row = client
+            .query_one("SELECT crdb_internal.node_id()::TEXT", &[])
+            .await
+            .context("failed to send node ID query")?;
+
+        row.try_get(0).context("failed to read results of node ID query")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use nexus_test_utils::db::test_setup_database;
+    use omicron_test_utils::dev;
+    use std::net::SocketAddrV6;
+    use url::Url;
+
+    #[tokio::test]
+    async fn test_node_id() {
+        let logctx = dev::test_setup_log("test_node_id");
+        let mut db = test_setup_database(&logctx.log).await;
+
+        // Construct a `ServerContext`.
+        let db_url = db.listen_url().to_string();
+        let url: Url = db_url.parse().expect("valid url");
+        let cockroach_address: SocketAddrV6 = format!(
+            "{}:{}",
+            url.host().expect("url has host"),
+            url.port().expect("url has port")
+        )
+        .parse()
+        .expect("valid SocketAddrV6");
+        let cli = CockroachCli::new("cockroach".into(), cockroach_address);
+        let context = ServerContext::new(cli, logctx.log.clone());
+
+        // We should be able to fetch a node id, and it should be `1` (since we
+        // have a single-node test cockroach instance).
+        let node_id =
+            context.node_id().await.expect("successfully read node ID");
+        assert_eq!(node_id, "1");
+
+        // The `OnceCell` should be populated now; even if we shut down the DB,
+        // we can still fetch the node ID.
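+        // (`node_id()` will now return the value cached in the `OnceCell`
+        // instead of opening a new database connection.)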
+        db.cleanup().await.unwrap();
+        let node_id =
+            context.node_id().await.expect("successfully read node ID");
+        assert_eq!(node_id, "1");
+
+        logctx.cleanup_successful();
+    }
+}
diff --git a/cockroach-admin/src/http_entrypoints.rs b/cockroach-admin/src/http_entrypoints.rs
index 24d36c9823..cc9b14087a 100644
--- a/cockroach-admin/src/http_entrypoints.rs
+++ b/cockroach-admin/src/http_entrypoints.rs
@@ -17,6 +17,7 @@ type CrdbApiDescription = dropshot::ApiDescription<Arc<ServerContext>>;
 
 pub fn api() -> CrdbApiDescription {
     fn register_endpoints(api: &mut CrdbApiDescription) -> Result<(), String> {
+        api.register(node_id)?;
         api.register(node_status)?;
         Ok(())
     }
@@ -47,3 +48,28 @@ async fn node_status(
     ctx.cockroach_cli.node_status().await.map_err(HttpError::from)?;
     Ok(HttpResponseOk(ClusterNodeStatus { all_nodes }))
 }
+
+/// CockroachDB Node ID
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
+#[serde(rename_all = "snake_case")]
+pub struct NodeId {
+    // CockroachDB node IDs are integers, in practice, but our use of them is as
+    // input and output to the `cockroach` CLI. We use a string which is a bit
+    // more natural (no need to parse CLI output or stringify an ID to send it
+    // as input) and leaves open the door for the format to change in the
+    // future.
+    pub id: String,
+}
+
+/// Get the CockroachDB node ID of the local cockroach instance.
+#[endpoint {
+    method = GET,
+    path = "/node/id",
+}]
+async fn node_id(
+    rqctx: RequestContext<Arc<ServerContext>>,
+) -> Result<HttpResponseOk<NodeId>, HttpError> {
+    let ctx = rqctx.context();
+    let id = ctx.node_id().await?.to_string();
+    Ok(HttpResponseOk(NodeId { id }))
+}
diff --git a/cockroach-admin/src/lib.rs b/cockroach-admin/src/lib.rs
index d6c53c8dc6..f7cc90176f 100644
--- a/cockroach-admin/src/lib.rs
+++ b/cockroach-admin/src/lib.rs
@@ -72,7 +72,10 @@ pub async fn start_server(
         }
     }
 
-    let context = ServerContext { cockroach_cli };
+    let context = ServerContext::new(
+        cockroach_cli,
+        log.new(slog::o!("component" => "ServerContext")),
+    );
     let http_server_starter = dropshot::HttpServerStarter::new(
         &server_config.dropshot,
         http_entrypoints::api(),

From 8aca0a29a4bcf52ca54ad6089bc9b3fbcdaa23af Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Thu, 30 May 2024 15:48:16 -0400
Subject: [PATCH 05/19] Add cockroach-admin OpenAPI and client crate

---
 Cargo.lock                                         |  17 ++
 Cargo.toml                                         |   3 +
 clients/cockroach-admin-client/Cargo.toml          |  16 ++
 clients/cockroach-admin-client/src/lib.rs          |  21 +++
 cockroach-admin/Cargo.toml                         |   5 +
 .../tests/integration_tests/commands.rs            |  43 +++++
 .../tests/integration_tests/mod.rs                 |   5 +
 cockroach-admin/tests/mod.rs                       |   5 +
 .../output/cmd-cockroach-admin-openapi-stderr      |   0
 openapi/cockroach-admin.json                       | 168 ++++++++++++++++++
 10 files changed, 283 insertions(+)
 create mode 100644 clients/cockroach-admin-client/Cargo.toml
 create mode 100644 clients/cockroach-admin-client/src/lib.rs
 create mode 100644 cockroach-admin/tests/integration_tests/commands.rs
 create mode 100644 cockroach-admin/tests/integration_tests/mod.rs
 create mode 100644 cockroach-admin/tests/mod.rs
 create mode 100644 cockroach-admin/tests/output/cmd-cockroach-admin-openapi-stderr
 create mode 100644 openapi/cockroach-admin.json

diff --git a/Cargo.lock b/Cargo.lock
index fa1f17b007..2d0cc83cd4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1062,6 +1062,18 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15"
 
+[[package]]
+name = "cockroach-admin-client"
+version = "0.1.0"
+dependencies = 
[ + "chrono", + "progenitor", + "reqwest", + "schemars", + "serde", + "slog", +] + [[package]] name = "colorchoice" version = "1.0.1" @@ -5247,6 +5259,7 @@ dependencies = [ "clap", "csv", "dropshot", + "expectorate", "http 0.2.12", "illumos-utils", "nexus-test-utils", @@ -5255,13 +5268,17 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "once_cell", + "openapi-lint", + "openapiv3", "pq-sys", "schemars", "serde", + "serde_json", "slog", "slog-async", "slog-dtrace", "slog-error-chain", + "subprocess", "thiserror", "tokio", "tokio-postgres", diff --git a/Cargo.toml b/Cargo.toml index f9b4906779..b07664cda8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "bootstore", "certificates", "clients/bootstrap-agent-client", + "clients/cockroach-admin-client", "clients/ddm-admin-client", "clients/dns-service-client", "clients/dpd-client", @@ -89,6 +90,7 @@ default-members = [ "bootstore", "certificates", "clients/bootstrap-agent-client", + "clients/cockroach-admin-client", "clients/ddm-admin-client", "clients/dns-service-client", "clients/dpd-client", @@ -240,6 +242,7 @@ ciborium = "0.2.2" cfg-if = "1.0" chrono = { version = "0.4", features = [ "serde" ] } clap = { version = "4.5", features = ["cargo", "derive", "env", "wrap_help"] } +cockroach-admin-client = { path = "clients/cockroach-admin-client" } colored = "2.1" const_format = "0.2.32" cookie = "0.18" diff --git a/clients/cockroach-admin-client/Cargo.toml b/clients/cockroach-admin-client/Cargo.toml new file mode 100644 index 0000000000..da165d8a84 --- /dev/null +++ b/clients/cockroach-admin-client/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "cockroach-admin-client" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +chrono.workspace = true +progenitor.workspace = true +reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] } +schemars.workspace = true +serde.workspace = true +slog.workspace = true diff --git a/clients/cockroach-admin-client/src/lib.rs b/clients/cockroach-admin-client/src/lib.rs new file mode 100644 index 0000000000..c0e47c97bc --- /dev/null +++ b/clients/cockroach-admin-client/src/lib.rs @@ -0,0 +1,21 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Interface for making API requests to the cockroach-admin server
+
+progenitor::generate_api!(
+    spec = "../../openapi/cockroach-admin.json",
+    inner_type = slog::Logger,
+    pre_hook = (|log: &slog::Logger, request: &reqwest::Request| {
+        slog::debug!(log, "client request";
+            "method" => %request.method(),
+            "uri" => %request.url(),
+            "body" => ?&request.body(),
+        );
+    }),
+    post_hook = (|log: &slog::Logger, result: &Result<_, _>| {
+        slog::debug!(log, "client response"; "result" => ?result);
+    }),
+    derives = [schemars::JsonSchema],
+);
diff --git a/cockroach-admin/Cargo.toml b/cockroach-admin/Cargo.toml
index 84cfbbc21b..6307411d01 100644
--- a/cockroach-admin/Cargo.toml
+++ b/cockroach-admin/Cargo.toml
@@ -34,8 +34,13 @@ toml.workspace = true
 omicron-workspace-hack.workspace = true
 
 [dev-dependencies]
+expectorate.workspace = true
 nexus-test-utils.workspace = true
 omicron-test-utils.workspace = true
+openapi-lint.workspace = true
+openapiv3.workspace = true
+serde_json.workspace = true
+subprocess.workspace = true
 url.workspace = true
 
 [lints]
diff --git a/cockroach-admin/tests/integration_tests/commands.rs b/cockroach-admin/tests/integration_tests/commands.rs
new file mode 100644
index 0000000000..0d84fa886d
--- /dev/null
+++ b/cockroach-admin/tests/integration_tests/commands.rs
@@ -0,0 +1,43 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Tests for the executable commands in this repo.
+
+use expectorate::assert_contents;
+use omicron_test_utils::dev::test_cmds::{
+    assert_exit_code, path_to_executable, run_command, EXIT_SUCCESS,
+};
+use openapiv3::OpenAPI;
+use std::path::PathBuf;
+use subprocess::Exec;
+
+// path to executable
+const CMD_COCKROACH_ADMIN: &str = env!("CARGO_BIN_EXE_cockroach-admin");
+
+fn path_to_cockroach_admin() -> PathBuf {
+    path_to_executable(CMD_COCKROACH_ADMIN)
+}
+
+#[test]
+fn test_cockroach_admin_openapi() {
+    let exec = Exec::cmd(path_to_cockroach_admin()).arg("openapi");
+    let (exit_status, stdout_text, stderr_text) = run_command(exec);
+    assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text);
+    assert_contents(
+        "tests/output/cmd-cockroach-admin-openapi-stderr",
+        &stderr_text,
+    );
+
+    let spec: OpenAPI = serde_json::from_str(&stdout_text)
+        .expect("stdout was not valid OpenAPI");
+
+    // Check for lint errors.
+    let errors = openapi_lint::validate(&spec);
+    assert!(errors.is_empty(), "{}", errors.join("\n\n"));
+
+    // Confirm that the output hasn't changed. It's expected that we'll change
+    // this file as the API evolves, but pay attention to the diffs to ensure
+    // that the changes match your expectations.
+    assert_contents("../openapi/cockroach-admin.json", &stdout_text);
+}
diff --git a/cockroach-admin/tests/integration_tests/mod.rs b/cockroach-admin/tests/integration_tests/mod.rs
new file mode 100644
index 0000000000..1bf43dc00c
--- /dev/null
+++ b/cockroach-admin/tests/integration_tests/mod.rs
@@ -0,0 +1,5 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+mod commands;
diff --git a/cockroach-admin/tests/mod.rs b/cockroach-admin/tests/mod.rs
new file mode 100644
index 0000000000..99aeeb8299
--- /dev/null
+++ b/cockroach-admin/tests/mod.rs
@@ -0,0 +1,5 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +mod integration_tests; diff --git a/cockroach-admin/tests/output/cmd-cockroach-admin-openapi-stderr b/cockroach-admin/tests/output/cmd-cockroach-admin-openapi-stderr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openapi/cockroach-admin.json b/openapi/cockroach-admin.json new file mode 100644 index 0000000000..ddacd89ab1 --- /dev/null +++ b/openapi/cockroach-admin.json @@ -0,0 +1,168 @@ +{ + "openapi": "3.0.3", + "info": { + "title": "Oxide CockroachDb Cluster Admin API", + "description": "API for interacting with the Oxide control plane's CockroachDb cluster", + "contact": { + "url": "https://oxide.computer", + "email": "api@oxide.computer" + }, + "version": "0.0.1" + }, + "paths": { + "/node/id": { + "get": { + "summary": "Get the CockroachDB node ID of the local cockroach instance.", + "operationId": "node_id", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/NodeId" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/node/status": { + "get": { + "summary": "Get the status of all nodes in the CRDB cluster", + "operationId": "node_status", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClusterNodeStatus" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + } + }, + "components": { + "schemas": { + "ClusterNodeStatus": { + "type": "object", + "properties": { + "all_nodes": { + "type": "array", + "items": { + "$ref": "#/components/schemas/NodeStatus" + } + } + }, + "required": [ + "all_nodes" + ] + }, + "Error": { + "description": "Error information from a response.", + "type": "object", + "properties": { + "error_code": { + "type": "string" + }, + "message": { + "type": "string" + }, + "request_id": { + "type": "string" + } + }, + "required": [ + "message", + "request_id" + ] + }, + "NodeId": { + "description": "CockroachDB Node ID", + "type": "object", + "properties": { + "id": { + "type": "string" + } + }, + "required": [ + "id" + ] + }, + "NodeStatus": { + "type": "object", + "properties": { + "address": { + "type": "string" + }, + "build": { + "type": "string" + }, + "is_available": { + "type": "boolean" + }, + "is_live": { + "type": "boolean" + }, + "locality": { + "type": "string" + }, + "node_id": { + "type": "string" + }, + "sql_address": { + "type": "string" + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "updated_at": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "address", + "build", + "is_available", + "is_live", + "locality", + "node_id", + "sql_address", + "started_at", + "updated_at" + ] + } + }, + "responses": { + "Error": { + "description": "Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } +} From 2fcdabe76d413fde82d1c77a9b4af4c0925b174d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 4 Jun 2024 16:54:43 -0400 Subject: [PATCH 06/19] flesh out node ID collector RPW --- Cargo.lock | 1 + cockroach-admin/src/context.rs | 2 +- nexus/Cargo.toml | 1 + .../app/background/crdb_node_id_collector.rs | 243 ++++++++++++++++-- 4 files 
changed, 228 insertions(+), 19 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2d0cc83cd4..d9b31dd09e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5452,6 +5452,7 @@ dependencies = [
 "cancel-safe-futures",
 "chrono",
 "clap",
+ "cockroach-admin-client",
 "criterion",
 "crucible-agent-client",
 "crucible-pantry-client",
diff --git a/cockroach-admin/src/context.rs b/cockroach-admin/src/context.rs
index a81c54e16c..82869e6376 100644
--- a/cockroach-admin/src/context.rs
+++ b/cockroach-admin/src/context.rs
@@ -47,7 +47,7 @@ impl ServerContext {
 
     async fn read_node_id_from_cockroach(&self) -> anyhow::Result<String> {
         // TODO-cleanup This connection string is duplicated in Nexus - maybe we
-        // should centralize it? I'm not sure where we could put it -
+        // should centralize it? I'm not sure where we could put it;
         // omicron_common, perhaps?
         let connect_url = format!(
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 58a1e824cb..445205144d 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -22,6 +22,7 @@ camino.workspace = true
 camino-tempfile.workspace = true
 clap.workspace = true
 chrono.workspace = true
+cockroach-admin-client.workspace = true
 crucible-agent-client.workspace = true
 crucible-pantry-client.workspace = true
 dns-service-client.workspace = true
diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs
index 08ec7d1da0..f36c1d1abc 100644
--- a/nexus/src/app/background/crdb_node_id_collector.rs
+++ b/nexus/src/app/background/crdb_node_id_collector.rs
@@ -24,8 +24,11 @@
 //! whether a zone without a known node ID ever existed.
 
 use super::common::BackgroundTask;
+use anyhow::Context;
 use futures::future::BoxFuture;
+use futures::stream;
 use futures::FutureExt;
+use futures::StreamExt;
 use nexus_auth::context::OpContext;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::blueprint_zone_type;
@@ -33,7 +36,10 @@ use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::BlueprintTarget;
 use nexus_types::deployment::BlueprintZoneFilter;
 use nexus_types::deployment::BlueprintZoneType;
+use omicron_common::address::COCKROACH_ADMIN_PORT;
+use omicron_uuid_kinds::OmicronZoneUuid;
 use serde_json::json;
+use std::net::SocketAddrV6;
 use std::sync::Arc;
 use tokio::sync::watch;
 
@@ -58,9 +64,10 @@ impl CockroachNodeIdCollector {
     /// The presence of `boxed()` in `BackgroundTask::activate` has caused some
     /// confusion with compilation errors in the past. So separate this method
     /// out.
-    async fn activate_impl<'a>(
+    async fn activate_impl<T: CockroachAdminFromBlueprint>(
         &mut self,
         opctx: &OpContext,
+        addrs_from_blueprint: &T,
     ) -> serde_json::Value {
         // Get the latest blueprint, cloning to prevent holding a read lock
         // on the watch.
@@ -74,28 +81,127 @@ impl CockroachNodeIdCollector {
             return json!({"error": "no blueprint" });
         };
 
+        // With a bit of concurrency, confirm we know the node IDs for all the
+        // CRDB zones in the blueprint.
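+        // (`buffer_unordered(8)` below caps us at eight concurrent
+        // collections, yielding results in whatever order they complete.)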
+        let mut results =
+            stream::iter(addrs_from_blueprint.cockroach_admin_addrs(blueprint))
+                .map(|(zone_id, admin_addr)| {
+                    let datastore = &self.datastore;
+                    async move {
+                        ensure_node_id_known(
+                            opctx, datastore, zone_id, admin_addr,
+                        )
+                        .await
+                        .map_err(|err| (zone_id, err))
+                    }
+                })
+                .buffer_unordered(8);
+
+        let mut nsuccess = 0;
+        let mut errors = vec![];
+        while let Some(result) = results.next().await {
+            match result {
+                Ok(()) => {
+                    nsuccess += 1;
+                }
+                Err((zone_id, err)) => {
+                    errors.push(json!({
+                        "zone_id": zone_id,
+                        "err": format!("{err:#}"),
+                    }));
+                }
+            }
+        }
+
+        if errors.is_empty() {
+            json!({ "nsuccess": nsuccess })
+        } else {
+            json!({
+                "nsuccess": nsuccess,
+                "errors": errors,
+            })
+        }
+    }
+}
+
+// This trait exists so we can inject addresses in our unit tests below that
+// aren't required to have admin servers listening on the fixed
+// `COCKROACH_ADMIN_PORT`.
+trait CockroachAdminFromBlueprint {
+    fn cockroach_admin_addrs<'a>(
+        &self,
+        blueprint: &'a Blueprint,
+    ) -> impl Iterator<Item = (OmicronZoneUuid, SocketAddrV6)> + 'a;
+}
+
+struct CockroachAdminFromBlueprintViaFixedPort;
+
+impl CockroachAdminFromBlueprint for CockroachAdminFromBlueprintViaFixedPort {
+    fn cockroach_admin_addrs<'a>(
+        &self,
+        blueprint: &'a Blueprint,
+    ) -> impl Iterator<Item = (OmicronZoneUuid, SocketAddrV6)> + 'a {
         // We can only actively collect from zones that should be running; if
         // there are CRDB zones in other states that still need their node ID
         // collected, we have to wait until they're running.
-        for (_sled_id, zone) in
-            blueprint.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning)
-        {
-            let BlueprintZoneType::CockroachDb(
-                blueprint_zone_type::CockroachDb { address, .. },
-            ) = &zone.zone_type
-            else {
-                continue;
-            };
+        let zone_filter = BlueprintZoneFilter::ShouldBeRunning;
 
-            // TODO 1 continue if we already know the node ID for this zone
-            _ = &self.datastore;
-
-            // TODO 2 collect and insert the node ID for this zone
-            _ = address;
-        }
+        blueprint.all_omicron_zones(zone_filter).filter_map(
+            |(_sled_id, zone)| match &zone.zone_type {
+                BlueprintZoneType::CockroachDb(
+                    blueprint_zone_type::CockroachDb { address, .. },
+                ) => {
+                    let mut admin_addr = *address;
+                    admin_addr.set_port(COCKROACH_ADMIN_PORT);
+                    Some((zone.id, admin_addr))
+                }
+                _ => None,
+            },
+        )
+    }
+}
+
+async fn ensure_node_id_known(
+    opctx: &OpContext,
+    datastore: &DataStore,
+    zone_id: OmicronZoneUuid,
+    admin_addr: SocketAddrV6,
+) -> anyhow::Result<()> {
+    // Do we already know the node ID for this zone?
+    if datastore
+        .cockroachdb_node_id(opctx, zone_id)
+        .await
+        .with_context(|| {
+            format!("fetching existing node ID for zone {zone_id}")
+        })?
+        .is_some()
+    {
+        return Ok(());
+    }
+
+    // We don't know the node ID; contact the admin server and ask if it knows.
+    let admin_url = format!("http://{admin_addr}");
+    let admin_client =
+        cockroach_admin_client::Client::new(&admin_url, opctx.log.clone());
+    let node_id = admin_client
+        .node_id()
+        .await
+        .with_context(|| {
+            format!("failed to fetch node ID for zone {zone_id} at {admin_url}")
+        })?
+        .into_inner()
+        .id;
+
+    // Record this value. We have a harmless TOCTOU here; if multiple Nexus
+    // instances all checked for a node ID, found none, and get here, this call
+    // is idempotent (as long as they all are inserting the same node ID, which
+    // they certainly should be!).
+    datastore
+        .set_cockroachdb_node_id(opctx, zone_id, node_id.clone())
+        .await
+        .with_context(|| {
+            format!("failed to record node ID {node_id} for zone {zone_id}")
+        })
+}
 
 impl BackgroundTask for CockroachNodeIdCollector {
     fn activate<'a>(
         &'a mut self,
         opctx: &'a OpContext,
     ) -> BoxFuture<'a, serde_json::Value> {
-        self.activate_impl(opctx).boxed()
+        self.activate_impl(opctx, &CockroachAdminFromBlueprintViaFixedPort)
+            .boxed()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
+    use nexus_types::deployment::BlueprintZoneConfig;
+    use nexus_types::deployment::BlueprintZoneDisposition;
+    use omicron_uuid_kinds::SledUuid;
+    use uuid::Uuid;
+
+    // The `CockroachAdminFromBlueprintViaFixedPort` type above is the standard
+    // way to map from a blueprint to an iterator of cockroach-admin addresses.
+    // We can't use that in the more thorough test below (and it exists so we
+    // can _write_ that test), so test it in isolation here.
+    #[test]
+    fn test_default_cockroach_admin_addrs_from_blueprint() {
+        // Construct an empty blueprint with one sled.
+        let sled_id = SledUuid::new_v4();
+        let mut blueprint = BlueprintBuilder::build_empty_with_sleds(
+            std::iter::once(sled_id),
+            "test",
+        );
+        let bp_zones = blueprint
+            .blueprint_zones
+            .get_mut(&sled_id)
+            .expect("found entry for test sled");
+
+        let make_crdb_zone_config =
+            |disposition, id, addr: SocketAddrV6| BlueprintZoneConfig {
+                disposition,
+                id,
+                underlay_address: *addr.ip(),
+                zone_type: BlueprintZoneType::CockroachDb(
+                    blueprint_zone_type::CockroachDb {
+                        address: addr,
+                        dataset: nexus_types::inventory::OmicronZoneDataset {
+                            pool_name: format!("oxp_{}", Uuid::new_v4())
+                                .parse()
+                                .unwrap(),
+                        },
+                    },
+                ),
+            };
+
+        // Add three CRDB zones with known addresses; the first and third are
+        // in service, and the second is expunged. Only the first and third
+        // should show up when we ask for addresses below.
+        let crdb_id1 = OmicronZoneUuid::new_v4();
+        let crdb_id2 = OmicronZoneUuid::new_v4();
+        let crdb_id3 = OmicronZoneUuid::new_v4();
+        let crdb_addr1: SocketAddrV6 = "[2001:db8::1]:1111".parse().unwrap();
+        let crdb_addr2: SocketAddrV6 = "[2001:db8::2]:1234".parse().unwrap();
+        let crdb_addr3: SocketAddrV6 = "[2001:db8::3]:1234".parse().unwrap();
+        bp_zones.zones.push(make_crdb_zone_config(
+            BlueprintZoneDisposition::InService,
+            crdb_id1,
+            crdb_addr1,
+        ));
+        bp_zones.zones.push(make_crdb_zone_config(
+            BlueprintZoneDisposition::Expunged,
+            crdb_id2,
+            crdb_addr2,
+        ));
+        bp_zones.zones.push(make_crdb_zone_config(
+            BlueprintZoneDisposition::InService,
+            crdb_id3,
+            crdb_addr3,
+        ));
+
+        // Also add a non-CRDB zone to ensure it's filtered out.
+        bp_zones.zones.push(BlueprintZoneConfig {
+            disposition: BlueprintZoneDisposition::InService,
+            id: OmicronZoneUuid::new_v4(),
+            underlay_address: "::1".parse().unwrap(),
+            zone_type: BlueprintZoneType::CruciblePantry(
+                blueprint_zone_type::CruciblePantry {
+                    address: "[::1]:0".parse().unwrap(),
+                },
+            ),
+        });
+
+        // We expect to see CRDB zones 1 and 3 with their IPs but the ports
+        // changed to `COCKROACH_ADMIN_PORT`.
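+        // (the expunged zone 2 and the non-CRDB pantry zone are absent)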
+        let expected = vec![
+            (
+                crdb_id1,
+                SocketAddrV6::new(*crdb_addr1.ip(), COCKROACH_ADMIN_PORT, 0, 0),
+            ),
+            (
+                crdb_id3,
+                SocketAddrV6::new(*crdb_addr3.ip(), COCKROACH_ADMIN_PORT, 0, 0),
+            ),
+        ];
+
+        let admin_addrs = CockroachAdminFromBlueprintViaFixedPort
+            .cockroach_admin_addrs(&blueprint)
+            .collect::<Vec<_>>();
+        assert_eq!(expected, admin_addrs);
+    }
+}

From 3f238d2195068b00a613b9a3bbf0c4329bb955ef Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Wed, 5 Jun 2024 11:37:25 -0400
Subject: [PATCH 07/19] more testing

---
 nexus/Cargo.toml                              |   1 +
 .../app/background/crdb_node_id_collector.rs  | 105 +++++++++++++++++-
 2 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 445205144d..81cf6499b2 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -119,6 +119,7 @@ hyper-rustls.workspace = true
 gateway-messages.workspace = true
 gateway-test-utils.workspace = true
 hubtools.workspace = true
+nexus-db-queries = { workspace = true, features = ["testing"] }
 nexus-test-utils-macros.workspace = true
 nexus-test-utils.workspace = true
 omicron-sled-agent.workspace = true
diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs
index f36c1d1abc..9e24ce40d9 100644
--- a/nexus/src/app/background/crdb_node_id_collector.rs
+++ b/nexus/src/app/background/crdb_node_id_collector.rs
@@ -129,7 +129,7 @@ impl CockroachNodeIdCollector {
 // `COCKROACH_ADMIN_PORT`.
 trait CockroachAdminFromBlueprint {
     fn cockroach_admin_addrs<'a>(
-        &self,
+        &'a self,
         blueprint: &'a Blueprint,
     ) -> impl Iterator<Item = (OmicronZoneUuid, SocketAddrV6)> + 'a;
 }
@@ -138,7 +138,7 @@ struct CockroachAdminFromBlueprintViaFixedPort;
 
 impl CockroachAdminFromBlueprint for CockroachAdminFromBlueprintViaFixedPort {
     fn cockroach_admin_addrs<'a>(
-        &self,
+        &'a self,
         blueprint: &'a Blueprint,
     ) -> impl Iterator<Item = (OmicronZoneUuid, SocketAddrV6)> + 'a {
@@ -217,10 +217,15 @@ impl BackgroundTask for CockroachNodeIdCollector {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use chrono::Utc;
+    use nexus_db_queries::db::datastore::pub_test_utils::datastore_test;
     use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
+    use nexus_test_utils::db::test_setup_database;
     use nexus_types::deployment::BlueprintZoneConfig;
     use nexus_types::deployment::BlueprintZoneDisposition;
+    use omicron_test_utils::dev;
     use omicron_uuid_kinds::SledUuid;
+    use std::iter;
     use uuid::Uuid;
 
     // The `CockroachAdminFromBlueprintViaFixedPort` type above is the standard
@@ -232,7 +237,7 @@ mod tests {
         // Construct an empty blueprint with one sled.
         let sled_id = SledUuid::new_v4();
         let mut blueprint = BlueprintBuilder::build_empty_with_sleds(
-            std::iter::once(sled_id),
+            iter::once(sled_id),
             "test",
         );
         let bp_zones = blueprint
@@ -312,4 +317,98 @@ mod tests {
             .collect::<Vec<_>>();
         assert_eq!(expected, admin_addrs);
     }
+
+    #[tokio::test]
+    async fn test_activate_fails_if_no_blueprint() {
+        let logctx = dev::test_setup_log("test_activate_fails_if_no_blueprint");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) =
+            datastore_test(&logctx, &db, Uuid::new_v4()).await;
+
+        let (_tx_blueprint, rx_blueprint) = watch::channel(None);
+        let mut collector =
+            CockroachNodeIdCollector::new(datastore.clone(), rx_blueprint);
+        let result = collector.activate(&opctx).await;
+
+        assert_eq!(result, json!({"error": "no blueprint"}));
+
+        db.cleanup().await.unwrap();
+        logctx.cleanup_successful();
+    }
+
+    struct FakeCockroachAdminAddrs(Vec<(OmicronZoneUuid, SocketAddrV6)>);
+
+    impl CockroachAdminFromBlueprint for FakeCockroachAdminAddrs {
+        fn cockroach_admin_addrs<'a>(
+            &'a self,
+            _blueprint: &'a Blueprint,
+        ) -> impl Iterator<Item = (OmicronZoneUuid, SocketAddrV6)> + 'a
+        {
+            self.0.iter().copied()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_activate_with_no_unknown_node_ids() {
+        let logctx =
+            dev::test_setup_log("test_activate_with_no_unknown_node_ids");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) =
+            datastore_test(&logctx, &db, Uuid::new_v4()).await;
+
+        let blueprint = BlueprintBuilder::build_empty_with_sleds(
+            iter::once(SledUuid::new_v4()),
+            "test",
+        );
+        let blueprint_target = BlueprintTarget {
+            target_id: blueprint.id,
+            enabled: true,
+            time_made_target: Utc::now(),
+        };
+
+        let (_tx_blueprint, rx_blueprint) =
+            watch::channel(Some(Arc::new((blueprint_target, blueprint))));
+        let mut collector =
+            CockroachNodeIdCollector::new(datastore.clone(), rx_blueprint);
+
+        // The blueprint is empty. This should be fine: we should get no
+        // successes and no errors.
+        let result = collector.activate(&opctx).await;
+        assert_eq!(result, json!({"nsuccess": 0}));
+
+        // Create a few fake CRDB zones, and assign them node IDs in the
+        // datastore.
+        let crdb_zones =
+            (0..5).map(|_| OmicronZoneUuid::new_v4()).collect::<Vec<_>>();
+        for (i, zone_id) in crdb_zones.iter().copied().enumerate() {
+            datastore
+                .set_cockroachdb_node_id(
+                    &opctx,
+                    zone_id,
+                    format!("test-node-{i}"),
+                )
+                .await
+                .expect("assigned fake node ID");
+        }
+
+        // Activate again, injecting our fake CRDB zones with arbitrary
+        // cockroach-admin addresses. Because the node IDs are already in the
+        // datastore, the collector shouldn't try to contact these addresses and
+        // should instead report that all nodes are recorded successfully.
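+        // ("[::1]:0" is not a real admin server; if the collector did try to
+        // contact these addresses, the collections would fail and show up in
+        // the error output instead.)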
+ let result = collector + .activate_impl( + &opctx, + &FakeCockroachAdminAddrs( + crdb_zones + .iter() + .map(|&zone_id| (zone_id, "[::1]:0".parse().unwrap())) + .collect(), + ), + ) + .await; + assert_eq!(result, json!({"nsuccess": crdb_zones.len()})); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } From d760514b9ba068800086a36335c6dd79e8832ff4 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 11:44:00 -0400 Subject: [PATCH 08/19] reject duplicate node ID records --- nexus/db-model/src/schema.rs | 2 +- .../src/db/datastore/cockroachdb_node_id.rs | 11 ++++++++++ schema/crdb/dbinit.sql | 20 ++++++++----------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index eb9ffe6b6f..0ad49f4417 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1616,7 +1616,7 @@ table! { } table! { - cockroachdb_zone_id_to_node_id (omicron_zone_id) { + cockroachdb_zone_id_to_node_id (omicron_zone_id, crdb_node_id) { omicron_zone_id -> Uuid, crdb_node_id -> Text, } diff --git a/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs b/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs index d1feaa988b..fee915ab59 100644 --- a/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs +++ b/nexus/db-queries/src/db/datastore/cockroachdb_node_id.rs @@ -132,6 +132,17 @@ mod tests { .await .expect_err("failed to set node ID"); + // We can't assign the same node ID to a different zone, either. + let different_zone_id = OmicronZoneUuid::new_v4(); + datastore + .set_cockroachdb_node_id( + &opctx, + different_zone_id, + fake_node_id.to_string(), + ) + .await + .expect_err("failed to set node ID"); + // We can reassign the same node ID (i.e., setting is idempotent). datastore .set_cockroachdb_node_id( diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c31e32b028..8c08a9e9dd 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3465,19 +3465,15 @@ CREATE TABLE IF NOT EXISTS omicron.public.bp_omicron_zone_nic ( -- by the blueprint tables above, but is used by the more general Reconfigurator -- system along with them (e.g., to decommission expunged CRDB nodes). CREATE TABLE IF NOT EXISTS omicron.public.cockroachdb_zone_id_to_node_id ( - omicron_zone_id UUID NOT NULL PRIMARY KEY, - crdb_node_id TEXT NOT NULL -); - --- We already effectively have a unique crdb_node_id for each omicron_zone_id --- because it's the primary key; however, an insert query that uses `ON CONFLICT --- DO NOTHING` for idempotency requires a UNIQUE constraint on the pair. -CREATE UNIQUE INDEX IF NOT EXISTS cockroachdb_zone_id_to_node_id_unique_pair ON - omicron.public.cockroachdb_zone_id_to_node_id ( - omicron_zone_id, - crdb_node_id - ); + omicron_zone_id UUID NOT NULL UNIQUE, + crdb_node_id TEXT NOT NULL UNIQUE, + -- We require the pair to be unique, and also require each column to be + -- unique: there should only be one entry for a given zone ID, one entry for + -- a given node ID, and we need a unique requirement on the pair (via this + -- primary key) to support `ON CONFLICT DO NOTHING` idempotent inserts. 
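+    -- (For example, rows (zone1, nodeA) and (zone1, nodeB) conflict on
+    -- omicron_zone_id, and rows (zone1, nodeA) and (zone2, nodeA) conflict
+    -- on crdb_node_id.)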
+ PRIMARY KEY (omicron_zone_id, crdb_node_id) +); /*******************************************************************/ From 15f93fa118b347e325cc31492fc1f94594f24778 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 12:39:16 -0400 Subject: [PATCH 09/19] add zone id to node id endpoint --- Cargo.lock | 2 ++ clients/cockroach-admin-client/Cargo.toml | 1 + clients/cockroach-admin-client/src/lib.rs | 3 +++ cockroach-admin/Cargo.toml | 1 + cockroach-admin/src/bin/cockroach-admin.rs | 17 +++++++++--- cockroach-admin/src/context.rs | 26 ++++++++++++++++--- cockroach-admin/src/http_entrypoints.rs | 18 ++++++++++--- cockroach-admin/src/lib.rs | 3 +++ .../app/background/crdb_node_id_collector.rs | 24 +++++++++++++---- openapi/cockroach-admin.json | 17 ++++++++++-- sled-agent/src/services.rs | 2 ++ smf/cockroach-admin/manifest.xml | 1 + smf/cockroach-admin/method_script.sh | 2 ++ 13 files changed, 98 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d9b31dd09e..a9fd3f5766 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1067,6 +1067,7 @@ name = "cockroach-admin-client" version = "0.1.0" dependencies = [ "chrono", + "omicron-uuid-kinds", "progenitor", "reqwest", "schemars", @@ -5266,6 +5267,7 @@ dependencies = [ "omicron-common", "omicron-rpaths", "omicron-test-utils", + "omicron-uuid-kinds", "omicron-workspace-hack", "once_cell", "openapi-lint", diff --git a/clients/cockroach-admin-client/Cargo.toml b/clients/cockroach-admin-client/Cargo.toml index da165d8a84..87a0062285 100644 --- a/clients/cockroach-admin-client/Cargo.toml +++ b/clients/cockroach-admin-client/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] chrono.workspace = true +omicron-uuid-kinds.workspace = true progenitor.workspace = true reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] } schemars.workspace = true diff --git a/clients/cockroach-admin-client/src/lib.rs b/clients/cockroach-admin-client/src/lib.rs index c0e47c97bc..cffeb9e95a 100644 --- a/clients/cockroach-admin-client/src/lib.rs +++ b/clients/cockroach-admin-client/src/lib.rs @@ -18,4 +18,7 @@ progenitor::generate_api!( slog::debug!(log, "client response"; "result" => ?result); }), derives = [schemars::JsonSchema], + replace = { + TypedUuidForOmicronZoneKind = omicron_uuid_kinds::OmicronZoneUuid, + } ); diff --git a/cockroach-admin/Cargo.toml b/cockroach-admin/Cargo.toml index 6307411d01..49401afb9d 100644 --- a/cockroach-admin/Cargo.toml +++ b/cockroach-admin/Cargo.toml @@ -17,6 +17,7 @@ dropshot.workspace = true http.workspace = true illumos-utils.workspace = true omicron-common.workspace = true +omicron-uuid-kinds.workspace = true once_cell.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" diff --git a/cockroach-admin/src/bin/cockroach-admin.rs b/cockroach-admin/src/bin/cockroach-admin.rs index eb28082faa..0399c8bbb0 100644 --- a/cockroach-admin/src/bin/cockroach-admin.rs +++ b/cockroach-admin/src/bin/cockroach-admin.rs @@ -12,6 +12,7 @@ use omicron_cockroach_admin::CockroachCli; use omicron_cockroach_admin::Config; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; +use omicron_uuid_kinds::OmicronZoneUuid; use std::net::SocketAddr; use std::net::SocketAddrV6; @@ -38,6 +39,10 @@ enum Args { /// Path to the server config file #[clap(long, action)] config_file_path: Utf8PathBuf, + + /// ID of the zone within which we're running + #[clap(long, action)] + zone_id: OmicronZoneUuid, }, } @@ -59,16 +64,20 @@ async fn main_impl() -> Result<(), CmdError> { cockroach_address, http_address, config_file_path, + zone_id, } => { let cockroach_cli = CockroachCli::new(path_to_cockroach_binary, cockroach_address); let mut config = Config::from_file(&config_file_path) .map_err(|err| CmdError::Failure(anyhow!(err)))?; config.dropshot.bind_address = SocketAddr::V6(http_address); - let server = - omicron_cockroach_admin::start_server(cockroach_cli, config) - .await - .map_err(|err| CmdError::Failure(anyhow!(err)))?; + let server = omicron_cockroach_admin::start_server( + zone_id, + cockroach_cli, + config, + ) + .await + .map_err(|err| CmdError::Failure(anyhow!(err)))?; server.await.map_err(|err| { CmdError::Failure(anyhow!( "server failed after starting: {err}" diff --git a/cockroach-admin/src/context.rs b/cockroach-admin/src/context.rs index 82869e6376..9a86743682 100644 --- a/cockroach-admin/src/context.rs +++ b/cockroach-admin/src/context.rs @@ -5,12 +5,14 @@ use crate::CockroachCli; use anyhow::Context; use dropshot::HttpError; +use omicron_uuid_kinds::OmicronZoneUuid; use slog::Logger; use slog_error_chain::InlineErrorChain; use tokio::sync::OnceCell; pub struct ServerContext { - pub cockroach_cli: CockroachCli, + zone_id: OmicronZoneUuid, + cockroach_cli: CockroachCli, // Cockroach node IDs never change; we defer contacting our local node to // ask for its ID until we need to, but once we have it we don't need to ask // again. @@ -19,8 +21,20 @@ pub struct ServerContext { } impl ServerContext { - pub fn new(cockroach_cli: CockroachCli, log: Logger) -> Self { - Self { cockroach_cli, node_id: OnceCell::new(), log } + pub fn new( + zone_id: OmicronZoneUuid, + cockroach_cli: CockroachCli, + log: Logger, + ) -> Self { + Self { zone_id, cockroach_cli, node_id: OnceCell::new(), log } + } + + pub fn cockroach_cli(&self) -> &CockroachCli { + &self.cockroach_cli + } + + pub fn zone_id(&self) -> OmicronZoneUuid { + self.zone_id } pub async fn node_id(&self) -> Result<&str, HttpError> { @@ -107,7 +121,11 @@ mod tests { .parse() .expect("valid SocketAddrV6"); let cli = CockroachCli::new("cockroach".into(), cockroach_address); - let context = ServerContext::new(cli, logctx.log.clone()); + let context = ServerContext::new( + OmicronZoneUuid::new_v4(), + cli, + logctx.log.clone(), + ); // We should be able to fetch a node id, and it should be `1` (since we // have a single-node test cockroach instance). 
diff --git a/cockroach-admin/src/http_entrypoints.rs b/cockroach-admin/src/http_entrypoints.rs
index cc9b14087a..bf12eb933b 100644
--- a/cockroach-admin/src/http_entrypoints.rs
+++ b/cockroach-admin/src/http_entrypoints.rs
@@ -8,6 +8,7 @@ use dropshot::endpoint;
 use dropshot::HttpError;
 use dropshot::HttpResponseOk;
 use dropshot::RequestContext;
+use omicron_uuid_kinds::OmicronZoneUuid;
 use schemars::JsonSchema;
 use serde::Deserialize;
 use serde::Serialize;
@@ -45,7 +46,7 @@ async fn node_status(
 ) -> Result<HttpResponseOk<ClusterNodeStatus>, HttpError> {
     let ctx = rqctx.context();
     let all_nodes =
-        ctx.cockroach_cli.node_status().await.map_err(HttpError::from)?;
+        ctx.cockroach_cli().node_status().await.map_err(HttpError::from)?;
     Ok(HttpResponseOk(ClusterNodeStatus { all_nodes }))
 }
 
@@ -53,12 +54,20 @@ async fn node_status(
 #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)]
 #[serde(rename_all = "snake_case")]
 pub struct NodeId {
+    /// The ID of this Omicron zone.
+    ///
+    /// This is included to ensure correctness even if a socket address on a
+    /// sled is reused for a different zone; if our caller is trying to
+    /// determine the node ID for a particular Omicron CockroachDB zone, they'll
+    /// contact us by socket address. We include our zone ID in the response for
+    /// their confirmation that we are the zone they intended to contact.
+    pub zone_id: OmicronZoneUuid,
     // CockroachDB node IDs are integers, in practice, but our use of them is as
     // input and output to the `cockroach` CLI. We use a string which is a bit
     // more natural (no need to parse CLI output or stringify an ID to send it
     // as input) and leaves open the door for the format to change in the
     // future.
-    pub id: String,
+    pub node_id: String,
 }
 
 /// Get the CockroachDB node ID of the local cockroach instance.
@@ -70,6 +79,7 @@ async fn node_id(
     rqctx: RequestContext<Arc<ServerContext>>,
 ) -> Result<HttpResponseOk<NodeId>, HttpError> {
     let ctx = rqctx.context();
-    let id = ctx.node_id().await?.to_string();
-    Ok(HttpResponseOk(NodeId { id }))
+    let node_id = ctx.node_id().await?.to_string();
+    let zone_id = ctx.zone_id();
+    Ok(HttpResponseOk(NodeId { zone_id, node_id }))
 }
diff --git a/cockroach-admin/src/lib.rs b/cockroach-admin/src/lib.rs
index f7cc90176f..f4a32cb6c0 100644
--- a/cockroach-admin/src/lib.rs
+++ b/cockroach-admin/src/lib.rs
@@ -4,6 +4,7 @@
 
 use context::ServerContext;
 use omicron_common::FileKv;
+use omicron_uuid_kinds::OmicronZoneUuid;
 use slog::debug;
 use slog::error;
 use slog::Drain;
@@ -51,6 +52,7 @@ pub type Server = dropshot::HttpServer<Arc<ServerContext>>;
 
 /// Start the dropshot server
 pub async fn start_server(
+    zone_id: OmicronZoneUuid,
     cockroach_cli: CockroachCli,
     server_config: Config,
 ) -> Result<Server> {
@@ -73,6 +75,7 @@ pub async fn start_server(
     }
 
     let context = ServerContext::new(
+        zone_id,
         cockroach_cli,
         log.new(slog::o!("component" => "ServerContext")),
     );
diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs
index 9e24ce40d9..50135529c9 100644
--- a/nexus/src/app/background/crdb_node_id_collector.rs
+++ b/nexus/src/app/background/crdb_node_id_collector.rs
@@ -24,6 +24,7 @@
 //! whether a zone without a known node ID ever existed.
use super::common::BackgroundTask; +use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; use futures::stream; @@ -183,24 +184,37 @@ async fn ensure_node_id_known( let admin_url = format!("http://{admin_addr}"); let admin_client = cockroach_admin_client::Client::new(&admin_url, opctx.log.clone()); - let node_id = admin_client + let node = admin_client .node_id() .await .with_context(|| { format!("failed to fetch node ID for zone {zone_id} at {admin_url}") })? - .into_inner() - .id; + .into_inner(); + + // Ensure the address we have for this zone is the zone we think it is. + // Absent bugs, the only way this can fail is if our blueprint is out of + // date, and there's now a new zone running at `admin_addr`; we _should_ + // fail in that case, and we'll catch up to reality when we reload the + // target blueprint. + ensure!( + zone_id == node.zone_id, + "expected cockroach zone {zone_id} at {admin_url}, but found zone {}", + node.zone_id + ); // Record this value. We have a harmless TOCTOU here; if multiple Nexus // instances all checked for a node ID, found none, and get here, this call // is idempotent (as long as they all are inserting the same node ID, which // they certainly should be!). datastore - .set_cockroachdb_node_id(opctx, zone_id, node_id.clone()) + .set_cockroachdb_node_id(opctx, zone_id, node.node_id.clone()) .await .with_context(|| { - format!("failed to record node ID {node_id} for zone {zone_id}") + format!( + "failed to record node ID {} for zone {zone_id}", + node.node_id + ) }) } diff --git a/openapi/cockroach-admin.json b/openapi/cockroach-admin.json index ddacd89ab1..a46b0014a1 100644 --- a/openapi/cockroach-admin.json +++ b/openapi/cockroach-admin.json @@ -98,12 +98,21 @@ "description": "CockroachDB Node ID", "type": "object", "properties": { - "id": { + "node_id": { "type": "string" + }, + "zone_id": { + "description": "The ID of this Omicron zone.\n\nThis is included to ensure correctness even if a socket address on a sled is reused for a different zone; if our caller is trying to determine the node ID for a particular Omicron CockroachDB zone, they'll contact us by socket address. We include our zone ID in the response for their confirmation that we are the zone they intended to contact.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForOmicronZoneKind" + } + ] } }, "required": [ - "id" + "node_id", + "zone_id" ] }, "NodeStatus": { @@ -150,6 +159,10 @@ "started_at", "updated_at" ] + }, + "TypedUuidForOmicronZoneKind": { + "type": "string", + "format": "uuid" } }, "responses": { diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 7df9f06d53..70d68f6a8e 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1693,6 +1693,7 @@ impl ServiceManager { ZoneArgs::Omicron(OmicronZoneConfigLocal { zone: OmicronZoneConfig { + id: zone_id, zone_type: OmicronZoneType::CockroachDb { .. }, underlay_address, .. @@ -1734,6 +1735,7 @@ impl ServiceManager { // Configure the Omicron cockroach-admin service. 
let cockroach_admin_config = PropertyGroupBuilder::new("config") + .add_property("zone_id", "astring", zone_id.to_string()) .add_property( "cockroach_address", "astring", diff --git a/smf/cockroach-admin/manifest.xml b/smf/cockroach-admin/manifest.xml index 1d6f7c4861..3f95e1462a 100644 --- a/smf/cockroach-admin/manifest.xml +++ b/smf/cockroach-admin/manifest.xml @@ -22,6 +22,7 @@ + diff --git a/smf/cockroach-admin/method_script.sh b/smf/cockroach-admin/method_script.sh index c5f924223d..4f1e3035b0 100755 --- a/smf/cockroach-admin/method_script.sh +++ b/smf/cockroach-admin/method_script.sh @@ -6,6 +6,7 @@ set -o pipefail . /lib/svc/share/smf_include.sh +ZONE_ID="$(svcprop -c -p config/zone_id "${SMF_FMRI}")" COCKROACH_ADDR="$(svcprop -c -p config/cockroach_address "${SMF_FMRI}")" HTTP_ADDR="$(svcprop -c -p config/http_address "${SMF_FMRI}")" @@ -15,6 +16,7 @@ args=( '--path-to-cockroach-binary' "/opt/oxide/cockroachdb/bin/cockroach" '--cockroach-address' "$COCKROACH_ADDR" '--http-address' "$HTTP_ADDR" + '--zone-id' "$ZONE_ID" ) exec /opt/oxide/cockroach-admin/bin/cockroach-admin "${args[@]}" & From 4275df16652881bb65e1a76b91516f5d67f8b85e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 13:17:28 -0400 Subject: [PATCH 10/19] additional testing --- .../app/background/crdb_node_id_collector.rs | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs index 50135529c9..94d7ddc9ad 100644 --- a/nexus/src/app/background/crdb_node_id_collector.rs +++ b/nexus/src/app/background/crdb_node_id_collector.rs @@ -232,6 +232,10 @@ impl BackgroundTask for CockroachNodeIdCollector { mod tests { use super::*; use chrono::Utc; + use httptest::matchers::any; + use httptest::responders::json_encoded; + use httptest::responders::status_code; + use httptest::Expectation; use nexus_db_queries::db::datastore::pub_test_utils::datastore_test; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_test_utils::db::test_setup_database; @@ -239,7 +243,9 @@ mod tests { use nexus_types::deployment::BlueprintZoneDisposition; use omicron_test_utils::dev; use omicron_uuid_kinds::SledUuid; + use std::collections::BTreeMap; use std::iter; + use std::net::SocketAddr; use uuid::Uuid; // The `CockroachAdminFromBlueprintViaFixedPort` type above is the standard @@ -425,4 +431,143 @@ mod tests { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_activate_with_unknown_node_ids() { + // Test setup. + let logctx = dev::test_setup_log("test_activate_with_unknown_node_ids"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = + datastore_test(&logctx, &db, Uuid::new_v4()).await; + + let blueprint = BlueprintBuilder::build_empty_with_sleds( + iter::once(SledUuid::new_v4()), + "test", + ); + let blueprint_target = BlueprintTarget { + target_id: blueprint.id, + enabled: true, + time_made_target: Utc::now(), + }; + + let (_tx_blueprint, rx_blueprint) = + watch::channel(Some(Arc::new((blueprint_target, blueprint)))); + let mut collector = + CockroachNodeIdCollector::new(datastore.clone(), rx_blueprint); + + // We'll send in three Cockroach nodes for the collector to gather: + // + // 1. Node 1 will succeed + // 2. Node 2 will fail + // 3. Node 3 will succeed, but will report an unexpeted zone ID + // + // We should see one success and two errors in the activation result. 
We
+        // need to start three fake cockroach-admin servers to handle the
+        // requests.
+        let make_httptest_server = || {
+            httptest::ServerBuilder::new()
+                .bind_addr("[::1]:0".parse().unwrap())
+                .run()
+                .expect("started httptest server")
+        };
+        let crdb_zone_id1 = OmicronZoneUuid::new_v4();
+        let crdb_zone_id2 = OmicronZoneUuid::new_v4();
+        let crdb_zone_id3 = OmicronZoneUuid::new_v4();
+        let crdb_zone_id4 = OmicronZoneUuid::new_v4();
+        let crdb_node_id1 = "fake-node-1";
+        let crdb_node_id3 = "fake-node-3";
+        let mut admin1 = make_httptest_server();
+        let mut admin2 = make_httptest_server();
+        let mut admin3 = make_httptest_server();
+        let crdb_admin_addrs = FakeCockroachAdminAddrs(
+            vec![
+                (crdb_zone_id1, admin1.addr()),
+                (crdb_zone_id2, admin2.addr()),
+                (crdb_zone_id3, admin3.addr()),
+            ]
+            .into_iter()
+            .map(|(zone_id, addr)| {
+                let SocketAddr::V6(addr6) = addr else {
+                    panic!("expected IPv6 addr; got {addr}");
+                };
+                (zone_id, addr6)
+            })
+            .collect(),
+        );
+
+        // Node 1 succeeds.
+        admin1.expect(Expectation::matching(any()).times(1).respond_with(
+            json_encoded(cockroach_admin_client::types::NodeId {
+                zone_id: crdb_zone_id1,
+                node_id: crdb_node_id1.to_string(),
+            }),
+        ));
+        // Node 2 fails.
+        admin2.expect(
+            Expectation::matching(any())
+                .times(1)
+                .respond_with(status_code(503)),
+        );
+        // Node 3 succeeds, but with an unexpected zone_id.
+        admin3.expect(Expectation::matching(any()).times(1).respond_with(
+            json_encoded(cockroach_admin_client::types::NodeId {
+                zone_id: crdb_zone_id4,
+                node_id: crdb_node_id3.to_string(),
+            }),
+        ));
+
+        let result = collector.activate_impl(&opctx, &crdb_admin_addrs).await;
+
+        admin1.verify_and_clear();
+        admin2.verify_and_clear();
+        admin3.verify_and_clear();
+
+        let result = result.as_object().expect("JSON object");
+
+        // We should have one success (node 1).
+        assert_eq!(
+            result.get("nsuccess").expect("nsuccess key").as_number(),
+            Some(&serde_json::Number::from(1))
+        );
+        let errors = result
+            .get("errors")
+            .expect("errors key")
+            .as_array()
+            .expect("errors array")
+            .iter()
+            .map(|val| {
+                let error = val.as_object().expect("error object");
+                let zone_id = error
+                    .get("zone_id")
+                    .expect("zone_id key")
+                    .as_str()
+                    .expect("zone_id string");
+                let err = error
+                    .get("err")
+                    .expect("err key")
+                    .as_str()
+                    .expect("err string");
+                (zone_id, err)
+            })
+            .collect::<BTreeMap<_, _>>();
+        println!("errors: {errors:?}");
+        assert_eq!(errors.len(), 2);
+
+        // We should have an error for node 2. We don't check the specific
+        // message because it may change if progenitor changes how it reports a
+        // 503 with no body.
+        assert!(errors.contains_key(crdb_zone_id2.to_string().as_str()));
+
+        // The error message for node 3 should contain both the expected and
+        // unexpected zone IDs.
+        let crdb_zone_id3 = crdb_zone_id3.to_string();
+        let crdb_zone_id4 = crdb_zone_id4.to_string();
+        let crdb_err3 =
+            errors.get(crdb_zone_id3.as_str()).expect("error for zone 3");
+        assert!(crdb_err3.contains(&crdb_zone_id3));
+        assert!(crdb_err3.contains(&crdb_zone_id4));
+
+        db.cleanup().await.unwrap();
+        logctx.cleanup_successful();
+    }
 }

From e75e342b676f42c0ec09175d4abde796a4f0dc39 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Wed, 5 Jun 2024 13:29:38 -0400
Subject: [PATCH 11/19] add config param for new RPW activation period

---
 nexus-config/src/nexus_config.rs          | 5 +++++
 nexus/examples/config.toml                | 1 +
 nexus/src/app/background/init.rs          | 2 +-
 nexus/tests/config.test.toml              | 1 +
 smf/nexus/multi-sled/config-partial.toml  | 1 +
 smf/nexus/single-sled/config-partial.toml | 1 +
 6 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs
index 321064df49..59914df037 100644
--- a/nexus-config/src/nexus_config.rs
+++ b/nexus-config/src/nexus_config.rs
@@ -517,6 +517,11 @@ pub struct BlueprintTasksConfig {
     /// executes the latest target blueprint
     #[serde_as(as = "DurationSeconds<u64>")]
    pub period_secs_execute: Duration,
+
+    /// period (in seconds) for periodic activations of the background task that
+    /// collects the node IDs of CockroachDB zones
+    #[serde_as(as = "DurationSeconds<u64>")]
+    pub period_secs_collect_crdb_node_ids: Duration,
 }
 
 #[serde_as]
diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml
index d90c240e8e..c282232ef8 100644
--- a/nexus/examples/config.toml
+++ b/nexus/examples/config.toml
@@ -110,6 +110,7 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 blueprints.period_secs_load = 10
 blueprints.period_secs_execute = 60
+blueprints.period_secs_collect_crdb_node_ids = 180
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index ae9387c1e8..f78cb69d76 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -279,7 +279,7 @@ impl BackgroundTasks {
         let task_crdb_node_id_collector = driver.register(
             String::from("crdb_node_id_collector"),
             String::from("Collects node IDs of running CockroachDB zones"),
-            config.blueprints.period_secs_execute, // TODO-john FIXME
+            config.blueprints.period_secs_collect_crdb_node_ids,
             Box::new(crdb_node_id_collector),
             opctx.child(BTreeMap::new()),
             vec![Box::new(rx_blueprint)],
diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml
index 861d78e20c..9aed5bcb69 100644
--- a/nexus/tests/config.test.toml
+++ b/nexus/tests/config.test.toml
@@ -106,6 +106,7 @@ physical_disk_adoption.period_secs = 30
 physical_disk_adoption.disable = true
 blueprints.period_secs_load = 100
 blueprints.period_secs_execute = 600
+blueprints.period_secs_collect_crdb_node_ids = 600
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index 3827cbb38c..d4612ba15e 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -52,6 +52,7 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 blueprints.period_secs_load = 10
 blueprints.period_secs_execute = 60
+blueprints.period_secs_collect_crdb_node_ids = 180
 sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index ee04f88e59..3b158d0387 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -52,6 +52,7 @@ phantom_disks.period_secs = 30
 physical_disk_adoption.period_secs = 30
 blueprints.period_secs_load = 10
 blueprints.period_secs_execute = 60
+blueprints.period_secs_collect_crdb_node_ids = 180
 sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30

From d7ff42a05f15cca248b95c87b841a7cc4f337ad5 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Wed, 5 Jun 2024 13:33:18 -0400
Subject: [PATCH 12/19] schema migration

---
 nexus/db-model/src/schema_versions.rs               | 3 ++-
 schema/crdb/add-cockroach-zone-id-to-node-id/up.sql | 5 +++++
 schema/crdb/dbinit.sql                              | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)
 create mode 100644 schema/crdb/add-cockroach-zone-id-to-node-id/up.sql

diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 4465c3aacf..7fc2b64272 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(69, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(70, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
     // |  leaving the first copy as an example for the next person.
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(70, "add-cockroach-zone-id-to-node-id"), KnownVersion::new(69, "expose-stage0"), KnownVersion::new(68, "filter-v2p-mapping-by-instance-state"), KnownVersion::new(67, "add-instance-updater-lock"), diff --git a/schema/crdb/add-cockroach-zone-id-to-node-id/up.sql b/schema/crdb/add-cockroach-zone-id-to-node-id/up.sql new file mode 100644 index 0000000000..fdb7d00082 --- /dev/null +++ b/schema/crdb/add-cockroach-zone-id-to-node-id/up.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS omicron.public.cockroachdb_zone_id_to_node_id ( + omicron_zone_id UUID NOT NULL UNIQUE, + crdb_node_id TEXT NOT NULL UNIQUE, + PRIMARY KEY (omicron_zone_id, crdb_node_id) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 8c08a9e9dd..d10ca1dffe 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4059,7 +4059,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '69.0.0', NULL) + (TRUE, NOW(), NOW(), '70.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From f7ecb4832d18586a263a22c6902b523c46d6c2d9 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 13:40:13 -0400 Subject: [PATCH 13/19] fix up config param --- nexus-config/src/nexus_config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 59914df037..b88ce2482c 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -920,7 +920,9 @@ mod test { }, blueprints: BlueprintTasksConfig { period_secs_load: Duration::from_secs(10), - period_secs_execute: Duration::from_secs(60) + period_secs_execute: Duration::from_secs(60), + period_secs_collect_crdb_node_ids: + Duration::from_secs(180), }, sync_service_zone_nat: SyncServiceZoneNatConfig { period_secs: Duration::from_secs(30) @@ -1008,6 +1010,7 @@ mod test { phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 + blueprints.period_secs_collect_crdb_node_ids = 180 sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 From 3ec1652aa4885c6afc42d70688aee36fa2159937 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 14:48:38 -0400 Subject: [PATCH 14/19] expectorate --- dev-tools/omdb/tests/env.out | 12 ++++++++++++ dev-tools/omdb/tests/successes.out | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index ccb824cda4..174ffe5e3e 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -43,6 +43,10 @@ task: "blueprint_loader" Loads the current target blueprint from the DB +task: "crdb_node_id_collector" + Collects node IDs of running CockroachDB zones + + task: "dns_config_external" watches external DNS data stored in CockroachDB @@ -163,6 +167,10 @@ task: "blueprint_loader" Loads the current target blueprint from the DB +task: "crdb_node_id_collector" + Collects node IDs of running CockroachDB zones + + task: "dns_config_external" watches external DNS data stored in CockroachDB @@ -270,6 +278,10 @@ task: "blueprint_loader" Loads the current target blueprint from the DB +task: "crdb_node_id_collector" + Collects node IDs of running CockroachDB zones + + task: "dns_config_external" watches external DNS data stored in CockroachDB diff --git a/dev-tools/omdb/tests/successes.out 
b/dev-tools/omdb/tests/successes.out index 22d613f838..9f16c6026c 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -244,6 +244,10 @@ task: "blueprint_loader" Loads the current target blueprint from the DB +task: "crdb_node_id_collector" + Collects node IDs of running CockroachDB zones + + task: "dns_config_external" watches external DNS data stored in CockroachDB @@ -426,6 +430,13 @@ task: "bfd_manager" started at (s ago) and ran for ms last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } +task: "crdb_node_id_collector" + configured period: every 10m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last completion reported error: no blueprint + task: "external_endpoints" configured period: every 1m currently executing: no From ae55496faa837d82a3e6d5bc9ab78ee2d6d36ca2 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 14:50:06 -0400 Subject: [PATCH 15/19] hakari --- Cargo.lock | 1 + clients/cockroach-admin-client/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index a9fd3f5766..1f3590d2e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1068,6 +1068,7 @@ version = "0.1.0" dependencies = [ "chrono", "omicron-uuid-kinds", + "omicron-workspace-hack", "progenitor", "reqwest", "schemars", diff --git a/clients/cockroach-admin-client/Cargo.toml b/clients/cockroach-admin-client/Cargo.toml index 87a0062285..cbf81c708f 100644 --- a/clients/cockroach-admin-client/Cargo.toml +++ b/clients/cockroach-admin-client/Cargo.toml @@ -15,3 +15,4 @@ reqwest = { workspace = true, features = [ "json", "rustls-tls", "stream" ] } schemars.workspace = true serde.workspace = true slog.workspace = true +omicron-workspace-hack.workspace = true From edc12b3afe5a0aedf784bc9ad90ad9af852e3b54 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 5 Jun 2024 17:05:46 -0400 Subject: [PATCH 16/19] really fix nexus-config test --- nexus-config/src/nexus_config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index b88ce2482c..67acb5ec1b 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -797,6 +797,7 @@ mod test { phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 + blueprints.period_secs_collect_crdb_node_ids = 180 sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 From 51ac00a109236e06322ed5cbff6550c5cb66edb1 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 12 Jun 2024 15:24:19 -0400 Subject: [PATCH 17/19] typos / copy-paste fixes --- clients/cockroach-admin-client/src/lib.rs | 2 +- cockroach-admin/tests/integration_tests/commands.rs | 4 ++-- nexus/src/app/background/crdb_node_id_collector.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clients/cockroach-admin-client/src/lib.rs b/clients/cockroach-admin-client/src/lib.rs index cffeb9e95a..b7f067b97d 100644 --- a/clients/cockroach-admin-client/src/lib.rs +++ b/clients/cockroach-admin-client/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! 
Interface for making API requests to a Bootstrap Agent
+//! Interface for making API requests to an Omicron CockroachDB admin server
 
 progenitor::generate_api!(
     spec = "../../openapi/cockroach-admin.json",
diff --git a/cockroach-admin/tests/integration_tests/commands.rs b/cockroach-admin/tests/integration_tests/commands.rs
index 0d84fa886d..875427d948 100644
--- a/cockroach-admin/tests/integration_tests/commands.rs
+++ b/cockroach-admin/tests/integration_tests/commands.rs
@@ -13,10 +13,10 @@ use std::path::PathBuf;
 use subprocess::Exec;
 
 // path to executable
-const CMD_WICKETD: &str = env!("CARGO_BIN_EXE_cockroach-admin");
+const CMD_COCKROACH_ADMIN: &str = env!("CARGO_BIN_EXE_cockroach-admin");
 
 fn path_to_cockroach_admin() -> PathBuf {
-    path_to_executable(CMD_WICKETD)
+    path_to_executable(CMD_COCKROACH_ADMIN)
 }
 
 #[test]
diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs
index 94d7ddc9ad..962f4dd2d7 100644
--- a/nexus/src/app/background/crdb_node_id_collector.rs
+++ b/nexus/src/app/background/crdb_node_id_collector.rs
@@ -282,7 +282,7 @@ mod tests {
             ),
         };
 
-        // Add a three CRDB zones with known addresses; the first and third are
+        // Add three CRDB zones with known addresses; the first and third are
         // in service, and the second is expunged. Only the first and third
         // should show up when we ask for addresses below.
         let crdb_id1 = OmicronZoneUuid::new_v4();
@@ -459,7 +459,7 @@ mod tests {
         //
         // 1. Node 1 will succeed
         // 2. Node 2 will fail
-        // 3. Node 3 will succeed, but will report an unexpeted zone ID
+        // 3. Node 3 will succeed, but will report an unexpected zone ID
         //
         // We should see one success and two errors in the activation result. We
         // need to start three fake cockroach-admin servers to handle the

From 70c75586e26285ff22112b253c4f4ded081a064d Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Wed, 12 Jun 2024 15:53:15 -0400
Subject: [PATCH 18/19] be paranoid: check node_id against output of `node status`

---
 cockroach-admin/src/context.rs | 44 +++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/cockroach-admin/src/context.rs b/cockroach-admin/src/context.rs
index 9a86743682..ea281f7b75 100644
--- a/cockroach-admin/src/context.rs
+++ b/cockroach-admin/src/context.rs
@@ -2,7 +2,10 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
+use std::net::SocketAddr;
+
 use crate::CockroachCli;
+use anyhow::bail;
 use anyhow::Context;
 use dropshot::HttpError;
 use omicron_uuid_kinds::OmicronZoneUuid;
@@ -60,12 +63,12 @@ impl ServerContext {
     }
 
     async fn read_node_id_from_cockroach(&self) -> anyhow::Result<String> {
+        let cockroach_address = self.cockroach_cli().cockroach_address();
         // TODO-cleanup This connection string is duplicated in Nexus - maybe we
         // should centralize it? I'm not sure where we could put it;
         // omicron_common, perhaps?
let connect_url = format!( - "postgresql://root@{}/omicron?sslmode=disable", - self.cockroach_cli.cockroach_address() + "postgresql://root@{cockroach_address}/omicron?sslmode=disable", ); let (client, connection) = tokio_postgres::connect(&connect_url, tokio_postgres::NoTls) @@ -93,7 +96,42 @@ impl ServerContext { .await .context("failed to send node ID query")?; - row.try_get(0).context("failed to read results of node ID query") + let node_id = row + .try_get(0) + .context("failed to read results of node ID query")?; + + // We'll be paranoid: While it seems unlikely we could ever get an + // incorrect node ID from the internal builtin, since it's not + // documented, we don't know for sure if it's possible for our query to + // be forwarded to a different node. Let's also run `NodeStatus`, and + // ensure that this node ID's address matches the address of our local + // crdb instance. + let node_statuses = self + .cockroach_cli() + .node_status() + .await + .context("failed to get node status")?; + + let our_node_status = node_statuses + .iter() + .find(|status| status.node_id == node_id) + .with_context(|| { + format!( + "node status did not include information for our node ID \ + ({node_id}): {node_statuses:?}" + ) + })?; + + if our_node_status.address != SocketAddr::V6(cockroach_address) { + bail!( + "node ID / address mismatch: we fetched node ID {node_id} \ + from our local cockroach at {cockroach_address}, but \ + `node status` reported this node ID at address {}", + our_node_status.address + ) + } + + Ok(node_id) } } From 7478ae990fca438db6b705aa237a9635eb5d8fac Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 12 Jun 2024 15:58:45 -0400 Subject: [PATCH 19/19] typo fix --- nexus/src/app/background/crdb_node_id_collector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/crdb_node_id_collector.rs index 962f4dd2d7..2736514021 100644 --- a/nexus/src/app/background/crdb_node_id_collector.rs +++ b/nexus/src/app/background/crdb_node_id_collector.rs @@ -59,8 +59,8 @@ impl CockroachNodeIdCollector { Self { datastore, rx_blueprint } } - /// Implementation for `BackgroundTask::activate` for `BlueprintExecutor`, - /// added here to produce better compile errors. + /// Implementation for `BackgroundTask::activate`, added here to produce + /// better compile errors. /// /// The presence of `boxed()` in `BackgroundTask::activate` has caused some /// confusion with compilation errors in the past. So separate this method
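
Aside: the paranoia check added in patch 18 boils down to two assertions: the node ID read from CockroachDB's builtin table must appear in the `node status` output, and the status entry for that ID must point at our own local cockroach address. A condensed sketch of that cross-check follows; this is not the patch's code, and `NodeStatus` is a hypothetical stand-in carrying only the two fields the check consumes, not the real parsed CLI type.

use anyhow::{bail, Context};
use std::net::SocketAddr;

// Stand-in for one parsed row of `cockroach node status` output.
struct NodeStatus {
    node_id: String,
    address: SocketAddr,
}

// Confirm that `node status` lists our node ID at our local cockroach address.
fn verify_node_id(
    node_id: &str,
    local_addr: SocketAddr,
    node_statuses: &[NodeStatus],
) -> anyhow::Result<()> {
    let ours = node_statuses
        .iter()
        .find(|status| status.node_id == node_id)
        .with_context(|| {
            format!("node status has no entry for our node ID ({node_id})")
        })?;
    if ours.address != local_addr {
        bail!(
            "node ID / address mismatch: node {node_id} reported at {}, \
             not our local address {local_addr}",
            ours.address
        );
    }
    Ok(())
}

fn main() -> anyhow::Result<()> {
    let local: SocketAddr = "[::1]:32221".parse()?;
    let statuses =
        vec![NodeStatus { node_id: "1".to_string(), address: local }];
    verify_node_id("1", local, &statuses)
}

In the real patch this check runs inside `read_node_id_from_cockroach`, so a mismatch fails the whole node-ID fetch rather than caching a possibly wrong answer in the `OnceCell`.
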