diff --git a/Cargo.lock b/Cargo.lock
index 5a196d4b4f..191a9a1d6a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -12271,9 +12271,9 @@ dependencies = [
 
 [[package]]
 name = "uzers"
-version = "0.11.3"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76d283dc7e8c901e79e32d077866eaf599156cbf427fffa8289aecc52c5c3f63"
+checksum = "4df81ff504e7d82ad53e95ed1ad5b72103c11253f39238bcc0235b90768a97dd"
 dependencies = [
  "libc",
  "log",
diff --git a/Cargo.toml b/Cargo.toml
index 4caf971602..a93af21528 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -624,7 +624,7 @@ update-engine = { path = "update-engine" }
 url = "2.5.2"
 usdt = "0.5.0"
 uuid = { version = "1.10.0", features = ["serde", "v4"] }
-uzers = "0.11"
+uzers = "0.12"
 walkdir = "2.5"
 whoami = "1.5"
 wicket = { path = "wicket" }
diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs
index 056bb451f7..d8e5a89734 100644
--- a/gateway-test-utils/src/setup.rs
+++ b/gateway-test-utils/src/setup.rs
@@ -197,7 +197,11 @@ pub async fn test_setup_with_config(
             future::ready(result)
         },
         &Duration::from_millis(100),
-        &Duration::from_secs(1),
+        // This seems like a pretty long time to wait for MGS to discover the
+        // simulated SPs, but we've seen tests fail due to timeouts here in the
+        // past, so we may as well be generous:
+        // https://github.com/oxidecomputer/omicron/issues/6877
+        &Duration::from_secs(30),
     )
     .await
     .unwrap();
diff --git a/nexus/db-model/src/clickhouse_policy.rs b/nexus/db-model/src/clickhouse_policy.rs
new file mode 100644
index 0000000000..55b50d903a
--- /dev/null
+++ b/nexus/db-model/src/clickhouse_policy.rs
@@ -0,0 +1,111 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Database representation of a clickhouse deployment policy
+
+use super::impl_enum_type;
+use crate::SqlU32;
+use crate::{schema::clickhouse_policy, SqlU8};
+use chrono::{DateTime, Utc};
+use nexus_types::deployment;
+use serde::{Deserialize, Serialize};
+
+impl_enum_type!(
+    #[derive(Clone, SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "clickhouse_mode", schema = "public"))]
+    pub struct ClickhouseModeEnum;
+
+    #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = ClickhouseModeEnum)]
+    pub enum DbClickhouseMode;
+
+    // Enum values
+    SingleNodeOnly => b"single_node_only"
+    ClusterOnly => b"cluster_only"
+    Both => b"both"
+);
+
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = clickhouse_policy)]
+pub struct ClickhousePolicy {
+    pub version: SqlU32,
+    pub clickhouse_mode: DbClickhouseMode,
+    pub clickhouse_cluster_target_servers: SqlU8,
+    pub clickhouse_cluster_target_keepers: SqlU8,
+    pub time_created: DateTime<Utc>,
+}
+
+impl From<&deployment::ClickhouseMode> for DbClickhouseMode {
+    fn from(value: &deployment::ClickhouseMode) -> Self {
+        match value {
+            deployment::ClickhouseMode::SingleNodeOnly => {
+                DbClickhouseMode::SingleNodeOnly
+            }
+            deployment::ClickhouseMode::ClusterOnly { .. } => {
+                DbClickhouseMode::ClusterOnly
+            }
+            deployment::ClickhouseMode::Both { .. } => DbClickhouseMode::Both,
+        }
+    }
+}
+
+impl From<ClickhousePolicy> for deployment::ClickhousePolicy {
+    fn from(value: ClickhousePolicy) -> Self {
+        let mode = match value.clickhouse_mode {
+            DbClickhouseMode::SingleNodeOnly => {
+                deployment::ClickhouseMode::SingleNodeOnly
+            }
+            DbClickhouseMode::ClusterOnly => {
+                deployment::ClickhouseMode::ClusterOnly {
+                    target_servers: value.clickhouse_cluster_target_servers.0,
+                    target_keepers: value.clickhouse_cluster_target_keepers.0,
+                }
+            }
+            DbClickhouseMode::Both => deployment::ClickhouseMode::Both {
+                target_servers: value.clickhouse_cluster_target_servers.0,
+                target_keepers: value.clickhouse_cluster_target_keepers.0,
+            },
+        };
+
+        deployment::ClickhousePolicy {
+            version: value.version.0,
+            mode,
+            time_created: value.time_created,
+        }
+    }
+}
+
+impl From<deployment::ClickhousePolicy> for ClickhousePolicy {
+    fn from(value: deployment::ClickhousePolicy) -> Self {
+        match value.mode {
+            deployment::ClickhouseMode::SingleNodeOnly => ClickhousePolicy {
+                version: value.version.into(),
+                clickhouse_mode: DbClickhouseMode::SingleNodeOnly,
+                clickhouse_cluster_target_servers: 0.into(),
+                clickhouse_cluster_target_keepers: 0.into(),
+                time_created: value.time_created,
+            },
+            deployment::ClickhouseMode::ClusterOnly {
+                target_servers,
+                target_keepers,
+            } => ClickhousePolicy {
+                version: value.version.into(),
+                clickhouse_mode: DbClickhouseMode::ClusterOnly,
+                clickhouse_cluster_target_servers: target_servers.into(),
+                clickhouse_cluster_target_keepers: target_keepers.into(),
+                time_created: value.time_created,
+            },
+            deployment::ClickhouseMode::Both {
+                target_servers,
+                target_keepers,
+            } => ClickhousePolicy {
+                version: value.version.into(),
+                clickhouse_mode: DbClickhouseMode::Both,
+                clickhouse_cluster_target_servers: target_servers.into(),
+                clickhouse_cluster_target_keepers: target_keepers.into(),
+                time_created: value.time_created,
+            },
+        }
+    }
+}
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index 001c97b6f6..9137b0b3c3 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -17,6 +17,7 @@ mod block_size;
 mod bootstore;
 mod bytecount;
 mod certificate;
+mod clickhouse_policy;
 mod cockroachdb_node_id;
 mod collection;
 mod console_session;
@@ -133,6 +134,7 @@ pub use block_size::*;
 pub use bootstore::*;
 pub use bytecount::*;
 pub use certificate::*;
+pub use clickhouse_policy::*;
 pub use cockroachdb_node_id::*;
 pub use collection::*;
 pub use console_session::*;
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index a51fd04c8e..e0942d6b3b 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -841,6 +841,16 @@ table! {
     }
 }
 
+table! {
+    clickhouse_policy (version) {
+        version -> Int8,
+        clickhouse_mode -> crate::clickhouse_policy::ClickhouseModeEnum,
+        clickhouse_cluster_target_servers -> Int2,
+        clickhouse_cluster_target_keepers -> Int2,
+        time_created -> Timestamptz,
+    }
+}
+
 table! {
     rack (id) {
         id -> Uuid,
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index d1d82b25fb..f75264d18a 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(109, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(110, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
     // |  leaving the first copy as an example for the next person.
     // v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(110, "clickhouse-policy"),
     KnownVersion::new(109, "inv-clickhouse-keeper-membership"),
     KnownVersion::new(108, "internet-gateway"),
     KnownVersion::new(107, "add-instance-boot-disk"),
diff --git a/nexus/db-queries/src/db/datastore/clickhouse_policy.rs b/nexus/db-queries/src/db/datastore/clickhouse_policy.rs
new file mode 100644
index 0000000000..d433bb9b60
--- /dev/null
+++ b/nexus/db-queries/src/db/datastore/clickhouse_policy.rs
@@ -0,0 +1,284 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Queries related to clickhouse policy
+
+use super::DataStore;
+use crate::authz;
+use crate::context::OpContext;
+use crate::db;
+use crate::db::error::public_error_from_diesel;
+use crate::db::error::ErrorHandler;
+use crate::db::pagination::paginated;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use diesel::dsl::sql_query;
+use diesel::expression::SelectableHelper;
+use diesel::sql_types;
+use diesel::ExpressionMethods;
+use diesel::OptionalExtension;
+use diesel::QueryDsl;
+use nexus_db_model::ClickhouseModeEnum;
+use nexus_db_model::ClickhousePolicy as DbClickhousePolicy;
+use nexus_db_model::DbClickhouseMode;
+use nexus_db_model::SqlU32;
+use nexus_db_model::SqlU8;
+use nexus_types::deployment::ClickhousePolicy;
+use omicron_common::api::external::DataPageParams;
+use omicron_common::api::external::Error;
+use omicron_common::api::external::ListResultVec;
+
+impl DataStore {
+    pub async fn clickhouse_policy_list(
+        &self,
+        opctx: &OpContext,
+        pagparams: &DataPageParams<'_, SqlU32>,
+    ) -> ListResultVec<ClickhousePolicy> {
+        use db::schema::clickhouse_policy;
+
+        opctx
+            .authorize(authz::Action::ListChildren, &authz::BLUEPRINT_CONFIG)
+            .await?;
+
+        let policies = paginated(
+            clickhouse_policy::table,
+            clickhouse_policy::version,
+            pagparams,
+        )
+        .select(DbClickhousePolicy::as_select())
+        .get_results_async(&*self.pool_connection_authorized(opctx).await?)
+        .await
+        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(policies.into_iter().map(ClickhousePolicy::from).collect())
+    }
+
+    /// Return the clickhouse policy with the highest version
+    pub async fn clickhouse_policy_get_latest(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<Option<ClickhousePolicy>, Error> {
+        opctx.authorize(authz::Action::Read, &authz::BLUEPRINT_CONFIG).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        use db::schema::clickhouse_policy::dsl;
+
+        let latest_policy = dsl::clickhouse_policy
+            .order_by(dsl::version.desc())
+            .first_async::<DbClickhousePolicy>(&*conn)
+            .await
+            .optional()
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(latest_policy.map(Into::into))
+    }
+
+    /// Insert the current version of the policy in the database
+    ///
+    /// Only succeeds if the prior version is the latest version currently
+    /// in the `clickhouse_policy` table. If there are no versions currently
+    /// in the table, then the current policy must be at version 1.
+    pub async fn clickhouse_policy_insert_latest_version(
+        &self,
+        opctx: &OpContext,
+        policy: &ClickhousePolicy,
+    ) -> Result<(), Error> {
+        if policy.version < 1 {
+            return Err(Error::invalid_request(
+                "policy version must be greater than 0",
+            ));
+        }
+        opctx
+            .authorize(authz::Action::Modify, &authz::BLUEPRINT_CONFIG)
+            .await?;
+
+        let num_inserted = if policy.version == 1 {
+            self.clickhouse_policy_insert_first_policy(opctx, &policy).await?
+        } else {
+            self.clickhouse_policy_insert_next_policy(opctx, &policy).await?
+        };
+
+        match num_inserted {
+            0 => Err(Error::invalid_request(format!(
+                "policy version {} is not the most recent",
+                policy.version
+            ))),
+            1 => Ok(()),
+            // This is impossible because we are explicitly inserting only one
+            // row with a unique primary key.
+            _ => unreachable!("query inserted more than one row"),
+        }
+    }
+
+    /// Insert the next version of the policy in the database
+    ///
+    /// Only succeeds if the prior version is the latest version currently
+    /// in the `clickhouse_policy` table.
+    ///
+    /// Panics if `policy.version <= 1`.
+    async fn clickhouse_policy_insert_next_policy(
+        &self,
+        opctx: &OpContext,
+        policy: &ClickhousePolicy,
+    ) -> Result<usize, Error> {
+        assert!(policy.version > 1);
+        let prev_version = policy.version - 1;
+
+        sql_query(
+            r"INSERT INTO clickhouse_policy
+                 (version, clickhouse_mode, clickhouse_cluster_target_servers,
+                  clickhouse_cluster_target_keepers, time_created)
+              SELECT $1, $2, $3, $4, $5
+              FROM clickhouse_policy WHERE version = $6 AND version IN
+                 (SELECT version FROM clickhouse_policy
+                  ORDER BY version DESC LIMIT 1)",
+        )
+        .bind::<sql_types::BigInt, SqlU32>(policy.version.into())
+        .bind::<ClickhouseModeEnum, DbClickhouseMode>((&policy.mode).into())
+        .bind::<sql_types::SmallInt, SqlU8>(policy.mode.target_servers().into())
+        .bind::<sql_types::SmallInt, SqlU8>(policy.mode.target_keepers().into())
+        .bind::<sql_types::Timestamptz, _>(policy.time_created)
+        .bind::<sql_types::BigInt, SqlU32>(prev_version.into())
+        .execute_async(&*self.pool_connection_authorized(opctx).await?)
+        .await
+        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Insert the first clickhouse policy in the database at version 1.
+    ///
+    /// Only insert this policy if no other policy exists yet.
+    ///
+    /// Return the number of inserted rows or an error.
+    async fn clickhouse_policy_insert_first_policy(
+        &self,
+        opctx: &OpContext,
+        policy: &ClickhousePolicy,
+    ) -> Result<usize, Error> {
+        sql_query(
+            r"INSERT INTO clickhouse_policy
+                 (version, clickhouse_mode, clickhouse_cluster_target_servers,
+                  clickhouse_cluster_target_keepers, time_created)
+              SELECT $1, $2, $3, $4, $5
+              WHERE NOT EXISTS (SELECT * FROM clickhouse_policy)",
+        )
+        .bind::<sql_types::BigInt, SqlU32>(policy.version.into())
+        .bind::<ClickhouseModeEnum, DbClickhouseMode>((&policy.mode).into())
+        .bind::<sql_types::SmallInt, SqlU8>(policy.mode.target_servers().into())
+        .bind::<sql_types::SmallInt, SqlU8>(policy.mode.target_keepers().into())
+        .bind::<sql_types::Timestamptz, _>(policy.time_created)
+        .execute_async(&*self.pool_connection_authorized(opctx).await?)
+        .await
+        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::db::datastore::test_utils::datastore_test;
+    use nexus_inventory::now_db_precision;
+    use nexus_test_utils::db::test_setup_database;
+    use nexus_types::deployment::ClickhouseMode;
+    use omicron_test_utils::dev;
+
+    #[tokio::test]
+    async fn test_clickhouse_policy_basic() {
+        // Setup
+        let logctx = dev::test_setup_log("test_clickhouse_policy_basic");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        // Listing an empty table should return an empty vec
+
+        assert!(datastore
+            .clickhouse_policy_list(&opctx, &DataPageParams::max_page())
+            .await
+            .unwrap()
+            .is_empty());
+
+        // Fail to insert a policy with version 0
+        let mut policy = ClickhousePolicy {
+            version: 0,
+            mode: ClickhouseMode::SingleNodeOnly,
+            time_created: now_db_precision(),
+        };
+
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .unwrap_err()
+            .to_string()
+            .contains("policy version must be greater than 0"));
+
+        // Inserting version 2 before version 1 should not work
+        policy.version = 2;
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .unwrap_err()
+            .to_string()
+            .contains("policy version 2 is not the most recent"));
+
+        // Inserting version 1 should work
+        policy.version = 1;
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .is_ok());
+
+        // Inserting version 2 should work
+        policy.version = 2;
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .is_ok());
+
+        // Inserting version 4 should not work, since the prior version is 2
+        policy.version = 4;
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .unwrap_err()
+            .to_string()
+            .contains("policy version 4 is not the most recent"));
+
+        // Inserting version 3 should work
+        policy.version = 3;
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .is_ok());
+
+        // Inserting version 4 should work
+        policy.version = 4;
+        policy.mode =
+            ClickhouseMode::Both { target_servers: 3, target_keepers: 5 };
+        assert!(datastore
+            .clickhouse_policy_insert_latest_version(&opctx, &policy)
+            .await
+            .is_ok());
+
+        let history = datastore
+            .clickhouse_policy_list(&opctx, &DataPageParams::max_page())
+            .await
+            .unwrap();
+
+        for i in 1..=4 {
+            let policy = &history[i - 1];
+            assert_eq!(policy.version, i as u32);
+            if i != 4 {
+                assert!(matches!(policy.mode, ClickhouseMode::SingleNodeOnly));
+                assert_eq!(policy.mode.target_servers(), 0);
+                assert_eq!(policy.mode.target_keepers(), 0);
+            } else {
+                assert!(matches!(policy.mode, ClickhouseMode::Both { .. }));
+                assert_eq!(policy.mode.target_servers(), 3);
+                assert_eq!(policy.mode.target_keepers(), 5);
+            }
+        }
+
+        // Clean up.
+        db.cleanup().await.unwrap();
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index 258e43f18c..e724d3d9af 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -55,6 +55,7 @@ mod bfd;
 mod bgp;
 mod bootstore;
 mod certificate;
+mod clickhouse_policy;
 mod cockroachdb_node_id;
 mod cockroachdb_settings;
 mod console_session;
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index 6a40dc1da2..8798836f47 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -1366,6 +1366,7 @@ mod test {
             target_cockroachdb_cluster_version:
                 CockroachDbClusterVersion::POLICY,
             target_crucible_pantry_zone_count: CRUCIBLE_PANTRY_REDUNDANCY,
+            clickhouse_policy: None,
             log,
         }
         .build()
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
index 3302768a56..8a9f49823d 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
@@ -182,6 +182,9 @@ impl fmt::Display for Operation {
             ZoneExpungeReason::ClickhouseClusterDisabled => {
                 "clickhouse cluster disabled via policy"
             }
+            ZoneExpungeReason::ClickhouseSingleNodeDisabled => {
+                "clickhouse single-node disabled via policy"
+            }
         };
         write!(
             f,
@@ -562,13 +565,6 @@ impl<'a> BlueprintBuilder<'a> {
             "sled_id" => sled_id.to_string(),
         ));
 
-        // If there are any `ClickhouseServer` or `ClickhouseKeeper` zones that
-        // are not expunged and we no longer have a `ClickhousePolicy` which
-        // indicates replicated clickhouse clusters should be running, we need
-        // to expunge all such zones.
-        let clickhouse_cluster_enabled =
-            self.input.clickhouse_cluster_enabled();
-
         // Do any zones need to be marked expunged?
         let mut zones_to_expunge = BTreeMap::new();
 
@@ -580,11 +576,9 @@ impl<'a> BlueprintBuilder<'a> {
                 "zone_id" => zone_id.to_string()
             ));
 
-            let Some(reason) = zone_needs_expungement(
-                sled_details,
-                zone_config,
-                clickhouse_cluster_enabled,
-            ) else {
+            let Some(reason) =
+                zone_needs_expungement(sled_details, zone_config, &self.input)
+            else {
                 continue;
             };
 
@@ -630,6 +624,13 @@ impl<'a> BlueprintBuilder<'a> {
                         expunging related zone"
                     );
                 }
+                ZoneExpungeReason::ClickhouseSingleNodeDisabled => {
+                    info!(
+                        &log,
+                        "clickhouse single-node disabled via policy, \
+                         expunging related zone"
+                    );
+                }
             }
 
             zones_to_expunge.insert(zone_id, reason);
@@ -661,6 +662,7 @@ impl<'a> BlueprintBuilder<'a> {
         let mut count_sled_decommissioned = 0;
         let mut count_sled_expunged = 0;
         let mut count_clickhouse_cluster_disabled = 0;
+        let mut count_clickhouse_single_node_disabled = 0;
        for reason in zones_to_expunge.values() {
             match reason {
                 ZoneExpungeReason::DiskExpunged => count_disk_expunged += 1,
@@ -671,6 +673,9 @@ impl<'a> BlueprintBuilder<'a> {
                 ZoneExpungeReason::ClickhouseClusterDisabled => {
                     count_clickhouse_cluster_disabled += 1
                 }
+                ZoneExpungeReason::ClickhouseSingleNodeDisabled => {
+                    count_clickhouse_single_node_disabled += 1
+                }
             };
         }
         let count_and_reason = [
@@ -681,6 +686,10 @@ impl<'a> BlueprintBuilder<'a> {
                 count_clickhouse_cluster_disabled,
                 ZoneExpungeReason::ClickhouseClusterDisabled,
             ),
+            (
+                count_clickhouse_single_node_disabled,
+                ZoneExpungeReason::ClickhouseSingleNodeDisabled,
+            ),
         ];
         for (count, reason) in count_and_reason {
             if count > 0 {
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index 145be867c6..c5ab26419a 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -749,7 +749,7 @@ fn sled_needs_all_zones_expunged(
 pub(crate) fn zone_needs_expungement(
     sled_details: &SledDetails,
     zone_config: &BlueprintZoneConfig,
-    clickhouse_cluster_enabled: bool,
+    input: &PlanningInput,
 ) -> Option<ZoneExpungeReason> {
     // Should we expunge the zone because the sled is gone?
     if let Some(reason) =
@@ -776,7 +776,7 @@ pub(crate) fn zone_needs_expungement(
 
     // Should we expunge the zone because clickhouse clusters are no longer
     // enabled via policy?
-    if !clickhouse_cluster_enabled {
+    if !input.clickhouse_cluster_enabled() {
         if zone_config.zone_type.is_clickhouse_keeper()
             || zone_config.zone_type.is_clickhouse_server()
         {
@@ -784,6 +784,14 @@ pub(crate) fn zone_needs_expungement(
         }
     }
 
+    // Should we expunge the zone because single-node clickhouse is no longer
+    // enabled via policy?
+    if !input.clickhouse_single_node_enabled() {
+        if zone_config.zone_type.is_clickhouse() {
+            return Some(ZoneExpungeReason::ClickhouseSingleNodeDisabled);
+        }
+    }
+
     None
 }
 
@@ -797,6 +805,7 @@ pub(crate) enum ZoneExpungeReason {
     SledDecommissioned,
     SledExpunged,
     ClickhouseClusterDisabled,
+    ClickhouseSingleNodeDisabled,
 }
 
 #[cfg(test)]
@@ -821,6 +830,7 @@ mod test {
     use nexus_types::deployment::BlueprintZoneDisposition;
     use nexus_types::deployment::BlueprintZoneFilter;
     use nexus_types::deployment::BlueprintZoneType;
+    use nexus_types::deployment::ClickhouseMode;
     use nexus_types::deployment::ClickhousePolicy;
     use nexus_types::deployment::CockroachDbClusterVersion;
     use nexus_types::deployment::CockroachDbPreserveDowngrade;
@@ -843,6 +853,12 @@ mod test {
     use std::net::IpAddr;
     use typed_rng::TypedUuidRng;
 
+    // Generate a ClickhousePolicy ignoring fields we don't care about for
+    // planner tests
+    fn clickhouse_policy(mode: ClickhouseMode) -> ClickhousePolicy {
+        ClickhousePolicy { version: 0, mode, time_created: Utc::now() }
+    }
+
     /// Runs through a basic sequence of blueprints for adding a sled
     #[test]
     fn test_basic_add_sled() {
@@ -2622,11 +2638,11 @@
 
         // Enable clickhouse clusters via policy
         let mut input_builder = input.into_builder();
-        input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy {
-            deploy_with_standalone: true,
-            target_servers,
-            target_keepers,
-        });
+        input_builder.policy_mut().clickhouse_policy =
+            Some(clickhouse_policy(ClickhouseMode::Both {
+                target_servers,
+                target_keepers,
+            }));
 
         // Creating a new blueprint should deploy all the new clickhouse zones
         let input = input_builder.build();
@@ -2659,8 +2675,8 @@
             .map(|z| z.id)
             .collect();
 
-        assert_eq!(keeper_zone_ids.len(), target_keepers);
-        assert_eq!(server_zone_ids.len(), target_servers);
+        assert_eq!(keeper_zone_ids.len(), target_keepers as usize);
+        assert_eq!(server_zone_ids.len(), target_servers as usize);
 
         // We should be attempting to allocate all servers and keepers since
         // this the initial configuration
@@ -2670,14 +2686,20 @@
         assert_eq!(clickhouse_cluster_config.generation, 2.into());
         assert_eq!(
             clickhouse_cluster_config.max_used_keeper_id,
-            (target_keepers as u64).into()
+            (u64::from(target_keepers)).into()
         );
         assert_eq!(
             clickhouse_cluster_config.max_used_server_id,
-            (target_servers as u64).into()
+            (u64::from(target_servers)).into()
+        );
+        assert_eq!(
+            clickhouse_cluster_config.keepers.len(),
+            target_keepers as usize
+        );
+        assert_eq!(
+            clickhouse_cluster_config.servers.len(),
+            target_servers as usize
         );
-        assert_eq!(clickhouse_cluster_config.keepers.len(), target_keepers);
-        assert_eq!(clickhouse_cluster_config.servers.len(), target_servers);
 
         // Ensure that the added keepers are in server zones
         for zone_id in clickhouse_cluster_config.keepers.keys() {
@@ -2769,11 +2791,11 @@
         // Enable clickhouse clusters via policy
         let target_keepers = 5;
         let mut input_builder = input.into_builder();
-        input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy {
-            deploy_with_standalone: true,
-            target_servers,
-            target_keepers,
-        });
+        input_builder.policy_mut().clickhouse_policy =
+            Some(clickhouse_policy(ClickhouseMode::Both {
+                target_servers,
+                target_keepers,
+            }));
         let input = input_builder.build();
         let blueprint5 = Planner::new_based_on(
             log.clone(),
@@ -2799,7 +2821,7 @@
             .collect();
 
         // We should have allocated 2 new keeper zones
-        assert_eq!(new_keeper_zone_ids.len(), target_keepers);
+        assert_eq!(new_keeper_zone_ids.len(), target_keepers as usize);
         assert!(keeper_zone_ids.is_subset(&new_keeper_zone_ids));
 
         // We should be trying to provision one new keeper for a keeper zone
@@ -2869,7 +2891,7 @@
             bp7_config.keepers.len(),
             bp7_config.max_used_keeper_id.0 as usize
         );
-        assert_eq!(bp7_config.keepers.len(), target_keepers);
+        assert_eq!(bp7_config.keepers.len(), target_keepers as usize);
         assert_eq!(
             bp7_config.highest_seen_keeper_leader_committed_log_index,
             2
@@ -2909,7 +2931,7 @@
             bp7_config.max_used_keeper_id
         );
         assert_eq!(bp8_config.keepers, bp7_config.keepers);
-        assert_eq!(bp7_config.keepers.len(), target_keepers);
+        assert_eq!(bp7_config.keepers.len(), target_keepers as usize);
         assert_eq!(
             bp8_config.highest_seen_keeper_leader_committed_log_index,
             3
@@ -2935,11 +2957,11 @@
 
         // Enable clickhouse clusters via policy
         let mut input_builder = input.into_builder();
-        input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy {
-            deploy_with_standalone: true,
-            target_servers,
-            target_keepers,
-        });
+        input_builder.policy_mut().clickhouse_policy =
+            Some(clickhouse_policy(ClickhouseMode::Both {
+                target_servers,
+                target_keepers,
+            }));
         let input = input_builder.build();
 
         // Create a new blueprint to deploy all our clickhouse zones
@@ -2972,13 +2994,13 @@
             .map(|z| z.id)
             .collect();
 
-        assert_eq!(keeper_zone_ids.len(), target_keepers);
-        assert_eq!(server_zone_ids.len(), target_servers);
+        assert_eq!(keeper_zone_ids.len(), target_keepers as usize);
+        assert_eq!(server_zone_ids.len(), target_servers as usize);
 
         // Directly manipulate the blueprint and inventory so that the
         // clickhouse clusters are stable
         let config = blueprint2.clickhouse_cluster_config.as_mut().unwrap();
-        config.max_used_keeper_id = (target_keepers as u64).into();
+        config.max_used_keeper_id = (u64::from(target_keepers)).into();
         config.keepers = keeper_zone_ids
             .iter()
             .enumerate()
@@ -3118,7 +3140,7 @@
         // We've only added one keeper from our desired state
         // This brings us back up to our target count
         assert_eq!(config.keepers.len(), old_config.keepers.len() + 1);
-        assert_eq!(config.keepers.len(), target_keepers);
+        assert_eq!(config.keepers.len(), target_keepers as usize);
         // We've allocated one new keeper
         assert_eq!(
             config.max_used_keeper_id,
@@ -3129,8 +3151,9 @@
     }
 
     #[test]
-    fn test_expunge_all_clickhouse_cluster_zones_after_policy_is_disabled() {
-        static TEST_NAME: &str = "planner_expunge_all_clickhouse_cluster_zones_after_policy_is_disabled";
+    fn test_expunge_clickhouse_zones_after_policy_is_changed() {
+        static TEST_NAME: &str =
+            "planner_expunge_clickhouse_zones_after_policy_is_changed";
         let logctx = test_setup_log(TEST_NAME);
         let log = logctx.log.clone();
 
@@ -3142,11 +3165,11 @@
 
         // Enable clickhouse clusters via policy
         let mut input_builder = input.into_builder();
-        input_builder.policy_mut().clickhouse_policy = Some(ClickhousePolicy {
-            deploy_with_standalone: true,
-            target_servers,
-            target_keepers,
-        });
+        input_builder.policy_mut().clickhouse_policy =
+            Some(clickhouse_policy(ClickhouseMode::Both {
+                target_servers,
+                target_keepers,
+            }));
         let input = input_builder.build();
 
         // Create a new blueprint to deploy all our clickhouse zones
@@ -3179,15 +3202,24 @@
             .map(|z| z.id)
             .collect();
 
-        assert_eq!(keeper_zone_ids.len(), target_keepers);
-        assert_eq!(server_zone_ids.len(), target_servers);
+        assert_eq!(keeper_zone_ids.len(), target_keepers as usize);
+        assert_eq!(server_zone_ids.len(), target_servers as usize);
 
-        // Disable clickhouse clusters via policy
+        // Disable clickhouse single node via policy, and ensure the zone goes
+        // away. First ensure we have one.
+        assert_eq!(
+            1,
+            active_zones.iter().filter(|z| z.zone_type.is_clickhouse()).count()
+        );
+
         let mut input_builder = input.into_builder();
-        input_builder.policy_mut().clickhouse_policy = None;
+        input_builder.policy_mut().clickhouse_policy =
+            Some(clickhouse_policy(ClickhouseMode::ClusterOnly {
+                target_servers,
+                target_keepers,
+            }));
         let input = input_builder.build();
 
-        // Create a new blueprint with the disabled policy
+        // Create a new blueprint with `ClickhouseMode::ClusterOnly`
         let blueprint3 = Planner::new_based_on(
             log.clone(),
             &blueprint2,
@@ -3200,9 +3232,37 @@
         .plan()
         .expect("plan");
 
+        // We should have expunged our single-node clickhouse zone
+        let expunged_zones: Vec<_> = blueprint3
+            .all_omicron_zones(BlueprintZoneFilter::Expunged)
+            .map(|(_, z)| z.clone())
+            .collect();
+
+        assert_eq!(1, expunged_zones.len());
+        assert!(expunged_zones.first().unwrap().zone_type.is_clickhouse());
+
+        // Disable clickhouse clusters via policy and restart single node
+        let mut input_builder = input.into_builder();
+        input_builder.policy_mut().clickhouse_policy =
+            Some(clickhouse_policy(ClickhouseMode::SingleNodeOnly));
+        let input = input_builder.build();
+
+        // Create a new blueprint for `ClickhouseMode::SingleNodeOnly`
+        let blueprint4 = Planner::new_based_on(
+            log.clone(),
+            &blueprint3,
+            &input,
+            "test_blueprint4",
+            &collection,
+        )
+        .expect("created planner")
+        .with_rng_seed((TEST_NAME, "bp4"))
+        .plan()
+        .expect("plan");
+
         // All our clickhouse keeper and server zones that we created when we
         // enabled our clickhouse policy should be expunged when we disable it.
-        let expunged_zones: Vec<_> = blueprint3
+        let expunged_zones: Vec<_> = blueprint4
             .all_omicron_zones(BlueprintZoneFilter::Expunged)
             .map(|(_, z)| z.clone())
             .collect();
@@ -3221,6 +3281,15 @@
         assert_eq!(keeper_zone_ids, expunged_keeper_zone_ids);
         assert_eq!(server_zone_ids, expunged_server_zone_ids);
 
+        // We should have a new single-node clickhouse zone
+        assert_eq!(
+            1,
+            blueprint4
+                .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning)
+                .filter(|(_, z)| z.zone_type.is_clickhouse())
+                .count()
+        );
+
         logctx.cleanup_successful();
     }
 }
diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs
index 9e14289e8a..24f32e9187 100644
--- a/nexus/reconfigurator/preparation/src/lib.rs
+++ b/nexus/reconfigurator/preparation/src/lib.rs
@@ -16,6 +16,7 @@ use nexus_db_queries::db::pagination::Paginator;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::BlueprintMetadata;
+use nexus_types::deployment::ClickhousePolicy;
 use nexus_types::deployment::CockroachDbClusterVersion;
 use nexus_types::deployment::CockroachDbSettings;
 use nexus_types::deployment::OmicronZoneExternalIp;
@@ -75,6 +76,7 @@ pub struct PlanningInputFromDb<'a> {
     pub internal_dns_version: nexus_db_model::Generation,
     pub external_dns_version: nexus_db_model::Generation,
     pub cockroachdb_settings: &'a CockroachDbSettings,
+    pub clickhouse_policy: Option<ClickhousePolicy>,
     pub log: &'a Logger,
 }
 
@@ -138,6 +140,11 @@ impl PlanningInputFromDb<'_> {
             .await
             .internal_context("fetching cockroachdb settings")?;
 
+        let clickhouse_policy = datastore
+            .clickhouse_policy_get_latest(opctx)
+            .await
+            .internal_context("fetching clickhouse policy")?;
+
         let planning_input = PlanningInputFromDb {
             sled_rows: &sled_rows,
             zpool_rows: &zpool_rows,
@@ -156,6 +163,7 @@ impl PlanningInputFromDb<'_> {
             internal_dns_version,
             external_dns_version,
             cockroachdb_settings: &cockroachdb_settings,
+            clickhouse_policy,
         }
         .build()
         .internal_context("assembling planning_input")?;
@@ -177,7 +185,7 @@ impl PlanningInputFromDb<'_> {
                 .target_cockroachdb_cluster_version,
             target_crucible_pantry_zone_count: self
                 .target_crucible_pantry_zone_count,
-            clickhouse_policy: None,
+            clickhouse_policy: self.clickhouse_policy.clone(),
         };
         let mut builder = PlanningInputBuilder::new(
             policy,
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index eabe4e5a3b..74ef1dd878 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -59,6 +59,7 @@ pub use network_resources::OmicronZoneExternalSnatIp;
 pub use network_resources::OmicronZoneNetworkResources;
 pub use network_resources::OmicronZoneNic;
 pub use network_resources::OmicronZoneNicEntry;
+pub use planning_input::ClickhouseMode;
 pub use planning_input::ClickhousePolicy;
 pub use planning_input::CockroachDbClusterVersion;
 pub use planning_input::CockroachDbPreserveDowngrade;
diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs
index 5541df60e6..741cf3b539 100644
--- a/nexus/types/src/deployment/planning_input.rs
+++ b/nexus/types/src/deployment/planning_input.rs
@@ -14,6 +14,8 @@ use crate::external_api::views::PhysicalDiskState;
 use crate::external_api::views::SledPolicy;
 use crate::external_api::views::SledProvisionPolicy;
 use crate::external_api::views::SledState;
+use chrono::DateTime;
+use chrono::Utc;
 use clap::ValueEnum;
 use ipnetwork::IpNetwork;
 use omicron_common::address::IpRange;
@@ -123,14 +125,12 @@ impl PlanningInput {
     }
 
     pub fn target_clickhouse_zone_count(&self) -> usize {
-        if let Some(policy) = &self.policy.clickhouse_policy {
-            if policy.deploy_with_standalone {
-                SINGLE_NODE_CLICKHOUSE_REDUNDANCY
-            } else {
-                0
-            }
-        } else {
-            SINGLE_NODE_CLICKHOUSE_REDUNDANCY
+        match self.policy.clickhouse_policy.as_ref().map(|policy| &policy.mode)
+        {
+            Some(&ClickhouseMode::ClusterOnly { .. }) => 0,
+            Some(&ClickhouseMode::SingleNodeOnly)
+            | Some(&ClickhouseMode::Both { .. })
+            | None => SINGLE_NODE_CLICKHOUSE_REDUNDANCY,
         }
     }
 
@@ -138,7 +138,7 @@ impl PlanningInput {
         self.policy
             .clickhouse_policy
             .as_ref()
-            .map(|policy| policy.target_servers)
+            .map(|policy| usize::from(policy.mode.target_servers()))
             .unwrap_or(0)
     }
 
@@ -146,7 +146,7 @@ impl PlanningInput {
         self.policy
             .clickhouse_policy
             .as_ref()
-            .map(|policy| policy.target_keepers)
+            .map(|policy| usize::from(policy.mode.target_keepers()))
             .unwrap_or(0)
     }
 
@@ -155,7 +155,18 @@ impl PlanningInput {
     }
 
     pub fn clickhouse_cluster_enabled(&self) -> bool {
-        self.policy.clickhouse_policy.is_some()
+        let Some(clickhouse_policy) = &self.policy.clickhouse_policy else {
+            return false;
+        };
+        clickhouse_policy.mode.cluster_enabled()
+    }
+
+    pub fn clickhouse_single_node_enabled(&self) -> bool {
+        let Some(clickhouse_policy) = &self.policy.clickhouse_policy else {
+            // If there is no policy we assume single-node is enabled.
+            return true;
+        };
+        clickhouse_policy.mode.single_node_enabled()
     }
 
     pub fn all_sleds(
@@ -876,20 +887,58 @@ pub struct Policy {
     pub clickhouse_policy: Option<ClickhousePolicy>,
 }
 
-/// Policy for replicated clickhouse setups
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ClickhousePolicy {
-    /// Should we run the single-node cluster alongside the replicated cluster?
-    /// This is stage 1 of our deployment plan as laid out in RFD 468
-    ///
-    /// If this is set to false, then we will only deploy replicated clusters.
-    pub deploy_with_standalone: bool,
+    pub version: u32,
+    pub mode: ClickhouseMode,
+    pub time_created: DateTime<Utc>,
+}
 
-    /// Desired number of clickhouse servers
-    pub target_servers: usize,
+/// How to deploy clickhouse nodes
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum ClickhouseMode {
+    SingleNodeOnly,
+    ClusterOnly { target_servers: u8, target_keepers: u8 },
+    Both { target_servers: u8, target_keepers: u8 },
+}
 
-    /// Desired number of clickhouse keepers
-    pub target_keepers: usize,
+impl ClickhouseMode {
+    pub fn cluster_enabled(&self) -> bool {
+        match self {
+            ClickhouseMode::SingleNodeOnly => false,
+            ClickhouseMode::ClusterOnly { .. }
+            | ClickhouseMode::Both { .. } => true,
+        }
+    }
+
+    pub fn single_node_enabled(&self) -> bool {
+        match self {
+            ClickhouseMode::ClusterOnly { .. } => false,
+            ClickhouseMode::SingleNodeOnly | ClickhouseMode::Both { .. } => {
+                true
+            }
+        }
+    }
+
+    pub fn target_servers(&self) -> u8 {
+        match self {
+            ClickhouseMode::SingleNodeOnly => 0,
+            ClickhouseMode::ClusterOnly { target_servers, .. } => {
+                *target_servers
+            }
+            ClickhouseMode::Both { target_servers, .. } => *target_servers,
+        }
+    }
+
+    pub fn target_keepers(&self) -> u8 {
+        match self {
+            ClickhouseMode::SingleNodeOnly => 0,
+            ClickhouseMode::ClusterOnly { target_keepers, .. } => {
+                *target_keepers
+            }
+            ClickhouseMode::Both { target_keepers, .. } => *target_keepers,
+        }
+    }
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/schema/crdb/clickhouse-policy/up1.sql b/schema/crdb/clickhouse-policy/up1.sql
new file mode 100644
index 0000000000..506e329b21
--- /dev/null
+++ b/schema/crdb/clickhouse-policy/up1.sql
@@ -0,0 +1,5 @@
+CREATE TYPE IF NOT EXISTS omicron.public.clickhouse_mode AS ENUM (
+    'single_node_only',
+    'cluster_only',
+    'both'
+);
diff --git a/schema/crdb/clickhouse-policy/up2.sql b/schema/crdb/clickhouse-policy/up2.sql
new file mode 100644
index 0000000000..52c950a570
--- /dev/null
+++ b/schema/crdb/clickhouse-policy/up2.sql
@@ -0,0 +1,7 @@
+CREATE TABLE IF NOT EXISTS omicron.public.clickhouse_policy (
+    version INT8 PRIMARY KEY,
+    clickhouse_mode omicron.public.clickhouse_mode NOT NULL,
+    clickhouse_cluster_target_servers INT2 NOT NULL,
+    clickhouse_cluster_target_keepers INT2 NOT NULL,
+    time_created TIMESTAMPTZ NOT NULL
+);
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index f78e034721..3a2f2a3a81 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -42,6 +42,46 @@ ALTER DEFAULT PRIVILEGES GRANT INSERT, SELECT, UPDATE, DELETE ON TABLES to omicr
  */
 ALTER RANGE default CONFIGURE ZONE USING num_replicas = 5;
 
+
+/*
+ * The deployment strategy for clickhouse
+ */
+CREATE TYPE IF NOT EXISTS omicron.public.clickhouse_mode AS ENUM (
+    -- Only deploy a single node clickhouse
+    'single_node_only',
+
+    -- Only deploy a clickhouse cluster without any single node deployments
+    'cluster_only',
+
+    -- Deploy both a single node and cluster deployment.
+    -- This is the strategy for stage 1 described in RFD 468
+    'both'
+);
+
+/*
+ * A planning policy for clickhouse for a single multirack setup
+ *
+ * We currently implicitly tie this policy to a rack, as we don't yet support
+ * multirack. Multiple parts of this database schema are going to have to change
+ * to support multirack, so we add one more for now.
+ */
+CREATE TABLE IF NOT EXISTS omicron.public.clickhouse_policy (
+    -- Monotonically increasing version for all policies
+    --
+    -- This is similar to `bp_target` which will also require being changed for
+    -- multirack to associate with some sort of rack group ID.
+    version INT8 PRIMARY KEY,
+
+    clickhouse_mode omicron.public.clickhouse_mode NOT NULL,
+
+    -- Only greater than 0 when clickhouse cluster is enabled
+    clickhouse_cluster_target_servers INT2 NOT NULL,
+    -- Only greater than 0 when clickhouse cluster is enabled
+    clickhouse_cluster_target_keepers INT2 NOT NULL,
+
+    time_created TIMESTAMPTZ NOT NULL
+);
+
 /*
  * Racks
  */
@@ -4498,7 +4538,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '109.0.0', NULL)
+    (TRUE, NOW(), NOW(), '110.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
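
For context on how callers are expected to drive the new datastore API, here is a minimal usage sketch (not part of the patch): read the latest policy, bump the version by one, and write the successor. The `clickhouse_policy_get_latest` / `clickhouse_policy_insert_latest_version` methods, `ClickhousePolicy`, and `ClickhouseMode` come from this change; the function name `enable_clickhouse_cluster`, the target counts, and the surrounding context are illustrative assumptions, and the crate paths mirror the imports used elsewhere in the diff.

```rust
use chrono::Utc;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::DataStore;
use nexus_types::deployment::{ClickhouseMode, ClickhousePolicy};
use omicron_common::api::external::Error;

// Hypothetical caller (e.g. a Nexus task reacting to an operator request).
async fn enable_clickhouse_cluster(
    opctx: &OpContext,
    datastore: &DataStore,
) -> Result<(), Error> {
    // Fetch the highest-versioned policy, if any.
    let latest = datastore.clickhouse_policy_get_latest(opctx).await?;

    // The successor must be exactly latest.version + 1 (or 1 when the table
    // is empty). Otherwise the conditional INSERT matches no rows and the
    // call fails with "policy version N is not the most recent", which is how
    // concurrent writers are serialized.
    let next = ClickhousePolicy {
        version: latest.as_ref().map(|p| p.version + 1).unwrap_or(1),
        mode: ClickhouseMode::Both { target_servers: 3, target_keepers: 5 },
        // The datastore tests use now_db_precision() to match the precision
        // stored in CockroachDB; Utc::now() is fine for a sketch.
        time_created: Utc::now(),
    };

    datastore.clickhouse_policy_insert_latest_version(opctx, &next).await
}
```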
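As a quick reference for how the three modes drive the planner, the following test-style assertions follow directly from the `ClickhouseMode` helpers added in `planning_input.rs`; the concrete target counts are arbitrary.

```rust
use nexus_types::deployment::ClickhouseMode;

#[test]
fn clickhouse_mode_reference() {
    let single = ClickhouseMode::SingleNodeOnly;
    let cluster =
        ClickhouseMode::ClusterOnly { target_servers: 3, target_keepers: 5 };
    let both = ClickhouseMode::Both { target_servers: 3, target_keepers: 5 };

    // The single-node zone is kept unless the policy says ClusterOnly,
    // which is what triggers ZoneExpungeReason::ClickhouseSingleNodeDisabled.
    assert!(single.single_node_enabled());
    assert!(both.single_node_enabled());
    assert!(!cluster.single_node_enabled());

    // Replicated servers/keepers are only deployed for ClusterOnly and Both.
    assert!(!single.cluster_enabled());
    assert!(cluster.cluster_enabled());
    assert!(both.cluster_enabled());

    // Target counts are zero whenever the cluster side is not enabled.
    assert_eq!(single.target_servers(), 0);
    assert_eq!(cluster.target_keepers(), 5);
    assert_eq!(both.target_servers(), 3);
}
```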