From 843033045c194ad12c251ab26a9920a4f1d8862c Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone"
Date: Thu, 22 Aug 2024 21:32:38 +0000
Subject: [PATCH] wip - more in the right direction

---
 .../planning/src/blueprint_builder/builder.rs | 110 ++++++++++++++++--
 nexus/reconfigurator/planning/src/planner.rs  |   1 +
 nexus/types/src/deployment.rs                 |   1 +
 nexus/types/src/deployment/clickhouse.rs      |  92 +++++++++++++--
 nexus/types/src/deployment/zone_type.rs       |  12 +-
 nexus/types/src/inventory.rs                  |  65 ++++++-----
 6 files changed, 227 insertions(+), 54 deletions(-)

diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
index 6d5c0a3f05..e3d242876f 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
@@ -24,6 +24,7 @@ use nexus_types::deployment::BlueprintZoneFilter;
 use nexus_types::deployment::BlueprintZoneType;
 use nexus_types::deployment::BlueprintZonesConfig;
 use nexus_types::deployment::ClickhouseClusterConfig;
+use nexus_types::deployment::ClickhouseIdAllocator;
 use nexus_types::deployment::CockroachDbPreserveDowngrade;
 use nexus_types::deployment::DiskFilter;
 use nexus_types::deployment::OmicronZoneExternalFloatingIp;
@@ -35,6 +36,7 @@ use nexus_types::deployment::SledResources;
 use nexus_types::deployment::ZpoolFilter;
 use nexus_types::deployment::ZpoolName;
 use nexus_types::external_api::views::SledState;
+use nexus_types::inventory::Collection;
 use omicron_common::address::get_internal_dns_server_addresses;
 use omicron_common::address::get_sled_address;
 use omicron_common::address::get_switch_zone_address;
@@ -206,6 +208,7 @@ pub struct BlueprintBuilder<'a> {
     disks: BlueprintDisksBuilder<'a>,
     sled_state: BTreeMap<SledUuid, SledState>,
     cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade,
+    clickhouse_id_allocator: ClickhouseIdAllocator,
 
     creator: String,
     operations: Vec<Operation>,
@@ -263,7 +266,7 @@ impl<'a> BlueprintBuilder<'a> {
             .collect();
         let cluster_name = format!("cluster-{}", Uuid::new_v4());
         let clickhouse_cluster_config =
-            ClickhouseClusterConfig::new(cluster_name, &blueprint_zones);
+            ClickhouseClusterConfig::new(cluster_name);
         Blueprint {
             id: rng.blueprint_rng.next(),
             blueprint_zones,
@@ -334,6 +337,9 @@ impl<'a> BlueprintBuilder<'a> {
             sled_state,
             cockroachdb_setting_preserve_downgrade: parent_blueprint
                 .cockroachdb_setting_preserve_downgrade,
+            clickhouse_id_allocator: ClickhouseIdAllocator::from(
+                &parent_blueprint.clickhouse_cluster_config,
+            ),
             creator: creator.to_owned(),
             operations: Vec::new(),
             comments: Vec::new(),
@@ -367,11 +373,6 @@ impl<'a> BlueprintBuilder<'a> {
             .disks
             .into_disks_map(self.input.all_sled_ids(SledFilter::InService));
 
-        let clickhouse_cluster_config = ClickhouseClusterConfig::new_based_on(
-            &self.log,
-            &self.parent_blueprint.clickhouse_cluster_config,
-            &blueprint_zones,
-        );
         Blueprint {
             id: self.rng.blueprint_rng.next(),
             blueprint_zones,
@@ -387,7 +388,8 @@ impl<'a> BlueprintBuilder<'a> {
                 .clone(),
            cockroachdb_setting_preserve_downgrade: self
                .cockroachdb_setting_preserve_downgrade,
-            clickhouse_cluster_config,
+            // TODO: This will change when generations and IDs get bumped
+            clickhouse_cluster_config: self.parent_blueprint.clickhouse_cluster_config.clone(),
             time_created: now_db_precision(),
             creator: self.creator,
             comment: self
@@ -971,6 +973,7 @@ impl<'a> BlueprintBuilder<'a> {
             let address = SocketAddrV6::new(underlay_ip, port, 0, 0);
             let zone_type = BlueprintZoneType::ClickhouseServer(
                 blueprint_zone_type::ClickhouseServer {
+                    server_id: self.clickhouse_id_allocator.next_server_id(),
                     address,
                     dataset: OmicronZoneDataset {
                         pool_name: pool_name.clone(),
@@ -999,6 +1002,7 @@ impl<'a> BlueprintBuilder<'a> {
         &mut self,
         sled_id: SledUuid,
         desired_zone_count: usize,
+        inventory: &Collection,
     ) -> Result<EnsureMultiple, Error> {
         // How many clickhouse keeper zones do we want to add?
         let clickhouse_keeper_count = self.sled_num_running_zones_of_kind(
@@ -1017,7 +1021,19 @@ impl<'a> BlueprintBuilder<'a> {
                 )));
             }
         };
-        for _ in 0..num_clickhouse_keepers_to_add {
+
+        // Clickhouse keeper clusters only allow adding or removing one node at
+        // a time. We must make sure that a keeper node is not currently being
+        // added or removed before we add a new one.
+        //
+        // Therefore, we only ever add 0 or 1 keeper zones here, even if
+        // `num_clickhouse_keepers_to_add` is greater than 1.
+        if num_clickhouse_keepers_to_add > 0 {
+            if self.is_active_clickhouse_keeper_reconfiguration(inventory) {
+                // We are currently trying to add or remove a node and cannot
+                // perform a concurrent addition
+                return Ok(EnsureMultiple::NotNeeded);
+            }
             let zone_id = self.rng.zone_rng.next();
             let underlay_ip = self.sled_alloc_ip(sled_id)?;
             let pool_name =
@@ -1026,6 +1042,7 @@ impl<'a> BlueprintBuilder<'a> {
             let address = SocketAddrV6::new(underlay_ip, port, 0, 0);
             let zone_type = BlueprintZoneType::ClickhouseKeeper(
                 blueprint_zone_type::ClickhouseKeeper {
+                    keeper_id: self.clickhouse_id_allocator.next_keeper_id(),
                     address,
                     dataset: OmicronZoneDataset {
                         pool_name: pool_name.clone(),
@@ -1044,10 +1061,7 @@ impl<'a> BlueprintBuilder<'a> {
             self.sled_add_zone(sled_id, zone)?;
         }
 
-        Ok(EnsureMultiple::Changed {
-            added: num_clickhouse_keepers_to_add,
-            removed: 0,
-        })
+        Ok(EnsureMultiple::Changed { added: 1, removed: 0 })
     }
 
     pub fn sled_promote_internal_ntp_to_boundary_ntp(
@@ -1311,6 +1325,78 @@ impl<'a> BlueprintBuilder<'a> {
             ))
         })
     }
+
+    /// Is a keeper node being added or removed from the keeper configuration already?
+    fn is_active_clickhouse_keeper_reconfiguration(
+        &self,
+        inventory: &Collection,
+    ) -> bool {
+        // Has this builder already made a modification?
+        if self
+            .parent_blueprint
+            .clickhouse_cluster_config
+            .has_configuration_changed(&self.clickhouse_id_allocator)
+        {
+            return true;
+        }
+
+        // Get the latest clickhouse cluster membership
+        let Some((keeper_zone_id, membership)) =
+            inventory.latest_clickhouse_keeper_membership()
+        else {
+            // We can't retrieve the latest membership and so we assume
+            // that a reconfiguration is underway for safety reasons. In
+            // any case, if we can't talk to any `clickhouse-admin` servers
+            // in the `ClickhouseKeeper` zones then we cannot attempt a
+            // reconfiguration.
+            return true;
+        };
+
+        // Ensure all in service keeper zones are part of the raft configuration
+        if !self
+            .parent_blueprint
+            .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning)
+            .all(|(_, bp_zone_config)| {
+                if let BlueprintZoneType::ClickhouseKeeper(
+                    blueprint_zone_type::ClickhouseKeeper { keeper_id, .. },
+                ) = bp_zone_config.zone_type
+                {
+                    // Check the inventory to see if a keeper that should be
+                    // running is not part of the raft configuration.
+                    membership.raft_config.contains(&keeper_id)
+                } else { true }
+            })
+        {
+            // We are still attempting to add this node
+            //
+            // TODO: We need a way to detect if the attempted addition actually
+            // failed here. This will involve querying the keeper cluster via
+            // inventory in case the configuration doesn't change for a while.
+            // I don't know how to get that information out of the keeper yet,
+            // except by reading the log files.
+            return true;
+        }
+
+        // Ensure all expunged zones are no longer part of the configuration
+        if !self
+            .parent_blueprint
+            .all_omicron_zones(BlueprintZoneFilter::Expunged)
+            .all(|(_, bp_zone_config)| {
+                if let BlueprintZoneType::ClickhouseKeeper(
+                    blueprint_zone_type::ClickhouseKeeper { keeper_id, .. },
+                ) = bp_zone_config.zone_type
+                {
+                    !membership.raft_config.contains(&keeper_id)
+                } else { true }
+            })
+        {
+            // We are still attempting to remove keepers for expunged zones from the configuration
+            return true;
+        }
+
+        // There is no ongoing reconfiguration of the keeper raft cluster
+        false
+    }
 }
 
 #[derive(Debug)]
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index 04447aaf1c..a8743127e3 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -542,6 +542,7 @@ impl<'a> Planner<'a> {
                 self.blueprint.sled_ensure_zone_multiple_clickhouse_keeper(
                     sled_id,
                     new_total_zone_count,
+                    &self.inventory,
                 )?
             }
         };
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index de6d171aa0..d4f41a0b06 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -49,6 +49,7 @@ mod tri_map;
 mod zone_type;
 
 pub use clickhouse::ClickhouseClusterConfig;
+pub use clickhouse::ClickhouseIdAllocator;
 pub use network_resources::AddNetworkResourceError;
 pub use network_resources::OmicronZoneExternalFloatingAddr;
 pub use network_resources::OmicronZoneExternalFloatingIp;
diff --git a/nexus/types/src/deployment/clickhouse.rs b/nexus/types/src/deployment/clickhouse.rs
index 9e5a7f35f6..c37e96ee4e 100644
--- a/nexus/types/src/deployment/clickhouse.rs
+++ b/nexus/types/src/deployment/clickhouse.rs
@@ -27,6 +27,52 @@ use uuid::Uuid;
 
 const BASE_DIR: &str = "/opt/oxide/clickhouse";
 
+/// A mechanism used by the `BlueprintBuilder` to allocate clickhouse server
+/// and keeper ids
+#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+pub struct ClickhouseIdAllocator {
+    /// Clickhouse Server ids must be unique and are handed out monotonically. Keep track
+    /// of the last used one.
+    max_used_server_id: ServerId,
+    /// Clickhouse Keeper ids must be unique and are handed out monotonically. Keep track
+    /// of the last used one.
+    max_used_keeper_id: KeeperId,
+}
+
+impl ClickhouseIdAllocator {
+    pub fn new(
+        max_used_server_id: ServerId,
+        max_used_keeper_id: KeeperId,
+    ) -> ClickhouseIdAllocator {
+        ClickhouseIdAllocator { max_used_server_id, max_used_keeper_id }
+    }
+
+    pub fn next_server_id(&mut self) -> ServerId {
+        self.max_used_server_id += 1.into();
+        self.max_used_server_id
+    }
+    pub fn next_keeper_id(&mut self) -> KeeperId {
+        self.max_used_keeper_id += 1.into();
+        self.max_used_keeper_id
+    }
+
+    pub fn max_used_server_id(&self) -> ServerId {
+        self.max_used_server_id
+    }
+    pub fn max_used_keeper_id(&self) -> KeeperId {
+        self.max_used_keeper_id
+    }
+}
+
+impl From<&ClickhouseClusterConfig> for ClickhouseIdAllocator {
+    fn from(value: &ClickhouseClusterConfig) -> Self {
+        ClickhouseIdAllocator::new(
+            value.max_used_server_id,
+            value.max_used_keeper_id,
+        )
+    }
+}
+
 /// Global configuration for all clickhouse servers (replicas) and keepers
 #[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
 pub struct ClickhouseClusterConfig {
@@ -35,12 +81,12 @@ pub struct ClickhouseClusterConfig {
     /// This is used by `clickhouse-admin` in the clickhouse server and keeper
     /// zones to discard old configurations.
     pub generation: Generation,
-    /// CLickhouse Server ids must be unique and are handed out monotonically. Keep track
+    /// Clickhouse Server ids must be unique and are handed out monotonically. Keep track
     /// of the last used one.
-    pub max_used_server_id: u64,
+    pub max_used_server_id: ServerId,
     /// CLickhouse Keeper ids must be unique and are handed out monotonically. Keep track
     /// of the last used one.
-    pub max_used_keeper_id: u64,
+    pub max_used_keeper_id: KeeperId,
     /// An arbitrary name for the Clickhouse cluster shared by all nodes
     pub cluster_name: String,
     /// An arbitrary string shared by all nodes used at runtime to determine whether
@@ -49,18 +95,44 @@ pub struct ClickhouseClusterConfig {
 }
 
 impl ClickhouseClusterConfig {
-    pub fn new(
-        cluster_name: String,
-        secret: String,
-    ) -> ClickhouseClusterConfig {
+    pub fn new(cluster_name: String) -> ClickhouseClusterConfig {
         ClickhouseClusterConfig {
             generation: Generation::new(),
-            max_used_server_id: 0,
-            max_used_keeper_id: 0,
+            max_used_server_id: 0.into(),
+            max_used_keeper_id: 0.into(),
             cluster_name,
-            secret,
+            secret: Uuid::new_v4().to_string(),
         }
     }
+
+    /// If new Ids have been allocated, then update the internal state and
+    /// return true, otherwise return false.
+    pub fn update_configuration(
+        &mut self,
+        allocator: &ClickhouseIdAllocator,
+    ) -> bool {
+        let mut updated = false;
+        if self.max_used_server_id < allocator.max_used_server_id() {
+            self.max_used_server_id = allocator.max_used_server_id();
+            updated = true;
+        }
+        if self.max_used_keeper_id < allocator.max_used_keeper_id() {
+            self.max_used_keeper_id = allocator.max_used_keeper_id();
+            updated = true;
+        }
+        if updated {
+            self.generation = self.generation.next();
+        }
+        updated
+    }
+
+    pub fn has_configuration_changed(
+        &self,
+        allocator: &ClickhouseIdAllocator,
+    ) -> bool {
+        self.max_used_server_id != allocator.max_used_server_id()
+            || self.max_used_keeper_id != allocator.max_used_keeper_id()
+    }
 }
 
 /*impl ClickhouseClusterConfig {
diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs
index 449b536308..17b99064e2 100644
--- a/nexus/types/src/deployment/zone_type.rs
+++ b/nexus/types/src/deployment/zone_type.rs
@@ -168,10 +168,14 @@ impl BlueprintZoneType {
                 blueprint_zone_type::Clickhouse { dataset, address },
             ) => (dataset, DatasetKind::Clickhouse, address),
             BlueprintZoneType::ClickhouseKeeper(
-                blueprint_zone_type::ClickhouseKeeper { dataset, address },
+                blueprint_zone_type::ClickhouseKeeper {
+                    dataset, address, ..
+                },
             ) => (dataset, DatasetKind::ClickhouseKeeper, address),
             BlueprintZoneType::ClickhouseServer(
-                blueprint_zone_type::ClickhouseServer { dataset, address },
+                blueprint_zone_type::ClickhouseServer {
+                    dataset, address, ..
+                },
             ) => (dataset, DatasetKind::ClickhouseServer, address),
             BlueprintZoneType::CockroachDb(
                 blueprint_zone_type::CockroachDb { dataset, address },
@@ -337,7 +341,7 @@ pub mod blueprint_zone_type {
         Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize,
     )]
     pub struct ClickhouseKeeper {
-        pub keeper_id: u64,
+        pub keeper_id: clickward::KeeperId,
         pub address: SocketAddrV6,
         pub dataset: OmicronZoneDataset,
     }
@@ -347,7 +351,7 @@ pub mod blueprint_zone_type {
         Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize,
     )]
     pub struct ClickhouseServer {
-        pub server_id: u64,
+        pub server_id: clickward::ServerId,
         pub address: SocketAddrV6,
         pub dataset: OmicronZoneDataset,
     }
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs
index e796c70f85..5fbd167649 100644
--- a/nexus/types/src/inventory.rs
+++ b/nexus/types/src/inventory.rs
@@ -13,6 +13,7 @@ use crate::external_api::params::PhysicalDiskKind;
 use crate::external_api::params::UninitializedSledId;
 use chrono::DateTime;
 use chrono::Utc;
+use clickward::KeeperId;
 pub use gateway_client::types::PowerState;
 pub use gateway_client::types::RotImageError;
 pub use gateway_client::types::RotSlot;
@@ -23,7 +24,6 @@ use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
 use nexus_sled_agent_shared::inventory::OmicronZonesConfig;
 use nexus_sled_agent_shared::inventory::SledRole;
 use omicron_common::api::external::ByteCount;
-use omicron_common::api::external::Generation;
 pub use omicron_common::api::internal::shared::NetworkInterface;
 pub use omicron_common::api::internal::shared::NetworkInterfaceKind;
 pub use omicron_common::api::internal::shared::SourceNatConfig;
@@ -118,8 +118,11 @@ pub struct Collection {
     /// Omicron zones found, by *sled* id
     pub omicron_zones: BTreeMap<SledUuid, OmicronZonesFound>,
 
-    /// Clickhouse Keeper state, by *zone* id
-    pub clickhouse_keepers: BTreeMap<OmicronZoneUuid, ClickhouseKeeperState>,
+    /// The raft configuration (cluster membership) of the clickhouse keeper
+    /// cluster as returned from each available keeper via `clickhouse-admin` in
+    /// the `ClickhouseKeeper` zone, by *zone* id
+    pub clickhouse_keeper_cluster_membership:
+        BTreeMap<OmicronZoneUuid, ClickhouseKeeperClusterMembership>,
 }
 
 impl Collection {
@@ -157,6 +160,27 @@ impl Collection {
             .filter(|(_, inventory)| inventory.sled_role == SledRole::Scrimlet)
             .map(|(sled_id, _)| *sled_id)
     }
+
+    /// Return the latest clickhouse keeper configuration in this collection, if there is one.
+    pub fn latest_clickhouse_keeper_membership(
+        &self,
+    ) -> Option<(OmicronZoneUuid, ClickhouseKeeperClusterMembership)> {
+        let mut latest = None;
+        for (zone_id, membership) in &self.clickhouse_keeper_cluster_membership
+        {
+            match &latest {
+                None => latest = Some((*zone_id, membership.clone())),
+                Some((_, latest_membership)) => {
+                    if membership.leader_committed_log_index
+                        > latest_membership.leader_committed_log_index
+                    {
+                        latest = Some((*zone_id, membership.clone()));
+                    }
+                }
+            }
+        }
+        latest
+    }
 }
 
 /// A unique baseboard id found during a collection
@@ -430,31 +454,16 @@ pub struct OmicronZonesFound {
     pub zones: OmicronZonesConfig,
 }
 
-/// The state of a Clickhouse Keeper node
+/// The configuration of the clickhouse keeper raft cluster returned from a
+/// single keeper node
 ///
-/// This is retrieved from the `clickhouse-admin` dropshot server running
-/// in a `ClickhouseKeeper` zone and is used to manage reconfigurations of a
-/// clickhouse keeper cluster.
+/// Each keeper is asked for its known raft configuration via `clickhouse-admin`
+/// dropshot servers running in `ClickhouseKeeper` zones. We include the
+/// leader committed log index known to the current keeper node (whether or not
+/// it is the leader) to determine which configuration is newest.
 #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
-pub enum ClickhouseKeeperState {
-    /// The keeper process is not running because it has not received a
-    /// configuration yet
-    NeedsConfiguration,
-
-    /// The keeper process is running, but is not yet part of a cluster.
-    /// It's generated configuration in the blueprint is `keeper_config_gen`
-    RunningStandalone { keeper_id: u64, keeper_config_gen: Generation },
-
-    /// The keeper process is part of a cluster at the given generation
-    ActiveMember { keeper_id: u64, keeper_config_gen: Generation },
-
-    /// The keeper process has failed to join the cluster.
-    ///
-    /// If this occurs, a new keeper process with a new id and configuration
-    /// should be started in the existing zone or the zone should be expunged.
-    FailedToJoin { keeper_id: u64, keeper_config_gen: Generation },
-
-    /// The keeper has been removed from the cluster. At no point may a keeper
-    /// that has been removed rejoin.
-    Removed { keeper_id: u64, keeper_config_gen: Generation },
+pub struct ClickhouseKeeperClusterMembership {
+    pub queried_keeper: KeeperId,
+    pub leader_committed_log_index: u64,
+    pub raft_config: BTreeSet<KeeperId>,
 }