wip - start rewriting to properly use BlueprintZoneType and inventory
andrewjstone committed Aug 28, 2024
1 parent 63802cc commit d05044a
Showing 3 changed files with 77 additions and 87 deletions.
128 changes: 41 additions & 87 deletions nexus/types/src/deployment/clickhouse.rs
@@ -19,99 +19,51 @@ use omicron_common::address::{
CLICKHOUSE_KEEPER_PORT, CLICKHOUSE_KEEPER_RAFT_PORT, CLICKHOUSE_TCP_PORT,
};
use omicron_common::api::external::Generation;
use omicron_uuid_kinds::OmicronZoneUuid;
use omicron_uuid_kinds::SledUuid;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use slog::Logger;
use std::collections::BTreeMap;
use uuid::Uuid;

const BASE_DIR: &str = "/opt/oxide/clickhouse";

/// Clickhouse keeper clusters only allow adding one node at a time
///
/// The planner must progress through these states when adding a new keeper
/// node.
#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
pub enum AddKeeperState {
/// The executor should attempt to start the new keeper via `clickhouse-admin`
/// in the target zone.
///
/// The generated keeper config for this node must include all the other
nodes, but those nodes should not yet learn of this config. That is done
/// in the `Reconfiguring` state.
StartingKeeper { omicron_zone_id: OmicronZoneUuid, config: KeeperConfig },
/// The executor should attempt a reconfiguration by updating the
/// configurations at all the other keepers. It must stay in this state
/// until the reconfiguration either succeeds or fails, which it learns
/// by polling `clickhouse-admin` in one or more zones.
///
If the keeper addition succeeds then a transition to
[`KeeperClusterState::Stable`] should be made. At this point, the
configuration for the clickhouse servers should also be updated to point to
the new keeper.
Reconfiguring {
new_node_omicron_zone_id: OmicronZoneUuid,
keepers: BTreeMap<OmicronZoneUuid, KeeperConfig>,
},
In some cases, reconfiguration of the keeper can fail. When it does, the
existing zone must be expunged. Once the zone is expunged, the planner
should try to add a new keeper zone again.
Failed { stable_config: BTreeMap<OmicronZoneUuid, KeeperConfig> },
}
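
The comments above describe the protocol but not how the planner advances through it. The following is a minimal illustrative sketch, not part of this commit, of resolving the `Reconfiguring` step once a result is known; `reconfig_succeeded` is a hypothetical stand-in for whatever polling `clickhouse-admin` reports, with `None` meaning the reconfiguration is still in progress.

```rust
// Illustrative sketch only (not part of this commit): drive the keeper
// addition forward once the outcome of the reconfiguration is known.
fn resolve_reconfiguration(
    prior_stable_config: BTreeMap<OmicronZoneUuid, KeeperConfig>,
    add_node_state: AddKeeperState,
    reconfig_succeeded: Option<bool>,
) -> KeeperClusterState {
    match (add_node_state, reconfig_succeeded) {
        // The reconfiguration committed on all keepers: the expanded
        // membership becomes the new stable configuration.
        (AddKeeperState::Reconfiguring { keepers, .. }, Some(true)) => {
            KeeperClusterState::Stable { keepers }
        }
        // The reconfiguration failed: remember the last known-good
        // configuration so the new zone can be expunged and the addition
        // retried.
        (AddKeeperState::Reconfiguring { .. }, Some(false)) => {
            KeeperClusterState::AddingNode {
                prior_stable_config: prior_stable_config.clone(),
                add_node_state: AddKeeperState::Failed {
                    stable_config: prior_stable_config,
                },
            }
        }
        // Still waiting, or not yet at the `Reconfiguring` step: leave the
        // state machine where it is.
        (add_node_state, _) => KeeperClusterState::AddingNode {
            prior_stable_config,
            add_node_state,
        },
    }
}
```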

/// The current configuration state of the keeper cluster
#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
pub enum KeeperClusterState {
/// A configuration of a keeper cluster with no ongoing reconfigurations (node
/// additions or removals)
Stable { keepers: BTreeMap<OmicronZoneUuid, KeeperConfig> },
/// We're currently adding a node to the keeper cluster
AddingNode {
prior_stable_config: BTreeMap<OmicronZoneUuid, KeeperConfig>,
add_node_state: AddKeeperState,
},
// TODO: `RemovingNode`
}
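
`keeper_config_stable` and `adding_keeper` further down call `is_stable` and `is_adding_node` on this state; those predicates are not shown in this diff, but are assumed to look roughly like the sketch below.

```rust
// Assumed helpers (not shown in this diff): simple predicates over the
// keeper cluster state.
impl KeeperClusterState {
    pub fn is_stable(&self) -> bool {
        matches!(self, KeeperClusterState::Stable { .. })
    }

    pub fn is_adding_node(&self) -> bool {
        matches!(self, KeeperClusterState::AddingNode { .. })
    }
}
```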

/// The current configuration of the keeper cluster
#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
pub struct KeeperClusterConfig {
max_used_keeper_id: u64,
state: KeeperClusterState,
}

/// The current configuration of all clickhouse server replicas
///
/// In contrast to keepers, servers do not require a multi-step reconfiguration
/// to add, and multiple servers can be added or removed simultaneously. Removal
is slightly more complex in that we need to ensure that the servers are shut
down, and that the shutdown is noticed, before we
[drop](https://clickhouse.com/docs/en/sql-reference/statements/system#drop-replica)
the replica from the cluster.
///
/// TODO: Model server removal
#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
pub struct ClickhouseServerClusterConfig {
max_used_server_id: u64,
servers: BTreeMap<OmicronZoneUuid, ReplicaConfig>,
}

/// Global configuration for all clickhouse servers (replicas) and keepers
#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
pub struct ClickhouseClusterConfig {
// The last update to the clickhouse cluster configuration
// This is used by clickhouse server and keeper zones to discard
// configurations they are up to date with.
generation: Generation,
cluster_name: String,
secret: String,
servers: ClickhouseServerClusterConfig,
keepers: KeeperClusterConfig,
/// The last update to the clickhouse cluster configuration
///
/// This is used by `clickhouse-admin` in the clickhouse server and keeper
/// zones to discard old configurations.
pub generation: Generation,
Clickhouse Server ids must be unique and are handed out monotonically. Keep track
/// of the last used one.
pub max_used_server_id: u64,
Clickhouse Keeper ids must be unique and are handed out monotonically. Keep track
/// of the last used one.
pub max_used_keeper_id: u64,
/// An arbitrary name for the Clickhouse cluster shared by all nodes
pub cluster_name: String,
An arbitrary string shared by all nodes, used at runtime to determine
whether nodes are part of the same cluster.
pub secret: String,
}

impl ClickhouseClusterConfig {
pub fn new(
cluster_name: String,
secret: String,
) -> ClickhouseClusterConfig {
ClickhouseClusterConfig {
generation: Generation::new(),
max_used_server_id: 0,
max_used_keeper_id: 0,
cluster_name,
secret,
}
}
}
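
As a usage sketch, not part of this commit, constructing a config and handing out the monotonic ids that the field docs describe might look like the following; `next_server_id` and `next_keeper_id` are hypothetical helpers, and the cluster name and secret are arbitrary example values.

```rust
// Hypothetical helpers (not in this commit): hand out server and keeper ids
// monotonically by bumping the counters tracked in the config.
fn next_server_id(config: &mut ClickhouseClusterConfig) -> u64 {
    config.max_used_server_id += 1;
    config.max_used_server_id
}

fn next_keeper_id(config: &mut ClickhouseClusterConfig) -> u64 {
    config.max_used_keeper_id += 1;
    config.max_used_keeper_id
}

fn example() {
    // Arbitrary example values for the cluster name and shared secret.
    let mut config = ClickhouseClusterConfig::new(
        "example_cluster".to_string(),
        "example-shared-secret".to_string(),
    );
    // Ids start at 1 because 0 means "no id has been used yet".
    assert_eq!(next_keeper_id(&mut config), 1);
    assert_eq!(next_server_id(&mut config), 1);
}
```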

/*impl ClickhouseClusterConfig {
/// Create an initial deployment for the first blueprint
pub fn new(
cluster_name: String,
@@ -245,14 +197,15 @@ impl ClickhouseClusterConfig {
}
}
/// Create a deployment dependent on the configuration from the parent
/// blueprint
pub fn new_based_on<'a>(
log: &Logger,
parent_config: &'a ClickhouseClusterConfig,
all_blueprint_zones: &BTreeMap<SledUuid, BlueprintZonesConfig>,
) -> ClickhouseClusterConfig {
todo!()
/// Is the clickhouse keeper configuration state `Stable`? In other words, are no nodes
/// currently being added or removed?
pub fn keeper_config_stable(&self) -> bool {
self.keepers.state.is_stable()
}
/// Are we currently adding a keeper to the config?
pub fn adding_keeper(&self) -> bool {
self.keepers.state.is_adding_node()
}
}
@@ -512,3 +465,4 @@ mod tests {
}
}
}
*/
2 changes: 2 additions & 0 deletions nexus/types/src/deployment/zone_type.rs
@@ -337,6 +337,7 @@ pub mod blueprint_zone_type {
Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize,
)]
pub struct ClickhouseKeeper {
pub keeper_id: u64,
pub address: SocketAddrV6,
pub dataset: OmicronZoneDataset,
}
@@ -346,6 +347,7 @@ pub mod blueprint_zone_type {
Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize,
)]
pub struct ClickhouseServer {
pub server_id: u64,
pub address: SocketAddrV6,
pub dataset: OmicronZoneDataset,
}
34 changes: 34 additions & 0 deletions nexus/types/src/inventory.rs
@@ -23,11 +23,13 @@ use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
use nexus_sled_agent_shared::inventory::OmicronZonesConfig;
use nexus_sled_agent_shared::inventory::SledRole;
use omicron_common::api::external::ByteCount;
use omicron_common::api::external::Generation;
pub use omicron_common::api::internal::shared::NetworkInterface;
pub use omicron_common::api::internal::shared::NetworkInterfaceKind;
pub use omicron_common::api::internal::shared::SourceNatConfig;
pub use omicron_common::zpool_name::ZpoolName;
use omicron_uuid_kinds::CollectionUuid;
use omicron_uuid_kinds::OmicronZoneUuid;
use omicron_uuid_kinds::SledUuid;
use omicron_uuid_kinds::ZpoolUuid;
use serde::{Deserialize, Serialize};
@@ -115,6 +117,9 @@ pub struct Collection {

/// Omicron zones found, by *sled* id
pub omicron_zones: BTreeMap<SledUuid, OmicronZonesFound>,

/// Clickhouse Keeper state, by *zone* id
pub clickhouse_keepers: BTreeMap<OmicronZoneUuid, ClickhouseKeeperState>,
}

impl Collection {
@@ -424,3 +429,32 @@ pub struct OmicronZonesFound {
pub sled_id: SledUuid,
pub zones: OmicronZonesConfig,
}

/// The state of a Clickhouse Keeper node
///
/// This is retrieved from the `clickhouse-admin` dropshot server running
/// in a `ClickhouseKeeper` zone and is used to manage reconfigurations of a
/// clickhouse keeper cluster.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub enum ClickhouseKeeperState {
/// The keeper process is not running because it has not received a
/// configuration yet
NeedsConfiguration,

/// The keeper process is running, but is not yet part of a cluster.
/// Its generated configuration in the blueprint is `keeper_config_gen`.
RunningStandalone { keeper_id: u64, keeper_config_gen: Generation },

/// The keeper process is part of a cluster at the given generation
ActiveMember { keeper_id: u64, keeper_config_gen: Generation },

/// The keeper process has failed to join the cluster.
///
/// If this occurs, a new keeper process with a new id and configuration
/// should be started in the existing zone or the zone should be expunged.
FailedToJoin { keeper_id: u64, keeper_config_gen: Generation },

/// The keeper has been removed from the cluster. A keeper that has been
/// removed may never rejoin.
Removed { keeper_id: u64, keeper_config_gen: Generation },
}
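
A sketch, not part of this commit, of how a planner might turn the per-zone states gathered in `Collection::clickhouse_keepers` into next actions; the `KeeperPlan` enum and both functions are hypothetical.

```rust
// Illustrative sketch only (not part of this commit); assumes the imports
// already present in this file.
enum KeeperPlan {
    /// Generate and deliver an initial keeper configuration.
    Configure,
    /// Wait for an in-progress join to succeed or fail.
    Wait,
    /// Nothing to do for this keeper.
    Done,
    /// Start a keeper with a fresh id in the existing zone, or expunge the
    /// zone.
    ReplaceOrExpunge,
}

fn plan_for_keeper(state: &ClickhouseKeeperState) -> KeeperPlan {
    match state {
        ClickhouseKeeperState::NeedsConfiguration => KeeperPlan::Configure,
        ClickhouseKeeperState::RunningStandalone { .. } => KeeperPlan::Wait,
        ClickhouseKeeperState::ActiveMember { .. } => KeeperPlan::Done,
        ClickhouseKeeperState::FailedToJoin { .. } => KeeperPlan::ReplaceOrExpunge,
        // A removed keeper may never rejoin, so there is nothing further to
        // plan for it here.
        ClickhouseKeeperState::Removed { .. } => KeeperPlan::Done,
    }
}

fn plan_all(collection: &Collection) -> BTreeMap<OmicronZoneUuid, KeeperPlan> {
    collection
        .clickhouse_keepers
        .iter()
        .map(|(zone_id, state)| (*zone_id, plan_for_keeper(state)))
        .collect()
}
```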
