Skip to content

Commit

Permalink
CP-46324: Send alert when a host leaves/joins the cluster
Browse files Browse the repository at this point in the history
We add three new alert messages in addition to the existing
`cluster_host_fencing`: `cluster_host_leaving`, `cluster_host_joining`
and `cluster_quorum_approaching_lost`.

The leave and join messages are raised whenever a host leaves or joins the
cluster, whether as a result of a user operation or due to unexpected errors
(in which case we will likely get an additional fencing alert).
The approaching quorum message is sent when the cluster cannot tolerate
any more loss of hosts.

Signed-off-by: Vincent Liu <[email protected]>
  • Loading branch information
Vincent-lau committed Feb 7, 2024
1 parent 2c95050 commit 2d61201
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 2 deletions.
7 changes: 7 additions & 0 deletions ocaml/xapi-consts/api_messages.ml
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,18 @@ let pool_cpu_features_down = addMessage "POOL_CPU_FEATURES_DOWN" 5L
let pool_cpu_features_up = addMessage "POOL_CPU_FEATURES_UP" 5L

(* Cluster messages *)
(* Each binding below is the value returned by [addMessage name priority];
   the alert code that consumes them destructures it as [(name, priority)].
   NOTE(review): the ordering/meaning of the numeric priority levels is not
   visible in this hunk -- confirm against [addMessage] before changing one. *)

(* Raised against the pool when hosts have gone missing and the number of
   remaining cluster hosts is no greater than the quorum. *)
let cluster_quorum_approaching_lost =
addMessage "CLUSTER_QUORUM_APPROACHING_LOST" 2L

let cluster_host_enable_failed = addMessage "CLUSTER_HOST_ENABLE_FAILED" 3L

(* raised by external script in clustering daemon, do not delete this: it is not dead code *)
let cluster_host_fencing = addMessage "CLUSTER_HOST_FENCING" 2L

(* Raised against a host when it leaves the cluster. *)
let cluster_host_leaving = addMessage "CLUSTER_HOST_LEAVING" 3L

(* Raised against a host when it joins the cluster. *)
let cluster_host_joining = addMessage "CLUSTER_HOST_JOINING" 4L

(* Certificate expiration messages *)
let host_server_certificate_expiring = "HOST_SERVER_CERTIFICATE_EXPIRING"

Expand Down
54 changes: 54 additions & 0 deletions ocaml/xapi/xapi_cluster_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,57 @@ let cluster_health_enabled ~__context =
let pool = Helpers.get_pool ~__context in
let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in
List.assoc_opt "restrict_cluster_health" restrictions = Some "false"

(** [maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts
    ~quorum] raises cluster membership alerts.

    Nothing happens unless [cluster_health_enabled ~__context] holds. When it
    does, one [cluster_host_leaving] message is created per entry of
    [missing_hosts], then one [cluster_host_joining] message per entry of
    [new_hosts], each attached to the corresponding host. Finally, if
    [missing_hosts] is non-empty and [num_hosts <= quorum], a
    [cluster_quorum_approaching_lost] message is created against the pool.

    @param num_hosts number of hosts reported in the message bodies
    @param missing_hosts Cluster_host references that have left
    @param new_hosts Cluster_host references that have joined
    @param quorum number of hosts reported as required for a quorum *)
let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
    =
  (* Create a single API message; the value returned by Message.create is
     deliberately dropped. *)
  let send_message ~cls ~obj_uuid ~body (name, priority) =
    Helpers.call_api_functions ~__context (fun rpc session_id ->
        let _created =
          Client.Client.Message.create ~rpc ~session_id ~name ~priority ~cls
            ~obj_uuid ~body
        in
        ()
    )
  in
  (* Alert for one cluster host; [joined] selects the join or leave wording. *)
  let host_alert ~joined cluster_host =
    let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in
    let obj_uuid = Db.Host.get_uuid ~__context ~self:host in
    let label = Db.Host.get_name_label ~__context ~self:host in
    let body, message =
      if joined then
        ( Printf.sprintf
            "Host %s has joined the cluster, there are now %d host(s) in \
             cluster and %d hosts are required to form a quorum"
            label num_hosts quorum
        , Api_messages.cluster_host_joining
        )
      else
        ( Printf.sprintf
            "Host %s has left the cluster, there are now %d host(s) in \
             cluster and %d hosts are required to form a quorum"
            label num_hosts quorum
        , Api_messages.cluster_host_leaving
        )
    in
    send_message ~cls:`Host ~obj_uuid ~body message
  in
  if cluster_health_enabled ~__context then (
    List.iter (host_alert ~joined:false) missing_hosts ;
    List.iter (host_alert ~joined:true) new_hosts ;
    (* the quorum alert is only relevant while the cluster is shrinking *)
    let shrinking = missing_hosts <> [] in
    if shrinking && num_hosts <= quorum then
      let pool = Helpers.get_pool ~__context in
      let obj_uuid = Db.Pool.get_uuid ~__context ~self:pool in
      let body =
        Printf.sprintf
          "The cluster is losing quorum: current %d hosts, need %d hosts for a \
           quorum"
          num_hosts quorum
      in
      send_message ~cls:`Pool ~obj_uuid ~body
        Api_messages.cluster_quorum_approaching_lost
  )
19 changes: 19 additions & 0 deletions ocaml/xapi/xapi_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*)

open Xapi_clustering
open Xapi_cluster_helpers

module D = Debug.Make (struct let name = "xapi_cluster_host" end)

Expand Down Expand Up @@ -53,6 +54,20 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body
raise err
)

(** [alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts]
    looks up the current membership count and the quorum recorded on the
    cluster of [cluster_host] in the database, then delegates to
    [maybe_generate_alert].

    @param cluster_host the Cluster_host whose cluster supplies the quorum
    @param missing_hosts Cluster_host references that are leaving
    @param new_hosts Cluster_host references that have joined *)
let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts =
  (* Departures are reported before the Cluster_host record is destroyed (the
     alert still needs the record to look up the host), so the leaving members
     are still present in the database at this point. Exclude them so that the
     "there are now N host(s)" text and the quorum check describe the cluster
     as it will be after the departure. Joining hosts have already been
     created by the time they are reported, so they are counted as-is. *)
  let num_hosts =
    Db.Cluster_host.get_all ~__context
    |> List.filter (fun member -> not (List.mem member missing_hosts))
    |> List.length
  in
  let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
  let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in
  maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum

(** [alert_for_cluster_host_leave ~__context ~cluster_host] reports
    [cluster_host] as the only departing member, with no joining members. *)
let alert_for_cluster_host_leave ~__context ~cluster_host =
  let missing_hosts = [cluster_host] and new_hosts = [] in
  alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts

(** [alert_for_cluster_host_join ~__context ~cluster_host] reports
    [cluster_host] as the only joining member, with no departing members. *)
let alert_for_cluster_host_join ~__context ~cluster_host =
  let missing_hosts = [] and new_hosts = [cluster_host] in
  alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts

(* Create xapi db object for cluster_host, resync_host calls clusterd *)
let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
with_clustering_lock __LOC__ (fun () ->
Expand All @@ -65,6 +80,7 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
~enabled:false ~current_operations:[] ~allowed_operations:[]
~other_config:[] ~joined:false ~live:false
~last_update_live:API.Date.epoch ;
alert_for_cluster_host_join ~__context ~cluster_host:ref ;
ref
)

Expand Down Expand Up @@ -226,12 +242,14 @@ let destroy_op ~__context ~self ~force =
let result = local_fn (rpc ~__context) dbg in
match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
| Ok () ->
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host.%s was successful" fn_str ;
Xapi_clustering.Daemon.disable ~__context
| Error error ->
warn "Error occurred during Cluster_host.%s" fn_str ;
if force then (
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
let ref_str = Ref.string_of self in
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host %s force destroyed." ref_str
Expand Down Expand Up @@ -279,6 +297,7 @@ let forget ~__context ~self =
Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ;
(* must not disable the daemon here, because we declared another unreachable node dead,
* not the current one *)
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host.forget was successful"
| Error error ->
Expand Down
12 changes: 10 additions & 2 deletions ocaml/xapi/xapi_clustering.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*)

open Cluster_interface
open Xapi_cluster_helpers

module D = Debug.Make (struct let name = "xapi_clustering" end)

Expand Down Expand Up @@ -457,20 +458,27 @@ let on_corosync_update ~__context ~cluster updates =
(fun h -> not (List.mem h quorum_hosts))
all_cluster_hosts
in
let new_hosts =
List.filter
(fun h -> not (Db.Cluster_host.get_live ~__context ~self:h))
quorum_hosts
in
List.iter
(fun self ->
Db.Cluster_host.set_live ~__context ~self ~value:true ;
Db.Cluster_host.set_last_update_live ~__context ~self
~value:current_time
)
quorum_hosts ;
new_hosts ;
List.iter
(fun self ->
Db.Cluster_host.set_live ~__context ~self ~value:false ;
Db.Cluster_host.set_last_update_live ~__context ~self
~value:current_time
)
missing_hosts
missing_hosts ;
maybe_generate_alert ~__context ~missing_hosts ~new_hosts
~num_hosts:(List.length quorum_hosts) ~quorum:diag.quorum
) ;
Db.Cluster.set_quorum ~__context ~self:cluster
~value:(Int64.of_int diag.quorum) ;
Expand Down

0 comments on commit 2d61201

Please sign in to comment.