Skip to content

Commit

Permalink
Merge pull request #5408 from Vincent-lau/private/shul2/cluster-alert
Browse files Browse the repository at this point in the history
CP-46324: Send alert when a host leaves/joins the cluster
  • Loading branch information
robhoes authored Feb 7, 2024
2 parents f6ca6f8 + 2d61201 commit 8b7d5e4
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 8 deletions.
11 changes: 6 additions & 5 deletions ocaml/idl/datamodel_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ let create =
()

let destroy =
call ~name:"destroy" ~doc:"Remove a host from an existing cluster."
call ~name:"destroy"
~doc:
"Remove the host from an existing cluster. This operation is allowed \
even if a cluster host is not enabled."
~params:
[
( Ref _cluster_host
Expand Down Expand Up @@ -117,10 +120,8 @@ let t =
~default_value:(Some (VBool true))
"Whether the cluster host has joined the cluster. Contrary to \
enabled, a host that is not joined is not considered a member of \
the cluster, and hence no operations (e.g. enable/disable) can be \
performed on this host. This field can be altered by calling leave \
or destroy on a cluster host. It can also be set automatically if \
cluster stack believes that this node is not part of the cluster. "
the cluster, and hence enable and disable operations cannot be \
performed on this host."
; field ~qualifier:DynamicRO ~lifecycle:[] ~ty:Bool "live"
~default_value:(Some (VBool false))
"Whether the underlying cluster stack thinks we are live. This \
Expand Down
2 changes: 1 addition & 1 deletion ocaml/idl/schematest.ml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ let hash x = Digest.string x |> Digest.to_hex
(* BEWARE: if this changes, check that schema has been bumped accordingly in
ocaml/idl/datamodel_common.ml, usually schema_minor_vsn *)

let last_known_schema_hash = "1e43ef93af9de55620fda75281e8a992"
let last_known_schema_hash = "2de13a69470d10b12910322f8a6bce85"

let current_schema_hash : string =
let open Datamodel_types in
Expand Down
7 changes: 7 additions & 0 deletions ocaml/xapi-consts/api_messages.ml
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,18 @@ let pool_cpu_features_down = addMessage "POOL_CPU_FEATURES_DOWN" 5L
let pool_cpu_features_up = addMessage "POOL_CPU_FEATURES_UP" 5L

(* Cluster messages *)
(* Each binding in this group is the value returned by
   [addMessage name priority]. Code that raises these messages destructures
   the binding as a [(name, priority)] pair and passes both to
   [Message.create]. *)

(* Raised against the pool when hosts have gone missing and the number of
   hosts in the cluster is at or below the quorum. *)
let cluster_quorum_approaching_lost =
addMessage "CLUSTER_QUORUM_APPROACHING_LOST" 2L

(* NOTE(review): presumably raised when enabling a cluster host fails; the
   raising code is not visible here - confirm against the call site. *)
let cluster_host_enable_failed = addMessage "CLUSTER_HOST_ENABLE_FAILED" 3L

(* raised by external script in clustering daemon, do not delete this: it is not dead code *)
let cluster_host_fencing = addMessage "CLUSTER_HOST_FENCING" 2L

(* Raised against a host whose cluster host has left the cluster. *)
let cluster_host_leaving = addMessage "CLUSTER_HOST_LEAVING" 3L

(* Raised against a host whose cluster host has joined the cluster. *)
let cluster_host_joining = addMessage "CLUSTER_HOST_JOINING" 4L

(* Certificate expiration messages *)
let host_server_certificate_expiring = "HOST_SERVER_CERTIFICATE_EXPIRING"

Expand Down
54 changes: 54 additions & 0 deletions ocaml/xapi/xapi_cluster_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,57 @@ let cluster_health_enabled ~__context =
let pool = Helpers.get_pool ~__context in
let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in
List.assoc_opt "restrict_cluster_health" restrictions = Some "false"

(** [maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts
    ~quorum] raises messages describing a change in cluster membership.

    Nothing is sent unless [cluster_health_enabled ~__context] holds. When it
    does:
    - one [cluster_host_leaving] message is created per entry of
      [missing_hosts], attached to the corresponding host;
    - one [cluster_host_joining] message is created per entry of [new_hosts],
      attached to the corresponding host;
    - one [cluster_quorum_approaching_lost] message is created on the pool if
      [missing_hosts] is non-empty and [num_hosts <= quorum].

    [num_hosts] and [quorum] are only used for the message text and for the
    quorum comparison. *)
let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
    =
  (* Create a single message of the given class on the given object. *)
  let post ~cls ~obj_uuid ~body (name, priority) =
    Helpers.call_api_functions ~__context (fun rpc session_id ->
        ignore
          (Client.Client.Message.create ~rpc ~session_id ~name ~priority ~cls
             ~obj_uuid ~body
          )
    )
  in
  (* Tell the host behind [cluster_host] that it joined or left. *)
  let membership_alert ~joined cluster_host =
    let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in
    let obj_uuid = Db.Host.get_uuid ~__context ~self:host in
    let label = Db.Host.get_name_label ~__context ~self:host in
    let body, message =
      if joined then
        ( Printf.sprintf
            "Host %s has joined the cluster, there are now %d host(s) in \
             cluster and %d hosts are required to form a quorum"
            label num_hosts quorum
        , Api_messages.cluster_host_joining
        )
      else
        ( Printf.sprintf
            "Host %s has left the cluster, there are now %d host(s) in \
             cluster and %d hosts are required to form a quorum"
            label num_hosts quorum
        , Api_messages.cluster_host_leaving
        )
    in
    post ~cls:`Host ~obj_uuid ~body message
  in
  if cluster_health_enabled ~__context then (
    List.iter (membership_alert ~joined:false) missing_hosts ;
    List.iter (membership_alert ~joined:true) new_hosts ;
    (* The quorum warning is only relevant while the cluster is shrinking. *)
    let quorum_at_risk = missing_hosts <> [] && num_hosts <= quorum in
    if quorum_at_risk then (
      let pool = Helpers.get_pool ~__context in
      let obj_uuid = Db.Pool.get_uuid ~__context ~self:pool in
      let body =
        Printf.sprintf
          "The cluster is losing quorum: current %d hosts, need %d hosts for a \
           quorum"
          num_hosts quorum
      in
      post ~cls:`Pool ~obj_uuid ~body
        Api_messages.cluster_quorum_approaching_lost
    )
  )
19 changes: 19 additions & 0 deletions ocaml/xapi/xapi_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*)

open Xapi_clustering
open Xapi_cluster_helpers

module D = Debug.Make (struct let name = "xapi_cluster_host" end)

Expand Down Expand Up @@ -53,6 +54,20 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body
raise err
)

(** [alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts]
    gathers the figures needed for a membership alert and hands them to
    [maybe_generate_alert].

    - The host count is the number of cluster hosts in the database that are
      not listed in [missing_hosts]. Leave alerts are raised while the
      departing cluster host still has its database record (the record is
      needed to find the host), so counting every record would report one host
      too many and compare a stale count against the quorum.
    - The quorum is the value stored on the cluster that [cluster_host]
      belongs to. *)
let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts =
  let num_hosts =
    Db.Cluster_host.get_all ~__context
    |> List.filter (fun h -> not (List.mem h missing_hosts))
    |> List.length
  in
  let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
  let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in
  maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum

(** [alert_for_cluster_host_leave ~__context ~cluster_host] reports
    [cluster_host] as the only host that went missing, with no new hosts. *)
let alert_for_cluster_host_leave ~__context ~cluster_host =
  let missing_hosts = [cluster_host] and new_hosts = [] in
  alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts

(** [alert_for_cluster_host_join ~__context ~cluster_host] reports
    [cluster_host] as the only new host, with no missing hosts. *)
let alert_for_cluster_host_join ~__context ~cluster_host =
  let missing_hosts = [] and new_hosts = [cluster_host] in
  alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts

(* Create xapi db object for cluster_host, resync_host calls clusterd *)
let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
with_clustering_lock __LOC__ (fun () ->
Expand All @@ -65,6 +80,7 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
~enabled:false ~current_operations:[] ~allowed_operations:[]
~other_config:[] ~joined:false ~live:false
~last_update_live:API.Date.epoch ;
alert_for_cluster_host_join ~__context ~cluster_host:ref ;
ref
)

Expand Down Expand Up @@ -226,12 +242,14 @@ let destroy_op ~__context ~self ~force =
let result = local_fn (rpc ~__context) dbg in
match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
| Ok () ->
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host.%s was successful" fn_str ;
Xapi_clustering.Daemon.disable ~__context
| Error error ->
warn "Error occurred during Cluster_host.%s" fn_str ;
if force then (
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
let ref_str = Ref.string_of self in
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host %s force destroyed." ref_str
Expand Down Expand Up @@ -279,6 +297,7 @@ let forget ~__context ~self =
Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ;
(* must not disable the daemon here, because we declared another unreachable node dead,
* not the current one *)
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host.forget was successful"
| Error error ->
Expand Down
12 changes: 10 additions & 2 deletions ocaml/xapi/xapi_clustering.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*)

open Cluster_interface
open Xapi_cluster_helpers

module D = Debug.Make (struct let name = "xapi_clustering" end)

Expand Down Expand Up @@ -457,20 +458,27 @@ let on_corosync_update ~__context ~cluster updates =
(fun h -> not (List.mem h quorum_hosts))
all_cluster_hosts
in
let new_hosts =
List.filter
(fun h -> not (Db.Cluster_host.get_live ~__context ~self:h))
quorum_hosts
in
List.iter
(fun self ->
Db.Cluster_host.set_live ~__context ~self ~value:true ;
Db.Cluster_host.set_last_update_live ~__context ~self
~value:current_time
)
quorum_hosts ;
new_hosts ;
List.iter
(fun self ->
Db.Cluster_host.set_live ~__context ~self ~value:false ;
Db.Cluster_host.set_last_update_live ~__context ~self
~value:current_time
)
missing_hosts
missing_hosts ;
maybe_generate_alert ~__context ~missing_hosts ~new_hosts
~num_hosts:(List.length quorum_hosts) ~quorum:diag.quorum
) ;
Db.Cluster.set_quorum ~__context ~self:cluster
~value:(Int64.of_int diag.quorum) ;
Expand Down

0 comments on commit 8b7d5e4

Please sign in to comment.