diff --git a/ocaml/xapi-consts/api_messages.ml b/ocaml/xapi-consts/api_messages.ml index 250099d2c08..bb63facfe2a 100644 --- a/ocaml/xapi-consts/api_messages.ml +++ b/ocaml/xapi-consts/api_messages.ml @@ -299,11 +299,18 @@ let pool_cpu_features_down = addMessage "POOL_CPU_FEATURES_DOWN" 5L let pool_cpu_features_up = addMessage "POOL_CPU_FEATURES_UP" 5L (* Cluster messages *) +let cluster_quorum_approaching_lost = + addMessage "CLUSTER_QUORUM_APPROACHING_LOST" 2L + let cluster_host_enable_failed = addMessage "CLUSTER_HOST_ENABLE_FAILED" 3L (* raised by external script in clustering daemon, do not delete this: it is not dead code *) let cluster_host_fencing = addMessage "CLUSTER_HOST_FENCING" 2L +let cluster_host_leaving = addMessage "CLUSTER_HOST_LEAVING" 3L + +let cluster_host_joining = addMessage "CLUSTER_HOST_JOINING" 4L + (* Certificate expiration messages *) let host_server_certificate_expiring = "HOST_SERVER_CERTIFICATE_EXPIRING" diff --git a/ocaml/xapi/xapi_cluster_helpers.ml b/ocaml/xapi/xapi_cluster_helpers.ml index bbfd5a7ce73..210ad5068a0 100644 --- a/ocaml/xapi/xapi_cluster_helpers.ml +++ b/ocaml/xapi/xapi_cluster_helpers.ml @@ -108,3 +108,57 @@ let cluster_health_enabled ~__context = let pool = Helpers.get_pool ~__context in let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in List.assoc_opt "restrict_cluster_health" restrictions = Some "false" + +(** [maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum] does nothing unless [cluster_health_enabled ~__context] holds. Otherwise it creates one [`Host] message per cluster host in [missing_hosts] (cluster_host_leaving) and per cluster host in [new_hosts] (cluster_host_joining), and additionally a [`Pool] cluster_quorum_approaching_lost message when [missing_hosts] is non-empty and [num_hosts <= quorum]. [num_hosts] is printed in the message bodies as the number of hosts now in the cluster, so callers must pass the count as it is after the membership change. Every cluster host passed in must still exist in the database, because its host reference is read from there. *) let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum + = + let generate_alert join cluster_host = + let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in + let host_uuid = Db.Host.get_uuid ~__context ~self:host in + let host_name = Db.Host.get_name_label ~__context ~self:host in + let body, name, priority = + match join with + | true -> + let body = + Printf.sprintf + "Host %s has joined the cluster, there are now %d host(s) in \ cluster and %d hosts are required to form a quorum" + host_name num_hosts quorum + in + let name, priority = 
Api_messages.cluster_host_joining in + (body, name, priority) + | false -> + let body = + Printf.sprintf + "Host %s has left the cluster, there are now %d host(s) in \ cluster and %d hosts are required to form a quorum" + host_name num_hosts quorum + in + let name, priority = Api_messages.cluster_host_leaving in + (body, name, priority) + in + Helpers.call_api_functions ~__context (fun rpc session_id -> + ignore + @@ Client.Client.Message.create ~rpc ~session_id ~name ~priority + ~cls:`Host ~obj_uuid:host_uuid ~body + ) + in + if cluster_health_enabled ~__context then ( + List.iter (generate_alert false) missing_hosts ; + List.iter (generate_alert true) new_hosts ; + (* only generate this alert when the number of hosts is decreasing *) + if missing_hosts <> [] && num_hosts <= quorum then + let pool = Helpers.get_pool ~__context in + let pool_uuid = Db.Pool.get_uuid ~__context ~self:pool in + let name, priority = Api_messages.cluster_quorum_approaching_lost in + let body = + Printf.sprintf + "The cluster is losing quorum: current %d hosts, need %d hosts for a \ quorum" + num_hosts quorum + in + Helpers.call_api_functions ~__context (fun rpc session_id -> + ignore + @@ Client.Client.Message.create ~rpc ~session_id ~name ~priority + ~cls:`Pool ~obj_uuid:pool_uuid ~body + ) + ) diff --git a/ocaml/xapi/xapi_cluster_host.ml b/ocaml/xapi/xapi_cluster_host.ml index 0d623c0cd21..05cc439619f 100644 --- a/ocaml/xapi/xapi_cluster_host.ml +++ b/ocaml/xapi/xapi_cluster_host.ml @@ -13,6 +13,7 @@ *) open Xapi_clustering +open Xapi_cluster_helpers module D = Debug.Make (struct let name = "xapi_cluster_host" end) @@ -53,6 +54,20 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body raise err ) +(** [alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts] raises the join/leave alerts for a membership change made through the API. The quorum is the value stored on the cluster that [cluster_host] belongs to; the host count is derived from the Cluster_host rows in the database. *) let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts = + (* destroy_op and forget call this before Db.Cluster_host.destroy, so every host in [missing_hosts] is still returned by get_all. Subtract them so that num_hosts is the cluster size after the change, which is what the alert text reports and what on_corosync_update passes (the post-update quorum_hosts count). For a join [missing_hosts] is empty and the count is unchanged. *) let num_hosts = List.length (Db.Cluster_host.get_all ~__context) - List.length missing_hosts in + let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in + let quorum = Db.Cluster.get_quorum 
~__context ~self:cluster |> Int64.to_int in + maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum + +(** Raise the leaving alert for [cluster_host]. Must be called while the Cluster_host row still exists, since the alert reads its host from the database. *) let alert_for_cluster_host_leave ~__context ~cluster_host = + alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[cluster_host] + ~new_hosts:[] + +(** Raise the joining alert for [cluster_host]. Must be called after the Cluster_host row has been created. *) let alert_for_cluster_host_join ~__context ~cluster_host = + alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[] + ~new_hosts:[cluster_host] + (* Create xapi db object for cluster_host, resync_host calls clusterd *) let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host = with_clustering_lock __LOC__ (fun () -> @@ -65,6 +80,7 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host = ~enabled:false ~current_operations:[] ~allowed_operations:[] ~other_config:[] ~joined:false ~live:false ~last_update_live:API.Date.epoch ; + alert_for_cluster_host_join ~__context ~cluster_host:ref ; ref ) @@ -226,12 +242,14 @@ let destroy_op ~__context ~self ~force = let result = local_fn (rpc ~__context) dbg in match Idl.IdM.run @@ Cluster_client.IDL.T.get result with | Ok () -> + alert_for_cluster_host_leave ~__context ~cluster_host:self ; Db.Cluster_host.destroy ~__context ~self ; debug "Cluster_host.%s was successful" fn_str ; Xapi_clustering.Daemon.disable ~__context | Error error -> warn "Error occurred during Cluster_host.%s" fn_str ; if force then ( + alert_for_cluster_host_leave ~__context ~cluster_host:self ; let ref_str = Ref.string_of self in Db.Cluster_host.destroy ~__context ~self ; debug "Cluster_host %s force destroyed." 
ref_str @@ -279,6 +297,7 @@ let forget ~__context ~self = Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ; (* must not disable the daemon here, because we declared another unreachable node dead, * not the current one *) + alert_for_cluster_host_leave ~__context ~cluster_host:self ; Db.Cluster_host.destroy ~__context ~self ; debug "Cluster_host.forget was successful" | Error error -> diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml index 82a27959e1a..699aa93420a 100644 --- a/ocaml/xapi/xapi_clustering.ml +++ b/ocaml/xapi/xapi_clustering.ml @@ -13,6 +13,7 @@ *) open Cluster_interface +open Xapi_cluster_helpers module D = Debug.Make (struct let name = "xapi_clustering" end) @@ -457,20 +458,27 @@ let on_corosync_update ~__context ~cluster updates = (fun h -> not (List.mem h quorum_hosts)) all_cluster_hosts in + let new_hosts = + List.filter + (fun h -> not (Db.Cluster_host.get_live ~__context ~self:h)) + quorum_hosts + in List.iter (fun self -> Db.Cluster_host.set_live ~__context ~self ~value:true ; Db.Cluster_host.set_last_update_live ~__context ~self ~value:current_time ) - quorum_hosts ; + (* NOTE(review): iterating new_hosts instead of quorum_hosts means hosts that were already live no longer get last_update_live refreshed here - confirm this is intended *) new_hosts ; List.iter (fun self -> Db.Cluster_host.set_live ~__context ~self ~value:false ; Db.Cluster_host.set_last_update_live ~__context ~self ~value:current_time ) - missing_hosts + missing_hosts ; + (* NOTE(review): missing_hosts appears to be every cluster host outside quorum_hosts, not only those that were live before this update, so a host that is already down would get a fresh leaving alert on each update - confirm against the full function *) maybe_generate_alert ~__context ~missing_hosts ~new_hosts + ~num_hosts:(List.length quorum_hosts) ~quorum:diag.quorum ) ; Db.Cluster.set_quorum ~__context ~self:cluster ~value:(Int64.of_int diag.quorum) ;