From 43e710bfeda3cf38390dba014cd5729cc0a1e1d9 Mon Sep 17 00:00:00 2001
From: Vincent Liu <shuntian.liu2@cloud.com>
Date: Tue, 21 May 2024 17:01:49 +0100
Subject: [PATCH] CP-49635: Add FIST point for corosync upgrade

This forces a failure on a host that is trying to perform the corosync
upgrade. There are ways to recover: if the failure happens early, before
the cluster is created in the DB, then recreating the cluster ought to
fix the problem. This is the case when the corosync upgrade fails on the
coordinator.

If the failure happens on a pool member, after the cluster has been
created, then a `pool-resync` should help retry the upgrade.

Hopefully this can simulate some of the failure paths, but it is by no
means exhaustive. Other, more complicated failures are not easily
recoverable and are therefore not simulated for now.

Signed-off-by: Vincent Liu <shuntian.liu2@cloud.com>
---
 ocaml/xapi/xapi_clustering.ml | 29 ++++++++++++++++-------------
 ocaml/xapi/xapi_fist.ml       |  3 +++
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml
index bdaf8e9b39d..4fc9314aa2e 100644
--- a/ocaml/xapi/xapi_clustering.ml
+++ b/ocaml/xapi/xapi_clustering.ml
@@ -328,19 +328,22 @@ let rpc ~__context =
 
 let maybe_switch_cluster_stack_version ~__context ~self ~cluster_stack =
   if Xapi_cluster_helpers.corosync3_enabled ~__context then
-    let dbg = Context.string_of_task_and_tracing __context in
-    let result =
-      Cluster_client.LocalClient.switch_cluster_stack (rpc ~__context) dbg
-        cluster_stack
-    in
-    match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
-    | Ok () ->
-        debug "cluster stack switching was successful for cluster_host: %s"
-          (Ref.string_of self)
-    | Error error ->
-        warn "Error encountered when switching cluster stack cluster_host %s"
-          (Ref.string_of self) ;
-        handle_error error
+    if Xapi_fist.fail_corosync_upgrade () then
+      handle_error (InternalError "simulated corosync upgrade failure")
+    else
+      let dbg = Context.string_of_task_and_tracing __context in
+      let result =
+        Cluster_client.LocalClient.switch_cluster_stack (rpc ~__context) dbg
+          cluster_stack
+      in
+      match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
+      | Ok () ->
+          debug "cluster stack switching was successful for cluster_host: %s"
+            (Ref.string_of self)
+      | Error error ->
+          warn "Error encountered when switching cluster stack cluster_host %s"
+            (Ref.string_of self) ;
+          handle_error error
 
 let assert_cluster_host_quorate ~__context ~self =
   (* With the latest kernel GFS2 would hang on mount if clustering is not working yet,
diff --git a/ocaml/xapi/xapi_fist.ml b/ocaml/xapi/xapi_fist.ml
index 95d2f9e3874..d13cbd92628 100644
--- a/ocaml/xapi/xapi_fist.ml
+++ b/ocaml/xapi/xapi_fist.ml
@@ -69,6 +69,9 @@ let reconfigure_host () = fistpoint "reconfigure_host"
 (** allow starting up a corosync2 cluster *)
 let allow_corosync2 () = fistpoint "allow_corosync2"
 
+(** Make the current node fail the corosync upgrade *)
+let fail_corosync_upgrade () = fistpoint "fail_corosync_upgrade"
+
 (** Raise MTC_EXIT_CAN_NOT_ACCESS_STATEFILE *)
 let ha_cannot_access_statefile () = fistpoint "ha_cannot_access_statefile"