add cluster fail reason to cluster info

Signed-off-by: Binbin <[email protected]>
enjoy-binbin · Oct 22, 2024 · 238f5ae · 238f5ae
1 parent 17d5949
commit 238f5ae
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 9 deletions.
diff --git a/src/cluster.h b/src/cluster.h
@@ -12,7 +12,8 @@
 #define CLUSTER_FAIL 1                                              /* The cluster can't work */
 #define CLUSTER_NAMELEN 40                                          /* sha1 hex length */
 
-/* Reason why the cluster state changes to fail. */
+/* Reason why the cluster state changes to fail. When adding new reasons,
+ * make sure to update getClusterFailReasonString and clusterLogFailReason. */
 #define CLUSTER_FAIL_NONE 0
 #define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
 #define CLUSTER_FAIL_MINORITY_PARTITION 2

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
@@ -4451,7 +4451,7 @@ void clusterLogCantFailover(int reason) {
     case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
     case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
     case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
-    default: serverPanic("Unknown cant failover reason code."); break;
+    default: serverPanic("Unknown cant failover reason code.");
     }
     lastlog_time = time(NULL);
     serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
@@ -5283,20 +5283,29 @@ void clusterCloseAllSlots(void) {
  * Cluster state evaluation function
  * -------------------------------------------------------------------------- */
 
-void clusterLogWhyFail(int reason) {
+const char *getClusterFailReasonString(void) {
+    switch (server.cluster->fail_reason) {
+    case CLUSTER_FAIL_NONE: return "none";
+    case CLUSTER_FAIL_NOT_FULL_COVERAGE: return "not-full-coverage";
+    case CLUSTER_FAIL_MINORITY_PARTITION: return "minority-partition";
+    default: serverPanic("Unknown fail reason code.");
+    }
+}
+
+void clusterLogFailReason(int reason) {
     if (reason == CLUSTER_FAIL_NONE) return;
 
     char *msg;
     switch (reason) {
     case CLUSTER_FAIL_NOT_FULL_COVERAGE:
-        msg = "At least one hash slot is not served by any available node.  "
+        msg = "At least one hash slot is not served by any available node. "
               "Please check the 'cluster-require-full-coverage' configuration.";
         break;
     case CLUSTER_FAIL_MINORITY_PARTITION: msg = "I am part of a minority partition."; break;
-    default: serverPanic("Unknown fail reason code."); break;
+    default: serverPanic("Unknown fail reason code.");
     }
-    serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
     server.cluster->fail_reason = reason;
+    serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
 }
 
 /* The following are defines that are only used in the evaluation function
@@ -5397,11 +5406,13 @@ void clusterUpdateState(void) {
         server.cluster->state = new_state;
 
         /* Cluster state changes from ok to fail, print a log. */
-        if (new_state == CLUSTER_FAIL) clusterLogWhyFail(new_reason);
+        if (new_state == CLUSTER_FAIL) clusterLogFailReason(new_reason);
     }
 
     /* Cluster state is still fail, but the reason has changed, print a log. */
-    if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) clusterLogWhyFail(new_reason);
+    if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) clusterLogFailReason(new_reason);
+
+    if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
 }
 
 /* This function is called after the node startup in order to verify that data
@@ -6012,6 +6023,7 @@ sds genClusterInfoString(void) {
 
     info = sdscatprintf(info,
                         "cluster_state:%s\r\n"
+                        "cluster_fail_reason:%s\r\n"
                         "cluster_slots_assigned:%d\r\n"
                         "cluster_slots_ok:%d\r\n"
                         "cluster_slots_pfail:%d\r\n"
@@ -6020,7 +6032,8 @@ sds genClusterInfoString(void) {
                         "cluster_size:%d\r\n"
                         "cluster_current_epoch:%llu\r\n"
                         "cluster_my_epoch:%llu\r\n",
-                        statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail,
+                        statestr[server.cluster->state], getClusterFailReasonString(),
+                        slots_assigned, slots_ok, slots_pfail, slots_fail,
                         dictSize(server.cluster->nodes), server.cluster->size,
                         (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
 

diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl
@@ -41,3 +41,40 @@ test "errorstats: rejected call due to MOVED Redirection" {
 }
 
 } ;# start_cluster
+
+start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
+    test "fail reason changed" {
+        # Kill one primary, so the cluster fail with not-full-coverage.
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] eq {fail} &&
+            [CI 2 cluster_state] eq {fail} &&
+            [CI 1 cluster_fail_reason] eq {not-full-coverage} &&
+            [CI 2 cluster_fail_reason] eq {not-full-coverage}
+        } else {
+            fail "Cluster doesn't fail or the fail reason is not changed"
+        }
+
+        # Kill one more primary, so the cluster fail with not-full-coverage.
+        pause_process [srv -1 pid]
+        wait_for_condition 1000 50 {
+            [CI 2 cluster_state] eq {fail} &&
+            [CI 2 cluster_fail_reason] eq {minority-partition}
+        } else {
+            fail "Cluster doesn't fail or the fail reason is not changed"
+        }
+
+        resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
+        wait_for_condition 1000 50 {
+            [CI 0 cluster_state] eq {ok} &&
+            [CI 1 cluster_state] eq {ok} &&
+            [CI 2 cluster_state] eq {ok} &&
+            [CI 0 cluster_fail_reason] eq {none} &&
+            [CI 1 cluster_fail_reason] eq {none} &&
+            [CI 2 cluster_fail_reason] eq {none}
+        } else {
+            fail "Cluster doesn't stabilize"
+        }
+    }
+}