Skip to content

Commit

Permalink
add cluster fail reason to cluster info
Browse files Browse the repository at this point in the history
Signed-off-by: Binbin <[email protected]>
  • Loading branch information
enjoy-binbin committed Oct 22, 2024
1 parent 17d5949 commit 238f5ae
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 9 deletions.
3 changes: 2 additions & 1 deletion src/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
#define CLUSTER_FAIL 1 /* The cluster can't work */
#define CLUSTER_NAMELEN 40 /* sha1 hex length */

/* Reason why the cluster state changes to fail. */
/* Reason why the cluster state changes to fail. When adding new reasons,
* make sure to update getClusterFailReasonString and clusterLogFailReason. */
#define CLUSTER_FAIL_NONE 0
#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
#define CLUSTER_FAIL_MINORITY_PARTITION 2
Expand Down
29 changes: 21 additions & 8 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -4451,7 +4451,7 @@ void clusterLogCantFailover(int reason) {
case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
default: serverPanic("Unknown cant failover reason code."); break;
default: serverPanic("Unknown cant failover reason code.");
}
lastlog_time = time(NULL);
serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
Expand Down Expand Up @@ -5283,20 +5283,29 @@ void clusterCloseAllSlots(void) {
* Cluster state evaluation function
* -------------------------------------------------------------------------- */

void clusterLogWhyFail(int reason) {
/* Map the fail reason currently stored in server.cluster->fail_reason to the
 * short name that is printed in the cluster_fail_reason field of the cluster
 * info string. The returned pointer refers to a string literal and must not
 * be freed. An unrecognised reason code triggers a panic. */
const char *getClusterFailReasonString(void) {
    const char *name = NULL;

    switch (server.cluster->fail_reason) {
    case CLUSTER_FAIL_NONE:
        name = "none";
        break;
    case CLUSTER_FAIL_NOT_FULL_COVERAGE:
        name = "not-full-coverage";
        break;
    case CLUSTER_FAIL_MINORITY_PARTITION:
        name = "minority-partition";
        break;
    default:
        serverPanic("Unknown fail reason code.");
    }

    return name;
}

/* Log, at warning level, why the cluster is down and store `reason` in
 * server.cluster->fail_reason.
 *
 * reason: one of the CLUSTER_FAIL_* codes. CLUSTER_FAIL_NONE returns
 *         immediately without logging or storing anything; a code that is
 *         not handled by the switch below triggers a panic. */
void clusterLogFailReason(int reason) {
if (reason == CLUSTER_FAIL_NONE) return;

char *msg;
switch (reason) {
case CLUSTER_FAIL_NOT_FULL_COVERAGE:
msg = "At least one hash slot is not served by any available node. "
msg = "At least one hash slot is not served by any available node. "
"Please check the 'cluster-require-full-coverage' configuration.";
break;
case CLUSTER_FAIL_MINORITY_PARTITION: msg = "I am part of a minority partition."; break;
default: serverPanic("Unknown fail reason code."); break;
default: serverPanic("Unknown fail reason code.");
}
serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
/* Remember the reason: clusterUpdateState compares against it to detect a
 * changed fail reason, and getClusterFailReasonString reports it. */
server.cluster->fail_reason = reason;
serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
}

/* The following are defines that are only used in the evaluation function
Expand Down Expand Up @@ -5397,11 +5406,13 @@ void clusterUpdateState(void) {
server.cluster->state = new_state;

/* Cluster state changes from ok to fail, print a log. */
if (new_state == CLUSTER_FAIL) clusterLogWhyFail(new_reason);
if (new_state == CLUSTER_FAIL) clusterLogFailReason(new_reason);
}

/* Cluster state is still fail, but the reason has changed, print a log. */
if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) clusterLogWhyFail(new_reason);
if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) clusterLogFailReason(new_reason);

if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
}

/* This function is called after the node startup in order to verify that data
Expand Down Expand Up @@ -6012,6 +6023,7 @@ sds genClusterInfoString(void) {

info = sdscatprintf(info,
"cluster_state:%s\r\n"
"cluster_fail_reason:%s\r\n"
"cluster_slots_assigned:%d\r\n"
"cluster_slots_ok:%d\r\n"
"cluster_slots_pfail:%d\r\n"
Expand All @@ -6020,7 +6032,8 @@ sds genClusterInfoString(void) {
"cluster_size:%d\r\n"
"cluster_current_epoch:%llu\r\n"
"cluster_my_epoch:%llu\r\n",
statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail,
statestr[server.cluster->state], getClusterFailReasonString(),
slots_assigned, slots_ok, slots_pfail, slots_fail,
dictSize(server.cluster->nodes), server.cluster->size,
(unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));

Expand Down
37 changes: 37 additions & 0 deletions tests/unit/cluster/info.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,40 @@ test "errorstats: rejected call due to MOVED Redirection" {
}

} ;# start_cluster

# Check that the cluster_fail_reason field of the cluster info output follows
# the cluster state: not-full-coverage, then minority-partition, then none.
start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
test "fail reason changed" {
# Pause one primary, so the other two nodes report fail with not-full-coverage.
pause_process [srv 0 pid]
wait_for_condition 1000 50 {
[CI 1 cluster_state] eq {fail} &&
[CI 2 cluster_state] eq {fail} &&
[CI 1 cluster_fail_reason] eq {not-full-coverage} &&
[CI 2 cluster_fail_reason] eq {not-full-coverage}
} else {
fail "Cluster doesn't fail or the fail reason is not changed"
}

# Pause one more primary, so the last running node reports fail with minority-partition.
pause_process [srv -1 pid]
wait_for_condition 1000 50 {
[CI 2 cluster_state] eq {fail} &&
[CI 2 cluster_fail_reason] eq {minority-partition}
} else {
fail "Cluster doesn't fail or the fail reason is not changed"
}

# Resume both paused primaries and wait until every node reports ok with
# the fail reason reset to none.
resume_process [srv 0 pid]
resume_process [srv -1 pid]
wait_for_condition 1000 50 {
[CI 0 cluster_state] eq {ok} &&
[CI 1 cluster_state] eq {ok} &&
[CI 2 cluster_state] eq {ok} &&
[CI 0 cluster_fail_reason] eq {none} &&
[CI 1 cluster_fail_reason] eq {none} &&
[CI 2 cluster_fail_reason] eq {none}
} else {
fail "Cluster doesn't stabilize"
}
}
}

0 comments on commit 238f5ae

Please sign in to comment.