From 380f70081686d68739536990f5c8535d1382524d Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 6 Aug 2024 21:14:18 +0800
Subject: [PATCH] Improve cluster cant failover log conditions (#780)

This PR adjusts the logging conditions of clusterLogCantFailover
in this two ways.

1. For the same cant_failover_reason, we will print the log once
in CLUSTER_CANT_FAILOVER_RELOG_PERIOD, but its value is 10s, which
is a bit long, shorten it to 1s, so we can better track its state.
We get to see the system making progress by watching the message.
Using 1s also covers pretty much all cases as i don't see a reason
for using a <1s node timeout, test or prod.

2. We will not print logs before the nolog_fail_time, its value
is cluster-node-timeout+5000. This may casue us to lose some logs,
for example, if cluster-node-timeout is small, auth_timeout will
be 2000, and auth_retry_time will be 4000. In this case, we will
lose all the reasons during the election if the failover is timedout.
So remove the nolog_fail_time logic, since we still do have the
CLUSTER_CANT_FAILOVER_RELOG_PERIOD logic, we won't print too many
logs.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 11 -----------
 src/cluster_legacy.h |  2 +-
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 8d391510a3..b35b0b3b8e 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4356,9 +4356,6 @@ int clusterGetReplicaRank(void) {
  * 2) Also, the log is emitted again if the primary is still down and
  *    the reason for not failing over is still the same, but more than
  *    CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed.
- * 3) Finally, the function only logs if the replica is down for more than
- *    five seconds + NODE_TIMEOUT. This way nothing is logged when a
- *    failover starts in a reasonable time.
  *
  * The function is called with the reason why the replica can't failover
  * which is one of the integer macros CLUSTER_CANT_FAILOVER_*.
@@ -4367,7 +4364,6 @@ int clusterGetReplicaRank(void) {
 void clusterLogCantFailover(int reason) {
     char *msg;
     static time_t lastlog_time = 0;
-    mstime_t nolog_fail_time = server.cluster_node_timeout + 5000;
 
     /* Don't log if we have the same reason for some time. */
     if (reason == server.cluster->cant_failover_reason &&
@@ -4376,13 +4372,6 @@ void clusterLogCantFailover(int reason) {
 
     server.cluster->cant_failover_reason = reason;
 
-    /* We also don't emit any log if the primary failed no long ago, the
-     * goal of this function is to log replicas in a stalled condition for
-     * a long time. */
-    if (myself->replicaof && nodeFailed(myself->replicaof) &&
-        (mstime() - myself->replicaof->fail_time) < nolog_fail_time)
-        return;
-
     switch (reason) {
     case CLUSTER_CANT_FAILOVER_DATA_AGE:
         msg = "Disconnected from primary for longer than allowed. "
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index 7dbb33a708..7184a2b204 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -17,7 +17,7 @@
 #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
 #define CLUSTER_CANT_FAILOVER_EXPIRED 3
 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
-#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */
+#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 1 /* seconds. */
 
 /* clusterState todo_before_sleep flags. */
 #define CLUSTER_TODO_HANDLE_FAILOVER (1 << 0)