From 167e8ab8de4c26a41222d94fcf0ccbd1864a9774 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Mon, 11 Nov 2024 21:43:46 +0800
Subject: [PATCH] Trigger the election immediately when doing a manual failover
 (#1081)

Currently when a manual failover is triggered, we set
CLUSTER_TODO_HANDLE_FAILOVER to start the election as soon as
possible in the next beforeSleep. But in fact, we won't delay
the election in a manual failover, so waiting for the next
beforeSleep to kick in delays the election by some milliseconds.

We can trigger the election immediately in this case in the
same function call, without waiting for beforeSleep, which
can save us some milliseconds.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index f1c9eb1fcf..04a04774fe 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4519,8 +4519,9 @@ void clusterFailoverReplaceYourPrimary(void) {
  * 3) Perform the failover informing all the other nodes.
  */
 void clusterHandleReplicaFailover(void) {
+    mstime_t now = mstime();
     mstime_t data_age;
-    mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
+    mstime_t auth_age = now - server.cluster->failover_auth_time;
     int needed_quorum = (server.cluster->size / 2) + 1;
     int manual_failover = server.cluster->mf_end != 0 && server.cluster->mf_can_start;
     mstime_t auth_timeout, auth_retry_time;
@@ -4582,7 +4583,7 @@ void clusterHandleReplicaFailover(void) {
     /* If the previous failover attempt timeout and the retry time has
      * elapsed, we can setup a new one. */
     if (auth_age > auth_retry_time) {
-        server.cluster->failover_auth_time = mstime() +
+        server.cluster->failover_auth_time = now +
                                              500 +           /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
                                              random() % 500; /* Random delay between 0 and 500 milliseconds. */
         server.cluster->failover_auth_count = 0;
@@ -4594,20 +4595,26 @@ void clusterHandleReplicaFailover(void) {
         server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000;
         /* However if this is a manual failover, no delay is needed. */
         if (server.cluster->mf_end) {
-            server.cluster->failover_auth_time = mstime();
+            server.cluster->failover_auth_time = now;
             server.cluster->failover_auth_rank = 0;
-            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
+            /* Reset auth_age since it is outdated now and we can bypass the auth_timeout
+             * check in the next state and start the election ASAP. */
+            auth_age = 0;
         }
         serverLog(LL_NOTICE,
                   "Start of election delayed for %lld milliseconds "
                   "(rank #%d, offset %lld).",
-                  server.cluster->failover_auth_time - mstime(), server.cluster->failover_auth_rank,
+                  server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank,
                   replicationGetReplicaOffset());
         /* Now that we have a scheduled election, broadcast our offset
          * to all the other replicas so that they'll updated their offsets
          * if our offset is better. */
         clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_REPLICAS);
-        return;
+
+        /* Return ASAP if we can't start the election now. In a manual failover,
+         * we can start the election immediately, so in this case we continue to
+         * the next state without waiting for the next beforeSleep. */
+        if (now < server.cluster->failover_auth_time) return;
     }
 
     /* It is possible that we received more updated offsets from other
@@ -4627,7 +4634,7 @@ void clusterHandleReplicaFailover(void) {
     }
 
     /* Return ASAP if we can't still start the election. */
-    if (mstime() < server.cluster->failover_auth_time) {
+    if (now < server.cluster->failover_auth_time) {
         clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY);
         return;
     }