From 59aa00823c3691c50da452f2df2c4bc24e46696d Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 23 Jul 2024 14:43:16 +0800
Subject: [PATCH] Replicas with the same offset queue up for election (#762)

In some cases, like read more than write scenario, the replication
offset of the replicas are the same. When the primary fails, the
replicas have the same rankings (rank == 0). They issue the election
at the same time (although we have a random 500), the simultaneous
elections may lead to the failure of the election due to quorum.

In clusterGetReplicaRank, when we calculates the rank, if the offsets
are the same, the one with the smaller node name will have a better
rank to avoid this situation.

---------

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c            | 19 +++++++---
 tests/support/util.tcl          |  2 +-
 tests/unit/cluster/failover.tcl | 61 +++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index bca0e95a4b..032f3449d7 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4146,6 +4146,10 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
  * replication offset, and so forth. Note that because how the rank is computed
  * multiple replicas may have the same rank, in case they have the same offset.
  *
+ * If the replication offsets are the same, the one with the lexicographically
+ * smaller node id will have a lower rank to avoid simultaneous elections
+ * of replicas.
+ *
  * The replica rank is used to add a delay to start an election in order to
  * get voted and replace a failing primary. Replicas with better replication
  * offsets are more likely to win. */
@@ -4159,10 +4163,17 @@ int clusterGetReplicaRank(void) {
     if (primary == NULL) return 0; /* Never called by replicas without primary. */
 
     myoffset = replicationGetReplicaOffset();
-    for (j = 0; j < primary->num_replicas; j++)
-        if (primary->replicas[j] != myself && !nodeCantFailover(primary->replicas[j]) &&
-            primary->replicas[j]->repl_offset > myoffset)
+    for (j = 0; j < primary->num_replicas; j++) {
+        if (primary->replicas[j] == myself) continue;
+        if (nodeCantFailover(primary->replicas[j])) continue;
+
+        if (primary->replicas[j]->repl_offset > myoffset) {
             rank++;
+        } else if (primary->replicas[j]->repl_offset == myoffset &&
+                   memcmp(primary->replicas[j]->name, myself->name, CLUSTER_NAMELEN) < 0) {
+            rank++;
+        }
+    }
     return rank;
 }
 
@@ -4372,7 +4383,7 @@ void clusterHandleReplicaFailover(void) {
      * Not performed if this is a manual failover. */
     if (server.cluster->failover_auth_sent == 0 && server.cluster->mf_end == 0) {
         int newrank = clusterGetReplicaRank();
-        if (newrank > server.cluster->failover_auth_rank) {
+        if (newrank != server.cluster->failover_auth_rank) {
             long long added_delay = (newrank - server.cluster->failover_auth_rank) * 1000;
             server.cluster->failover_auth_time += added_delay;
             server.cluster->failover_auth_rank = newrank;
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index 2e2c70f205..e53cda3071 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -171,7 +171,7 @@ proc count_log_message {srv_idx pattern} {
     return [count_message_lines $stdout $pattern]
 }
 
-# verify pattern exists in server's sdtout after a certain line number
+# verify pattern exists in server's stdout after a certain line number
 proc verify_log_message {srv_idx pattern from_line} {
     incr from_line
     set result [exec tail -n +$from_line < [srv $srv_idx stdout]]
diff --git a/tests/unit/cluster/failover.tcl b/tests/unit/cluster/failover.tcl
index f36fd47fe8..61873df6f1 100644
--- a/tests/unit/cluster/failover.tcl
+++ b/tests/unit/cluster/failover.tcl
@@ -70,3 +70,64 @@ test "Instance #0 gets converted into a slave" {
 }
 
 } ;# start_cluster
+
+start_cluster 3 6 {tags {external:skip cluster}} {
+
+    test "Cluster is up" {
+        wait_for_cluster_state ok
+    }
+
+    test "Cluster is writable" {
+        cluster_write_test [srv 0 port]
+    }
+
+    set current_epoch [CI 1 cluster_current_epoch]
+
+    set paused_pid [srv 0 pid]
+    test "Killing the first primary node" {
+        pause_process $paused_pid
+    }
+
+    test "Wait for failover" {
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_current_epoch] > $current_epoch
+        } else {
+            fail "No failover detected"
+        }
+    }
+
+    test "Cluster should eventually be up again" {
+        for {set j 0} {$j < [llength $::servers]} {incr j} {
+            if {[process_is_paused [srv -$j pid]]} continue
+            wait_for_condition 1000 50 {
+                [CI $j cluster_state] eq "ok"
+            } else {
+                fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+            }
+        }
+    }
+
+    test "Restarting the previously killed primary node" {
+        resume_process $paused_pid
+    }
+
+    test "Instance #0 gets converted into a replica" {
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave}
+        } else {
+            fail "Old primary was not converted into replica"
+        }
+        wait_for_cluster_propagation
+    }
+
+    test "Make sure the replicas always get the different ranks" {
+        if {[s -3 role] == "master"} {
+            verify_log_message -3 "*Start of election*rank #0*" 0
+            verify_log_message -6 "*Start of election*rank #1*" 0
+        } else {
+            verify_log_message -3 "*Start of election*rank #1*" 0
+            verify_log_message -6 "*Start of election*rank #0*" 0
+        }
+    }
+
+} ;# start_cluster