From ee680e67e40324af309252a90973be0c914546ab Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 13 Jul 2024 23:20:51 +0800 Subject: [PATCH] Optimize failover time when the new primary node is down again We will not reset failover_auth_time after setting it, this is used to check auth_timeout and auth_retry_time, but we should at least reset it after a successful failover. Let's assume the following scenario: 1. Two replicas initiate an election. 2. Replica 1 is elected as the primary node, and replica 2 does not have enough votes. 3. Replica 1 is down, ie the new primary node down again in a short time. 4. Replica 2 know that the new primary node is down and wants to initiate a failover, but because the failover_auth_time of the previous round has not been reset, it needs to wait for it to time out and then wait for the next retry time, which will take cluster-node-timeout * 4 times, this adds a lot of delay. There is another problem. Like we will set additional random time for failover_auth_time, such as random 500ms and replicas ranking 1s. If replica 2 receives PONG from the new primary node before sending the FAILOVER_AUTH_REQUEST, that is, before the failover_auth_time, it will change itself to a replica. If the new primary node goes down again at this time, replica 2 will use the previous failover_auth_time to initiate an election instead of going through the logic of random 500ms and replicas ranking 1s again, which may lead to unexpected consequences (for example, a low-ranking replica initiates an election and becomes the new primary node). That is, we need to reset failover_auth_time at the appropriate time. When the replica switches to a new primary, we reset it, because the existing failover_auth_time is already out of date in this case. Signed-off-by: Binbin --- src/cluster_legacy.c | 7 ++++ tests/unit/cluster/failover2.tcl | 62 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tests/unit/cluster/failover2.tcl diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 035b9fc876..02b9edf35c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5259,6 +5259,13 @@ void clusterSetPrimary(clusterNode *n, int closeSlots) { replicationSetPrimary(n->ip, getNodeDefaultReplicationPort(n)); removeAllNotOwnedShardChannelSubscriptions(); resetManualFailover(); + + if (server.cluster->failover_auth_time) { + /* Since we have changed to a new primary node, the previously set + * failover_auth_time should no longer be used, whether it is in + * progress or timed out. */ + server.cluster->failover_auth_time = 0; + } } /* ----------------------------------------------------------------------------- diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl new file mode 100644 index 0000000000..8098f96221 --- /dev/null +++ b/tests/unit/cluster/failover2.tcl @@ -0,0 +1,62 @@ +# Check the basic monitoring and failover capabilities. + +start_cluster 3 6 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { + +test "Cluster is up" { + wait_for_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test [srv 0 port] +} + +set paused_pid [srv 0 pid] +test "Killing one primary node" { + pause_process $paused_pid +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [s -3 role] == "master" || [s -6 role] == "master" + } else { + fail "No failover detected" + } +} + +test "Killing the new primary node" { + if {[s -3 role] == "master"} { + set replica_to_be_primary -6 + set paused_pid2 [srv -3 pid] + } else { + set replica_to_be_primary -3 + set paused_pid2 [srv -6 pid] + } + pause_process $paused_pid2 +} + +test "Cluster should eventually be up again" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + if {[process_is_paused $paused_pid2]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } +} + +test "wait new failover" { + wait_for_condition 1000 100 { + [s $replica_to_be_primary role] == "master" + } else { + fail "No failover detected" + } +} + +test "Restarting the previously killed primary nodes" { + resume_process $paused_pid + resume_process $paused_pid2 +} + +} ;# start_cluster