Skip to content

Commit

Permalink
Do shutdown failover only when the replica offset matches
Browse files Browse the repository at this point in the history
Signed-off-by: Binbin <[email protected]>
  • Loading branch information
enjoy-binbin committed Oct 28, 2024
1 parent 64831c9 commit b06a8c4
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 5 deletions.
9 changes: 6 additions & 3 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1200,10 +1200,13 @@ void clusterHandleServerShutdown(void) {
listRewind(server.replicas, &replicas_iter);
while ((replicas_list_node = listNext(&replicas_iter)) != NULL) {
client *replica = listNodeValue(replicas_list_node);
if (replica->repl_state != REPLICA_STATE_ONLINE) continue;
if (best_replica == NULL || replica->repl_ack_off > best_replica->repl_ack_off) best_replica = replica;
if (best_replica->repl_ack_off == server.primary_repl_offset) break;
/* This is done only when the replica offset is caught up, to avoid data loss */
if (replica->repl_state == REPLICA_STATE_ONLINE && replica->repl_ack_off == server.primary_repl_offset) {
best_replica = replica;
break;
}
}

if (best_replica) {
/* Send a CLUSTER FAILOVER FORCE to the best replica. */
const char *buf = "*3\r\n$7\r\nCLUSTER\r\n$8\r\nFAILOVER\r\n$5\r\nFORCE\r\n";
Expand Down
2 changes: 1 addition & 1 deletion tests/support/util.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ proc check_replica_acked_ofs {primary replica_ip replica_port} {
proc wait_replica_acked_ofs {primary replica replica_ip replica_port} {
$primary config set repl-ping-replica-period 3600
$replica config set hz 500
wait_for_condition 50 100 {
wait_for_condition 100 100 {
[check_replica_acked_ofs $primary $replica_ip $replica_port] eq 1
} else {
puts "INFO REPLICATION: [$primary info replication]"
Expand Down
7 changes: 6 additions & 1 deletion tests/unit/cluster/auto-failover-on-shutdown.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ proc test_main {how shutdown_timeout} {

$primary config set auto-failover-on-shutdown yes
$primary config set shutdown-timeout $shutdown_timeout
$primary config set repl-ping-replica-period 3600

# Prevent replica2 from starting a failover on its own.
$replica2 config set cluster-replica-no-failover yes

# Pause a replica so it has no chance to catch up with the offset.
pause_process $replica1_pid
Expand All @@ -29,7 +33,7 @@ proc test_main {how shutdown_timeout} {
$primary incr key_991803
}

if {$shutdown_timeout != 0} {
if {$shutdown_timeout == 0} {
# Wait for replica2 to catch up with the offset.
wait_for_ofs_sync $primary $replica2
wait_replica_acked_ofs $primary $replica2 $replica2_ip $replica2_port
Expand Down Expand Up @@ -63,6 +67,7 @@ proc test_main {how shutdown_timeout} {

pause_process $replica1_pid

$primary config set auto-failover-on-shutdown yes
$primary client kill type replica
shutdown_how 6 $how
wait_for_log_messages -6 {"*Unable to find a replica to perform an auto failover on shutdown*"} 0 1000 10
Expand Down

0 comments on commit b06a8c4

Please sign in to comment.