diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index dc6ff45f5a..7449d9d428 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1200,10 +1200,13 @@ void clusterHandleServerShutdown(void) { listRewind(server.replicas, &replicas_iter); while ((replicas_list_node = listNext(&replicas_iter)) != NULL) { client *replica = listNodeValue(replicas_list_node); - if (replica->repl_state != REPLICA_STATE_ONLINE) continue; - if (best_replica == NULL || replica->repl_ack_off > best_replica->repl_ack_off) best_replica = replica; - if (best_replica->repl_ack_off == server.primary_repl_offset) break; + /* Pick only an online replica whose acked offset equals the primary's replication offset (fully caught up), to avoid data loss. */ + if (replica->repl_state == REPLICA_STATE_ONLINE && replica->repl_ack_off == server.primary_repl_offset) { + best_replica = replica; + break; + } } + if (best_replica) { /* Send a CLUSTER FAILOVER FORCE to the best replica. */ const char *buf = "*3\r\n$7\r\nCLUSTER\r\n$8\r\nFAILOVER\r\n$5\r\nFORCE\r\n"; diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 0ad83bb7c4..49a9c273e9 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -141,7 +141,7 @@ proc check_replica_acked_ofs {primary replica_ip replica_port} { proc wait_replica_acked_ofs {primary replica replica_ip replica_port} { $primary config set repl-ping-replica-period 3600 $replica config set hz 500 - wait_for_condition 50 100 { + wait_for_condition 100 100 { [check_replica_acked_ofs $primary $replica_ip $replica_port] eq 1 } else { puts "INFO REPLICATION: [$primary info replication]" diff --git a/tests/unit/cluster/auto-failover-on-shutdown.tcl b/tests/unit/cluster/auto-failover-on-shutdown.tcl index f03fdba289..00e15e009e 100644 --- a/tests/unit/cluster/auto-failover-on-shutdown.tcl +++ b/tests/unit/cluster/auto-failover-on-shutdown.tcl @@ -20,6 +20,10 @@ proc test_main {how shutdown_timeout} { $primary config set auto-failover-on-shutdown yes $primary config set shutdown-timeout 
$shutdown_timeout + $primary config set repl-ping-replica-period 3600 + + # Prevent a failover from kicking in on replica2. + $replica2 config set cluster-replica-no-failover yes # Pause a replica so it has no chance to catch up with the offset. pause_process $replica1_pid @@ -29,7 +33,7 @@ proc test_main {how shutdown_timeout} { $primary incr key_991803 } - if {$shutdown_timeout != 0} { + if {$shutdown_timeout == 0} { # Wait the replica2 catch up with the offset wait_for_ofs_sync $primary $replica2 wait_replica_acked_ofs $primary $replica2 $replica2_ip $replica2_port @@ -63,6 +67,7 @@ proc test_main {how shutdown_timeout} { pause_process $replica1_pid + $primary config set auto-failover-on-shutdown yes $primary client kill type replica shutdown_how 6 $how wait_for_log_messages -6 {"*Unable to find a replica to perform an auto failover on shutdown*"} 0 1000 10