From e1ed3699826bc720c15399ac0cab260436916ea5 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 8 Aug 2024 19:58:32 +0800
Subject: [PATCH] Fix CLUSTER SETSLOT block and unblock error when all
 replicas are offline

If

Signed-off-by: Binbin
---
 src/cluster_legacy.c                          | 12 +++++++----
 src/replication.c                             |  6 +++---
 .../cluster/tests/12-replica-migration-2.tcl  | 21 -------------------
 3 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index b35b0b3b8e..d0a594c9b7 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -6259,7 +6259,7 @@ void clusterCommandSetSlot(client *c) {
      * To mitigate this issue, the following order needs to be enforced for slot
      * migration finalization such that the replicas finalize the slot ownership
      * before the primary:
-     . *
+     *
      * 1. Client C issues SETSLOT n NODE B against node B.
      * 2. Primary B replicates `SETSLOT n NODE B` to all of its replicas (e.g., B', B'').
      * 3. Upon replication completion, primary B executes `SETSLOT n NODE B` and
@@ -6280,6 +6280,7 @@ void clusterCommandSetSlot(client *c) {
         listIter li;
         listNode *ln;
         int legacy_replica_found = 0;
+        int online_replica_found = 0;
         listRewind(server.replicas, &li);
         while ((ln = listNext(&li))) {
             client *r = ln->value;
@@ -6287,19 +6288,22 @@ void clusterCommandSetSlot(client *c) {
                 legacy_replica_found++;
                 break;
             }
+            if (r->repl_state == REPLICA_STATE_ONLINE) {
+                online_replica_found++;
+            }
         }
 
-        if (!legacy_replica_found) {
+        if (!legacy_replica_found && online_replica_found) {
            forceCommandPropagation(c, PROPAGATE_REPL);
            /* We are a primary and this is the first time we see this `SETSLOT`
             * command. Force-replicate the command to all of our replicas
-            * first and only on success will we handle the command.
+            * first and only on success we will handle the command.
             * Note that
             * 1. All replicas are expected to ack the replication within the given timeout
             * 2. The repl offset target is set to the primary's current repl offset + 1.
             * There is no concern of partial replication because replicas always
             * ack the repl offset at the command boundary. */
-           blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, myself->num_replicas, 0);
+           blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, online_replica_found, 0);
            /* Mark client as pending command for execution after replication to replicas. */
            c->flag.pending_command = 1;
            replicationRequestAckFromReplicas();
diff --git a/src/replication.c b/src/replication.c
index 6be8d3f9d5..155d5ad4f2 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -4186,9 +4186,9 @@ void refreshGoodReplicasCount(void) {
 }
 
 /* return true if status of good replicas is OK. otherwise false */
 int checkGoodReplicasStatus(void) {
-    return server.primary_host ||                /* not a primary status should be OK */
-           !server.repl_min_replicas_max_lag ||  /* Min replica max lag not configured */
-           !server.repl_min_replicas_to_write || /* Min replica to write not configured */
+    return server.primary_host ||                /* not a primary status should be OK */
+           !server.repl_min_replicas_max_lag ||  /* Min replica max lag not configured */
+           !server.repl_min_replicas_to_write || /* Min replica to write not configured */
            server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */
 }
diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl
index 97acc705f2..4f3abd612a 100644
--- a/tests/cluster/tests/12-replica-migration-2.tcl
+++ b/tests/cluster/tests/12-replica-migration-2.tcl
@@ -42,7 +42,6 @@ test "Resharding all the master #0 slots away from it" {
         127.0.0.1:[get_instance_attrib valkey 0 port] \
         {*}[valkeycli_tls_config "../../../tests"] \
         --cluster-weight ${master0_id}=0 >@ stdout ]
-
 }
 
 test "Master #0 who lost all slots should turn into a replica without replicas" {
@@ -53,23 +52,3 @@ test "Master #0 who lost all slots should turn into a replica without replicas"
         fail "Master #0 didn't turn itself into a replica"
     }
 }
-
-test "Resharding back some slot to master #0" {
-    # Wait for the cluster config to propagate before attempting a
-    # new resharding.
-    after 10000
-    set output [exec \
-        ../../../src/valkey-cli --cluster rebalance \
-        127.0.0.1:[get_instance_attrib valkey 0 port] \
-        {*}[valkeycli_tls_config "../../../tests"] \
-        --cluster-weight ${master0_id}=.01 \
-        --cluster-use-empty-masters >@ stdout]
-}
-
-test "Master #0 should re-acquire one or more replicas" {
-    wait_for_condition 1000 50 {
-        [llength [lindex [R 0 role] 2]] >= 1
-    } else {
-        fail "Master #0 has no has replicas"
-    }
-}
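
As an illustration of the cluster_legacy.c hunks above: the fix gates the blocking
path on the number of ONLINE replicas rather than on myself->num_replicas, so a
primary whose replicas are all offline applies the SETSLOT immediately instead of
blocking for acks that can never arrive. Below is a minimal standalone C sketch of
that decision; the types and the replicas_to_wait_for() helper are simplified,
hypothetical stand-ins, not the real client/server structures from the Valkey source.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins for the per-replica state tracked on the primary. */
typedef enum { REPLICA_STATE_CONNECTING, REPLICA_STATE_ONLINE } replica_state;

typedef struct {
    replica_state repl_state;
    int is_legacy; /* stands in for the pre-8.0.0 "legacy replica" check in the patch */
} replica;

/* How many replica acks the primary should wait for before executing the
 * SETSLOT; 0 means "do not block, execute immediately" (either a legacy
 * replica was found or no replica is currently online). */
static size_t replicas_to_wait_for(const replica *replicas, size_t n) {
    size_t online = 0;
    for (size_t i = 0; i < n; i++) {
        if (replicas[i].is_legacy) return 0;
        if (replicas[i].repl_state == REPLICA_STATE_ONLINE) online++;
    }
    return online;
}

int main(void) {
    replica all_offline[2] = {{REPLICA_STATE_CONNECTING, 0}, {REPLICA_STATE_CONNECTING, 0}};
    replica one_online[2] = {{REPLICA_STATE_ONLINE, 0}, {REPLICA_STATE_CONNECTING, 0}};
    /* Before the patch the primary blocked waiting on num_replicas acks even
     * when every replica was offline; with this logic it waits on zero. */
    printf("all offline -> wait on %zu replica acks\n", replicas_to_wait_for(all_offline, 2));
    printf("one online  -> wait on %zu replica acks\n", replicas_to_wait_for(one_online, 2));
    return 0;
}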