Skip to content

Commit

Permalink
Fix CLUSTER SETSLOT block and unblock error when all replicas are offline
Browse files Browse the repository at this point in the history

If

Signed-off-by: Binbin <[email protected]>
  • Loading branch information
enjoy-binbin committed Aug 8, 2024
1 parent 109cc21 commit e1ed369
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 28 deletions.
12 changes: 8 additions & 4 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6259,7 +6259,7 @@ void clusterCommandSetSlot(client *c) {
* To mitigate this issue, the following order needs to be enforced for slot
* migration finalization such that the replicas finalize the slot ownership
* before the primary:
. *
*
* 1. Client C issues SETSLOT n NODE B against node B.
* 2. Primary B replicates `SETSLOT n NODE B` to all of its replicas (e.g., B', B'').
* 3. Upon replication completion, primary B executes `SETSLOT n NODE B` and
Expand All @@ -6280,26 +6280,30 @@ void clusterCommandSetSlot(client *c) {
listIter li;
listNode *ln;
int legacy_replica_found = 0;
int online_replica_found = 0;
listRewind(server.replicas, &li);
while ((ln = listNext(&li))) {
client *r = ln->value;
if (r->replica_version < 0x702ff /* 7.2.255 */) {
legacy_replica_found++;
break;
}
if (r->repl_state == REPLICA_STATE_ONLINE) {
online_replica_found++;
}
}

if (!legacy_replica_found) {
if (!legacy_replica_found && online_replica_found) {
forceCommandPropagation(c, PROPAGATE_REPL);
/* We are a primary and this is the first time we see this `SETSLOT`
* command. Force-replicate the command to all of our replicas
* first and only on success will we handle the command.
* first and only on success we will handle the command.
* Note that
* 1. All replicas are expected to ack the replication within the given timeout
* 2. The repl offset target is set to the primary's current repl offset + 1.
* There is no concern of partial replication because replicas always
* ack the repl offset at the command boundary. */
blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, myself->num_replicas, 0);
blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, online_replica_found, 0);
/* Mark client as pending command for execution after replication to replicas. */
c->flag.pending_command = 1;
replicationRequestAckFromReplicas();
Expand Down
6 changes: 3 additions & 3 deletions src/replication.c
Original file line number Diff line number Diff line change
Expand Up @@ -4186,9 +4186,9 @@ void refreshGoodReplicasCount(void) {

/* return true if status of good replicas is OK. otherwise false */
int checkGoodReplicasStatus(void) {
return server.primary_host || /* not a primary status should be OK */
!server.repl_min_replicas_max_lag || /* Min replica max lag not configured */
!server.repl_min_replicas_to_write || /* Min replica to write not configured */
return server.primary_host || /* not a primary status should be OK */
!server.repl_min_replicas_max_lag || /* Min replica max lag not configured */
!server.repl_min_replicas_to_write || /* Min replica to write not configured */
server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */
}

Expand Down
21 changes: 0 additions & 21 deletions tests/cluster/tests/12-replica-migration-2.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ test "Resharding all the master #0 slots away from it" {
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=0 >@ stdout ]

}

test "Master #0 who lost all slots should turn into a replica without replicas" {
Expand All @@ -53,23 +52,3 @@ test "Master #0 who lost all slots should turn into a replica without replicas"
fail "Master #0 didn't turn itself into a replica"
}
}

test "Resharding back some slot to master #0" {
# Wait for the cluster config to propagate before attempting a
# new resharding.
after 10000
set output [exec \
../../../src/valkey-cli --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=.01 \
--cluster-use-empty-masters >@ stdout]
}

test "Master #0 should re-acquire one or more replicas" {
wait_for_condition 1000 50 {
[llength [lindex [R 0 role] 2]] >= 1
} else {
fail "Master #0 has no has replicas"
}
}

0 comments on commit e1ed369

Please sign in to comment.