From e1ed3699826bc720c15399ac0cab260436916ea5 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 8 Aug 2024 19:58:32 +0800
Subject: [PATCH] Fix CLUSTER SETSLOT block and unblock error when all
 replicas are offline

If

Signed-off-by: Binbin
---
 src/cluster_legacy.c                          | 12 +++++++----
 src/replication.c                             |  6 +++---
 .../cluster/tests/12-replica-migration-2.tcl  | 21 -------------------
 3 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index b35b0b3b8e..d0a594c9b7 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -6259,7 +6259,7 @@ void clusterCommandSetSlot(client *c) {
      * To mitigate this issue, the following order needs to be enforced for slot
      * migration finalization such that the replicas finalize the slot ownership
      * before the primary:
-     . *
+     *
      * 1. Client C issues SETSLOT n NODE B against node B.
      * 2. Primary B replicates `SETSLOT n NODE B` to all of its replicas (e.g., B', B'').
      * 3. Upon replication completion, primary B executes `SETSLOT n NODE B` and
@@ -6280,6 +6280,7 @@ void clusterCommandSetSlot(client *c) {
         listIter li;
         listNode *ln;
         int legacy_replica_found = 0;
+        int online_replica_found = 0;
         listRewind(server.replicas, &li);
         while ((ln = listNext(&li))) {
             client *r = ln->value;
@@ -6287,19 +6288,22 @@ void clusterCommandSetSlot(client *c) {
                 legacy_replica_found++;
                 break;
             }
+            if (r->repl_state == REPLICA_STATE_ONLINE) {
+                online_replica_found++;
+            }
         }
 
-        if (!legacy_replica_found) {
+        if (!legacy_replica_found && online_replica_found) {
            forceCommandPropagation(c, PROPAGATE_REPL);
            /* We are a primary and this is the first time we see this `SETSLOT`
             * command. Force-replicate the command to all of our replicas
-            * first and only on success will we handle the command.
+            * first and only on success we will handle the command.
             * Note that
             * 1. All replicas are expected to ack the replication within the given timeout
             * 2. The repl offset target is set to the primary's current repl offset + 1.
             * There is no concern of partial replication because replicas always
             * ack the repl offset at the command boundary. */
-           blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, myself->num_replicas, 0);
+           blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, online_replica_found, 0);
            /* Mark client as pending command for execution after replication to replicas. */
            c->flag.pending_command = 1;
            replicationRequestAckFromReplicas();
diff --git a/src/replication.c b/src/replication.c
index 6be8d3f9d5..155d5ad4f2 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -4186,9 +4186,9 @@ void refreshGoodReplicasCount(void) {
 }
 
 /* return true if status of good replicas is OK. otherwise false */
 int checkGoodReplicasStatus(void) {
-    return server.primary_host ||                /* not a primary status should be OK */
-           !server.repl_min_replicas_max_lag ||  /* Min replica max lag not configured */
-           !server.repl_min_replicas_to_write || /* Min replica to write not configured */
+    return server.primary_host ||                /* not a primary status should be OK */
+           !server.repl_min_replicas_max_lag ||  /* Min replica max lag not configured */
+           !server.repl_min_replicas_to_write || /* Min replica to write not configured */
            server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */
 }
diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl
index 97acc705f2..4f3abd612a 100644
--- a/tests/cluster/tests/12-replica-migration-2.tcl
+++ b/tests/cluster/tests/12-replica-migration-2.tcl
@@ -42,7 +42,6 @@ test "Resharding all the master #0 slots away from it" {
         127.0.0.1:[get_instance_attrib valkey 0 port] \
         {*}[valkeycli_tls_config "../../../tests"] \
         --cluster-weight ${master0_id}=0 >@ stdout ]
-
 }
 
 test "Master #0 who lost all slots should turn into a replica without replicas" {
@@ -53,23 +52,3 @@ test "Master #0 who lost all slots should turn into a replica without replicas"
         fail "Master #0 didn't turn itself into a replica"
     }
 }
-
-test "Resharding back some slot to master #0" {
-    # Wait for the cluster config to propagate before attempting a
-    # new resharding.
-    after 10000
-    set output [exec \
-        ../../../src/valkey-cli --cluster rebalance \
-        127.0.0.1:[get_instance_attrib valkey 0 port] \
-        {*}[valkeycli_tls_config "../../../tests"] \
-        --cluster-weight ${master0_id}=.01 \
-        --cluster-use-empty-masters >@ stdout]
-}
-
-test "Master #0 should re-acquire one or more replicas" {
-    wait_for_condition 1000 50 {
-        [llength [lindex [R 0 role] 2]] >= 1
-    } else {
-        fail "Master #0 has no has replicas"
-    }
-}
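
As an illustration of the cluster_legacy.c hunks above: the fix gates the blocking
path on the number of ONLINE replicas rather than on myself->num_replicas, so a
primary whose replicas are all offline applies the SETSLOT immediately instead of
blocking for acks that can never arrive. Below is a minimal standalone C sketch of
that decision; the types and the replicas_to_wait_for() helper are simplified,
hypothetical stand-ins, not the real client/server structures from the Valkey source.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins for the per-replica state tracked on the primary. */
typedef enum { REPLICA_STATE_CONNECTING, REPLICA_STATE_ONLINE } replica_state;

typedef struct {
    replica_state repl_state;
    int is_legacy; /* stands in for the pre-8.0.0 "legacy replica" check in the patch */
} replica;

/* How many replica acks the primary should wait for before executing the
 * SETSLOT; 0 means "do not block, execute immediately" (either a legacy
 * replica was found or no replica is currently online). */
static size_t replicas_to_wait_for(const replica *replicas, size_t n) {
    size_t online = 0;
    for (size_t i = 0; i < n; i++) {
        if (replicas[i].is_legacy) return 0;
        if (replicas[i].repl_state == REPLICA_STATE_ONLINE) online++;
    }
    return online;
}

int main(void) {
    replica all_offline[2] = {{REPLICA_STATE_CONNECTING, 0}, {REPLICA_STATE_CONNECTING, 0}};
    replica one_online[2] = {{REPLICA_STATE_ONLINE, 0}, {REPLICA_STATE_CONNECTING, 0}};
    /* Before the patch the primary blocked waiting on num_replicas acks even
     * when every replica was offline; with this logic it waits on zero. */
    printf("all offline -> wait on %zu replica acks\n", replicas_to_wait_for(all_offline, 2));
    printf("one online  -> wait on %zu replica acks\n", replicas_to_wait_for(one_online, 2));
    return 0;
}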