Skip to content

Commit

Permalink
Fix CLUSTER SETSLOT block and unblock error when all replicas are offline
Browse files Browse the repository at this point in the history

If

Signed-off-by: Binbin <[email protected]>
  • Loading branch information
enjoy-binbin committed Aug 8, 2024
1 parent 109cc21 commit e1ed369
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 28 deletions.
12 changes: 8 additions & 4 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6259,7 +6259,7 @@ void clusterCommandSetSlot(client *c) {
* To mitigate this issue, the following order needs to be enforced for slot
* migration finalization such that the replicas finalize the slot ownership
* before the primary:
. *
*
* 1. Client C issues SETSLOT n NODE B against node B.
* 2. Primary B replicates `SETSLOT n NODE B` to all of its replicas (e.g., B', B'').
* 3. Upon replication completion, primary B executes `SETSLOT n NODE B` and
Expand All @@ -6280,26 +6280,30 @@ void clusterCommandSetSlot(client *c) {
listIter li;
listNode *ln;
int legacy_replica_found = 0;
int online_replica_found = 0;
listRewind(server.replicas, &li);
while ((ln = listNext(&li))) {
client *r = ln->value;
if (r->replica_version < 0x702ff /* 7.2.255 */) {
legacy_replica_found++;
break;
}
if (r->repl_state == REPLICA_STATE_ONLINE) {
online_replica_found++;
}
}

if (!legacy_replica_found) {
if (!legacy_replica_found && online_replica_found) {
forceCommandPropagation(c, PROPAGATE_REPL);
/* We are a primary and this is the first time we see this `SETSLOT`
* command. Force-replicate the command to all of our replicas
* first and only on success will we handle the command.
* first and only on success we will handle the command.
* Note that
* 1. All replicas are expected to ack the replication within the given timeout
* 2. The repl offset target is set to the primary's current repl offset + 1.
* There is no concern of partial replication because replicas always
* ack the repl offset at the command boundary. */
blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, myself->num_replicas, 0);
blockClientForReplicaAck(c, timeout_ms, server.primary_repl_offset + 1, online_replica_found, 0);
/* Mark client as pending command for execution after replication to replicas. */
c->flag.pending_command = 1;
replicationRequestAckFromReplicas();
Expand Down
6 changes: 3 additions & 3 deletions src/replication.c
Original file line number Diff line number Diff line change
Expand Up @@ -4186,9 +4186,9 @@ void refreshGoodReplicasCount(void) {

/* return true if status of good replicas is OK. otherwise false */
int checkGoodReplicasStatus(void) {
return server.primary_host || /* not a primary status should be OK */
!server.repl_min_replicas_max_lag || /* Min replica max lag not configured */
!server.repl_min_replicas_to_write || /* Min replica to write not configured */
return server.primary_host || /* not a primary status should be OK */
!server.repl_min_replicas_max_lag || /* Min replica max lag not configured */
!server.repl_min_replicas_to_write || /* Min replica to write not configured */
server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */
}

Expand Down
21 changes: 0 additions & 21 deletions tests/cluster/tests/12-replica-migration-2.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ test "Resharding all the master #0 slots away from it" {
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=0 >@ stdout ]

}

test "Master #0 who lost all slots should turn into a replica without replicas" {
Expand All @@ -53,23 +52,3 @@ test "Master #0 who lost all slots should turn into a replica without replicas"
fail "Master #0 didn't turn itself into a replica"
}
}

test "Resharding back some slot to master #0" {
# Wait for the cluster config to propagate before attempting a
# new resharding.
after 10000
set output [exec \
../../../src/valkey-cli --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=.01 \
--cluster-use-empty-masters >@ stdout]
}

test "Master #0 should re-acquire one or more replicas" {
wait_for_condition 1000 50 {
[llength [lindex [R 0 role] 2]] >= 1
} else {
fail "Master #0 has no has replicas"
}
}

0 comments on commit e1ed369

Please sign in to comment.