diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 892c722b00..6d32ddf26c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -726,10 +726,7 @@ int clusterLoadConfig(char *filename) { return C_OK; fmterr: - serverLog(LL_WARNING, "Unrecoverable error: corrupted cluster config file \"%s\".", line); - zfree(line); - if (fp) fclose(fp); - exit(1); + serverPanic("Unrecoverable error: corrupted cluster config file \"%s\".", line); } /* Cluster node configuration is exactly the same as CLUSTER NODES output. @@ -973,6 +970,27 @@ static void updateAnnouncedClientIpV6(clusterNode *node, char *value) { } static void updateShardId(clusterNode *node, const char *shard_id) { + /* Ensure replica shard IDs match their primary's to maintain cluster consistency. + * + * Shard ID updates must prioritize the primary, then propagate to replicas. + * This is critical due to the eventual consistency of shard IDs during cluster + * expansion. New replicas might replicate from a primary before fully + * synchronizing shard IDs with the rest of the cluster. + * + * Without this enforcement, a temporary inconsistency can arise where a + * replica's shard ID diverges from its primary's. This inconsistency is + * persisted in the primary's nodes.conf file. While this divergence will + * eventually resolve, if the primary crashes beforehand, it will enter a + * crash-restart loop due to the mismatch in its nodes.conf. */ + if (shard_id && nodeIsReplica(node) && + memcmp(clusterNodeGetPrimary(node)->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { + serverLog( + LL_NOTICE, + "Shard id %.40s update request for node id %.40s diverges from existing primary shard id %.40s, rejecting!", + shard_id, node->name, clusterNodeGetPrimary(node)->shard_id); + return; + } + if (shard_id && memcmp(node->shard_id, shard_id, CLUSTER_NAMELEN) != 0) { clusterRemoveNodeFromShard(node); memcpy(node->shard_id, shard_id, CLUSTER_NAMELEN); diff --git a/tests/support/cluster.tcl b/tests/support/cluster.tcl index 2b9e44f64f..b69b9714f0 100644 --- a/tests/support/cluster.tcl +++ b/tests/support/cluster.tcl @@ -365,3 +365,9 @@ proc ::valkey_cluster::get_slot_from_keys {keys} { } return $slot } + +# Check if the server responds with "PONG" +proc check_server_response {server_id} { + # Send a PING command and check if the response is "PONG" + return [expr {[catch {R $server_id PING} result] == 0 && $result eq "PONG"}] +} diff --git a/tests/unit/cluster/shardid-propagation.tcl b/tests/unit/cluster/shardid-propagation.tcl new file mode 100644 index 0000000000..f34e25601f --- /dev/null +++ b/tests/unit/cluster/shardid-propagation.tcl @@ -0,0 +1,59 @@ +set old_singledb $::singledb +set ::singledb 1 + +tags {tls:skip external:skip cluster} { + set base_conf [list cluster-enabled yes save ""] + start_multiple_servers 2 [list overrides $base_conf] { + test "Cluster nodes are reachable" { + for {set id 0} {$id < [llength $::servers]} {incr id} { + # Every node should be reachable. + wait_for_condition 1000 50 { + ([catch {R $id ping} ping_reply] == 0) && + ($ping_reply eq {PONG}) + } else { + catch {R $id ping} err + fail "Node #$id keeps replying '$err' to PING." + } + } + } + + test "Before slots allocation, all nodes report cluster failure" { + wait_for_cluster_state fail + } + + test "Cluster nodes haven't met each other" { + assert {[llength [get_cluster_nodes 1]] == 1} + assert {[llength [get_cluster_nodes 0]] == 1} + } + + test "Allocate slots" { + cluster_allocate_slots 1 1 + } + + test "Restart of node in cluster mode doesn't cause nodes.conf corruption due to shard id mismatch" { + set primary_id [R 0 CLUSTER MYID] + R 1 CLUSTER MEET 127.0.0.1 [srv 0 port] + set pattern *$primary_id* + wait_for_condition 100 50 { + [string match $pattern [R 1 CLUSTER NODES]] + } else { + fail "Meet took longer" + } + R 1 CLUSTER REPLICATE $primary_id + wait_replica_online [srv 0 client] + set result [catch {R 0 DEBUG RESTART 0} err] + + if {$result != 0 && $err ne "I/O error reading reply"} { + fail "Unexpected error restarting server: $err" + } + + wait_for_condition 100 100 { + [check_server_response 0] eq 1 + } else { + fail "Server didn't come back online in time" + } + } + } +} + +set ::singledb $old_singledb diff --git a/tests/unit/cluster/slot-migration.tcl b/tests/unit/cluster/slot-migration.tcl index abc50ca035..2c84eae20f 100644 --- a/tests/unit/cluster/slot-migration.tcl +++ b/tests/unit/cluster/slot-migration.tcl @@ -36,12 +36,6 @@ proc wait_for_slot_state {srv_idx pattern} { } } -# Check if the server responds with "PONG" -proc check_server_response {server_id} { - # Send a PING command and check if the response is "PONG" - return [expr {[catch {R $server_id PING} result] == 0 && $result eq "PONG"}] -} - # restart a server and wait for it to come back online proc fail_server {server_id} { set node_timeout [lindex [R 0 CONFIG GET cluster-node-timeout] 1]