From fa0285cb4cfe987d19c3b11291fc62987e6f8e43 Mon Sep 17 00:00:00 2001
From: Ping Xie
Date: Mon, 6 May 2024 17:21:52 -0700
Subject: [PATCH] Add missing test file

Signed-off-by: Ping Xie
---
 tests/unit/cluster/slot-migration.tcl | 357 ++++++++++++++++++++++++++
 1 file changed, 357 insertions(+)
 create mode 100644 tests/unit/cluster/slot-migration.tcl

diff --git a/tests/unit/cluster/slot-migration.tcl b/tests/unit/cluster/slot-migration.tcl
new file mode 100644
index 0000000000..793b00312d
--- /dev/null
+++ b/tests/unit/cluster/slot-migration.tcl
@@ -0,0 +1,357 @@
+# Return the open-slot markers ("[slot->-id]" / "[slot-<-id]") of node srv_idx, or {} if none
+proc get_open_slots {srv_idx} {
+    set owned [dict get [cluster_get_myself $srv_idx] slots]
+    # Everything from the first "[" onwards is the open-slot part; no "[" means nothing is open
+    if {![regexp {\[.*} $owned open_part]} {
+        return {}
+    }
+    return [regsub -all {[{}]} $open_part ""]
+}
+
+# Return the second entry of the node's cluster flags, which callers compare against a role name
+proc get_cluster_role {srv_idx} {
+    set node_flags [dict get [cluster_get_myself $srv_idx] flags]
+    return [lindex $node_flags 1]
+}
+
+# Block until node srv_idx reports the given role through both ROLE and its cluster flags
+proc wait_for_role {srv_idx role} {
+    # Sleep for one node timeout first so that gossip can spread the new state cluster-wide
+    after [lindex [R 0 CONFIG GET cluster-node-timeout] 1]
+    wait_for_condition 100 100 {
+        $role eq [lindex [split [R $srv_idx ROLE] " "] 0]
+    } else {
+        fail "R $srv_idx didn't assume the replication $role in time"
+    }
+    wait_for_condition 100 100 {
+        $role eq [get_cluster_role $srv_idx]
+    } else {
+        fail "R $srv_idx didn't assume the cluster $role in time"
+    }
+    wait_for_cluster_propagation
+}
+
+# Block until the open-slot markers of node srv_idx are exactly equal to pattern
+proc wait_for_slot_state {srv_idx pattern} {
+    wait_for_condition 100 100 {
+        $pattern eq [get_open_slots $srv_idx]
+    } else {
+        fail "incorrect slot state on R $srv_idx: expected $pattern; got [get_open_slots $srv_idx]"
+    }
+}
+
+# Return 1 if a PING sent to server_id succeeds and the reply is "PONG", otherwise 0
+proc check_server_response {server_id} {
+    return [expr {![catch {R $server_id PING} reply] && $reply eq "PONG"}]
+}
+
+# Restart a server and wait for it to come back online
+proc restart_server_and_wait {server_id} {
+    set node_timeout [lindex [R 0 CONFIG GET cluster-node-timeout] 1]
+    # DEBUG RESTART is given 3x the node timeout (presumably a delay that outlasts failure detection)
+    set result [catch {R $server_id DEBUG RESTART [expr {3 * $node_timeout}]} err]
+    # "I/O error reading reply" is the expected result of the restart; any other error is fatal
+    if {$result != 0 && $err ne "I/O error reading reply"} {
+        fail "Unexpected error restarting server $server_id: $err"
+    }
+    # Poll with PING until the restarted server answers again
+    wait_for_condition 100 100 {
+        [check_server_response $server_id] eq 1
+    } else {
+        fail "Server $server_id didn't come back online in time"
+    }
+}
+
+start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no cluster-node-timeout 1000} } {
+
+    # Cache the node IDs; the expected open-slot markers below embed them
+    set R0_id [R 0 CLUSTER MYID]
+    set R1_id [R 1 CLUSTER MYID]
+    set R2_id [R 2 CLUSTER MYID]
+    set R3_id [R 3 CLUSTER MYID]
+    set R4_id [R 4 CLUSTER MYID]
+    set R5_id [R 5 CLUSTER MYID]
+
+    test "Slot migration states are replicated" {
+        # Validate initial states
+        assert_not_equal [get_open_slots 0] "\[609->-$R1_id\]"
+        assert_not_equal [get_open_slots 1] "\[609-<-$R0_id\]"
+        assert_not_equal [get_open_slots 3] "\[609->-$R1_id\]"
+        assert_not_equal [get_open_slots 4] "\[609-<-$R0_id\]"
+        # Kick off the migration of slot 609 from R0 to R1
+        assert_equal {OK} [R 0 CLUSTER SETSLOT 609 MIGRATING $R1_id]
+        assert_equal {OK} [R 1 CLUSTER SETSLOT 609 IMPORTING $R0_id]
+        # Validate that R0 is migrating slot 609 to R1
+        assert_equal [get_open_slots 0] "\[609->-$R1_id\]"
+        # Validate that R1 is importing slot 609 from R0
+        assert_equal [get_open_slots 1] "\[609-<-$R0_id\]"
+        # Validate final states, including the replicas R3 (of R0) and R4 (of R1)
+        wait_for_slot_state 0 "\[609->-$R1_id\]"
+        wait_for_slot_state 1 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R1_id\]"
+        wait_for_slot_state 4 "\[609-<-$R0_id\]"
+    }
+
+    test "Migration target is auto-updated after failover in target shard" {
+        # Restart R1 to trigger an auto-failover to R4
+        restart_server_and_wait 1
+        # Wait for R1 to become a replica
+        wait_for_role 1 slave
+        # Validate final states: the migration target is now R4
+        wait_for_slot_state 0 "\[609->-$R4_id\]"
+        wait_for_slot_state 1 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R4_id\]"
+        wait_for_slot_state 4 "\[609-<-$R0_id\]"
+        # Restore R1's primaryship
+        assert_equal {OK} [R 1 cluster failover]
+        wait_for_role 1 master
+        # Validate that the initial states are restored
+        wait_for_slot_state 0 "\[609->-$R1_id\]"
+        wait_for_slot_state 1 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R1_id\]"
+        wait_for_slot_state 4 "\[609-<-$R0_id\]"
+    }
+
+    test "Migration source is auto-updated after failover in source shard" {
+        # Restart R0 to trigger an auto-failover to R3
+        restart_server_and_wait 0
+        # Wait for R0 to become a replica
+        wait_for_role 0 slave
+        # Validate final states: the migration source is now R3
+        wait_for_slot_state 0 "\[609->-$R1_id\]"
+        wait_for_slot_state 1 "\[609-<-$R3_id\]"
+        wait_for_slot_state 3 "\[609->-$R1_id\]"
+        wait_for_slot_state 4 "\[609-<-$R3_id\]"
+        # Restore R0's primaryship
+        assert_equal {OK} [R 0 cluster failover]
+        wait_for_role 0 master
+        # Validate that the initial states are restored
+        wait_for_slot_state 0 "\[609->-$R1_id\]"
+        wait_for_slot_state 1 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R1_id\]"
+        wait_for_slot_state 4 "\[609-<-$R0_id\]"
+    }
+
+    test "Replica redirects key access in migrating slots" {
+        # Validate initial states, then expect a MOVED error for slot 609 from replica R3
+        assert_equal [get_open_slots 0] "\[609->-$R1_id\]"
+        assert_equal [get_open_slots 1] "\[609-<-$R0_id\]"
+        assert_equal [get_open_slots 3] "\[609->-$R1_id\]"
+        assert_equal [get_open_slots 4] "\[609-<-$R0_id\]"
+        catch {R 3 get aga} e
+        assert_equal {MOVED} [lindex [split $e] 0]
+        assert_equal {609} [lindex [split $e] 1]
+    }
+
+    test "New replica inherits migrating slot" {
+        # Reset R3 to turn it into an empty node
+        assert_equal [get_open_slots 3] "\[609->-$R1_id\]"
+        assert_equal {OK} [R 3 CLUSTER RESET]
+        assert_not_equal [get_open_slots 3] "\[609->-$R1_id\]"
+        # Add R3 back as a replica of R0
+        assert_equal {OK} [R 3 CLUSTER MEET [srv 0 "host"] [srv 0 "port"]]
+        wait_for_role 0 master
+        assert_equal {OK} [R 3 CLUSTER REPLICATE $R0_id]
+        wait_for_role 3 slave
+        # Validate that R3 now sees slot 609 open
+        assert_equal [get_open_slots 3] "\[609->-$R1_id\]"
+    }
+
+    test "New replica inherits importing slot" {
+        # Reset R4 to turn it into an empty node
+        assert_equal [get_open_slots 4] "\[609-<-$R0_id\]"
+        assert_equal {OK} [R 4 CLUSTER RESET]
+        assert_not_equal [get_open_slots 4] "\[609-<-$R0_id\]"
+        # Add R4 back as a replica of R1
+        assert_equal {OK} [R 4 CLUSTER MEET [srv -1 "host"] [srv -1 "port"]]
+        wait_for_role 1 master
+        assert_equal {OK} [R 4 CLUSTER REPLICATE $R1_id]
+        wait_for_role 4 slave
+        # Validate that R4 now sees slot 609 open
+        assert_equal [get_open_slots 4] "\[609-<-$R0_id\]"
+    }
+}
+
+# Build a new shard from nodes p and r: reset both, MEET node 0, then make r a replica of p
+proc create_empty_shard {p r} {
+    assert_equal {OK} [R $p CLUSTER RESET]
+    assert_equal {OK} [R $r CLUSTER RESET]
+    assert_equal {OK} [R $p CLUSTER MEET [srv 0 "host"] [srv 0 "port"]]
+    assert_equal {OK} [R $r CLUSTER MEET [srv 0 "host"] [srv 0 "port"]]
+    wait_for_role $p master
+    assert_equal {OK} [R $r CLUSTER REPLICATE [R $p CLUSTER MYID]]
+    wait_for_role $r slave
+    wait_for_role $p master
+}
+
+start_cluster 3 5 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no cluster-node-timeout 1000} } {
+
+    # Cache the node IDs; the expected open-slot markers below embed them
+    set R0_id [R 0 CLUSTER MYID]
+    set R1_id [R 1 CLUSTER MYID]
+    set R2_id [R 2 CLUSTER MYID]
+    set R3_id [R 3 CLUSTER MYID]
+    set R4_id [R 4 CLUSTER MYID]
+    set R5_id [R 5 CLUSTER MYID]
+
+    create_empty_shard 6 7
+    set R6_id [R 6 CLUSTER MYID]
+    set R7_id [R 7 CLUSTER MYID]
+
+    test "Empty-shard migration replicates slot importing states" {
+        # Validate initial states
+        assert_not_equal [get_open_slots 0] "\[609->-$R6_id\]"
+        assert_not_equal [get_open_slots 6] "\[609-<-$R0_id\]"
+        assert_not_equal [get_open_slots 3] "\[609->-$R6_id\]"
+        assert_not_equal [get_open_slots 7] "\[609-<-$R0_id\]"
+        # Kick off the migration of slot 609 from R0 to R6
+        assert_equal {OK} [R 0 CLUSTER SETSLOT 609 MIGRATING $R6_id]
+        assert_equal {OK} [R 6 CLUSTER SETSLOT 609 IMPORTING $R0_id]
+        # Validate that R0 is migrating slot 609 to R6
+        assert_equal [get_open_slots 0] "\[609->-$R6_id\]"
+        # Validate that R6 is importing slot 609 from R0
+        assert_equal [get_open_slots 6] "\[609-<-$R0_id\]"
+        # Validate final states, including the replicas R3 (of R0) and R7 (of R6)
+        wait_for_slot_state 0 "\[609->-$R6_id\]"
+        wait_for_slot_state 6 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R6_id\]"
+        wait_for_slot_state 7 "\[609-<-$R0_id\]"
+    }
+
+    test "Empty-shard migration target is auto-updated after faiover in target shard" {
+        wait_for_role 6 master
+        # Restart R6 to trigger an auto-failover to R7
+        restart_server_and_wait 6
+        # Wait for R6 to become a replica
+        wait_for_role 6 slave
+        # Validate final states: the migration target is now R7
+        wait_for_slot_state 0 "\[609->-$R7_id\]"
+        wait_for_slot_state 6 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R7_id\]"
+        wait_for_slot_state 7 "\[609-<-$R0_id\]"
+        # Restore R6's primaryship
+        assert_equal {OK} [R 6 cluster failover]
+        wait_for_role 6 master
+        # Validate that the initial states are restored
+        wait_for_slot_state 0 "\[609->-$R6_id\]"
+        wait_for_slot_state 6 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R6_id\]"
+        wait_for_slot_state 7 "\[609-<-$R0_id\]"
+    }
+
+    test "Empty-shard migration source is auto-updated after faiover in source shard" {
+        wait_for_role 0 master
+        # Restart R0 to trigger an auto-failover to R3
+        restart_server_and_wait 0
+        # Wait for R0 to become a replica
+        wait_for_role 0 slave
+        # Validate final states: the migration source is now R3
+        wait_for_slot_state 0 "\[609->-$R6_id\]"
+        wait_for_slot_state 6 "\[609-<-$R3_id\]"
+        wait_for_slot_state 3 "\[609->-$R6_id\]"
+        wait_for_slot_state 7 "\[609-<-$R3_id\]"
+        # Restore R0's primaryship
+        assert_equal {OK} [R 0 cluster failover]
+        wait_for_role 0 master
+        # Validate that the initial states are restored
+        wait_for_slot_state 0 "\[609->-$R6_id\]"
+        wait_for_slot_state 6 "\[609-<-$R0_id\]"
+        wait_for_slot_state 3 "\[609->-$R6_id\]"
+        wait_for_slot_state 7 "\[609-<-$R0_id\]"
+    }
+}
+
+# Open a migration of slot from node index from to node index to (MIGRATING first, then IMPORTING)
+proc migrate_slot {from to slot} {
+    set from_id [R $from CLUSTER MYID]
+    set to_id [R $to CLUSTER MYID]
+    assert_equal {OK} [R $from CLUSTER SETSLOT $slot MIGRATING $to_id]
+    assert_equal {OK} [R $to CLUSTER SETSLOT $slot IMPORTING $from_id]
+}
+
+start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no cluster-node-timeout 1000} } {
+
+    set R0_id [R 0 CLUSTER MYID]
+    set R1_id [R 1 CLUSTER MYID]
+    set R2_id [R 2 CLUSTER MYID]
+    set R3_id [R 3 CLUSTER MYID]
+    set R4_id [R 4 CLUSTER MYID]
+    set R5_id [R 5 CLUSTER MYID]
+
+    test "Multiple slot migration states are replicated" {
+        migrate_slot 0 1 13
+        migrate_slot 0 1 7
+        migrate_slot 0 1 17
+        # Validate final states; the expected markers are listed in ascending slot order
+        wait_for_slot_state 0 "\[7->-$R1_id\] \[13->-$R1_id\] \[17->-$R1_id\]"
+        wait_for_slot_state 1 "\[7-<-$R0_id\] \[13-<-$R0_id\] \[17-<-$R0_id\]"
+        wait_for_slot_state 3 "\[7->-$R1_id\] \[13->-$R1_id\] \[17->-$R1_id\]"
+        wait_for_slot_state 4 "\[7-<-$R0_id\] \[13-<-$R0_id\] \[17-<-$R0_id\]"
+    }
+
+    test "New replica inherits multiple migrating slots" {
+        # Reset R3 to turn it into an empty node
+        assert_equal {OK} [R 3 CLUSTER RESET]
+        # Add R3 back as a replica of R0
+        assert_equal {OK} [R 3 CLUSTER MEET [srv 0 "host"] [srv 0 "port"]]
+        wait_for_role 0 master
+        assert_equal {OK} [R 3 CLUSTER REPLICATE $R0_id]
+        wait_for_role 3 slave
+        # Validate final states
+        wait_for_slot_state 3 "\[7->-$R1_id\] \[13->-$R1_id\] \[17->-$R1_id\]"
+    }
+
+    test "Slot finalization succeeds on both primary and replicas" {
+        assert_equal {OK} [R 1 CLUSTER SETSLOT 7 NODE $R1_id]
+        wait_for_slot_state 1 "\[13-<-$R0_id\] \[17-<-$R0_id\]"
+        wait_for_slot_state 4 "\[13-<-$R0_id\] \[17-<-$R0_id\]"
+        assert_equal {OK} [R 1 CLUSTER SETSLOT 13 NODE $R1_id]
+        wait_for_slot_state 1 "\[17-<-$R0_id\]"
+        wait_for_slot_state 4 "\[17-<-$R0_id\]"
+        assert_equal {OK} [R 1 CLUSTER SETSLOT 17 NODE $R1_id]
+        wait_for_slot_state 1 ""
+        wait_for_slot_state 4 ""
+    }
+
+}
+
+start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no cluster-node-timeout 1000} } {
+
+    set node_timeout [lindex [R 0 CONFIG GET cluster-node-timeout] 1]
+    set R0_id [R 0 CLUSTER MYID]
+    set R1_id [R 1 CLUSTER MYID]
+
+    test "Slot is auto-claimed by target after source relinquishes ownership" {
+        migrate_slot 0 1 609
+        # Validate that R1 doesn't own slot 609
+        catch {R 1 get aga} e
+        assert_equal {MOVED} [lindex [split $e] 0]
+        # Finalize the slot on the source first
+        assert_equal {OK} [R 0 CLUSTER SETSLOT 609 NODE $R1_id]
+        after $node_timeout
+        # R1 should claim slot 609 since it is still importing slot 609
+        # from R0 but R0 no longer owns this slot
+        assert_equal {OK} [R 1 set aga foo]
+    }
+}
+
+start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no cluster-node-timeout 1000} } {
+    set R1_id [R 1 CLUSTER MYID]
+
+    test "CLUSTER SETSLOT with an explicit timeout" {
+        # Simulate a replica crash
+        catch {R 3 DEBUG SEGFAULT} e
+
+        # Setslot with an explicit 3000ms timeout
+        set start_time [clock milliseconds]
+        catch {R 0 CLUSTER SETSLOT 609 MIGRATING $R1_id TIMEOUT 3000} e
+        set end_time [clock milliseconds]
+        set duration [expr {$end_time - $start_time}]
+
+        # Assert that the call blocked for longer than the default 2s timeout
+        assert {$duration > 2000}
+
+        # Setslot should fail with not enough good replicas to write after the timeout
+        assert_equal {NOREPLICAS Not enough good replicas to write.} $e
+    }
+}