Skip to content

Commit

Permalink
Fix primary crash when processing dirty slots during shutdown wait / …
Browse files Browse the repository at this point in the history
…failover wait / client pause (#1131)

We have an assert in propagateNow. If the primary node receives a
CLUSTER UPDATE such as dirty slots during SIGTERM waitting or during
a manual failover pausing or during a client pause, the delKeysInSlot
call will trigger this assert and cause primary crash.

In this case, we added a new server_del_keys_in_slot state just like
client_pause_in_transaction to track the state to avoid the assert
in propagateNow, the dirty slots will be deleted in the end without
affecting the data consistency.

Signed-off-by: Binbin <[email protected]>
Co-authored-by: Viktor Söderqvist <[email protected]>
  • Loading branch information
enjoy-binbin and zuiderkwast authored Nov 15, 2024
1 parent 4e2493e commit 92181b6
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 4 deletions.
5 changes: 5 additions & 0 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6084,6 +6084,9 @@ void removeChannelsInSlot(unsigned int slot) {
unsigned int delKeysInSlot(unsigned int hashslot) {
if (!countKeysInSlot(hashslot)) return 0;

/* We may lose a slot during the pause. We need to track this
* state so that we don't assert in propagateNow(). */
server.server_del_keys_in_slot = 1;
unsigned int j = 0;

kvstoreDictIterator *kvs_di = NULL;
Expand All @@ -6108,6 +6111,8 @@ unsigned int delKeysInSlot(unsigned int hashslot) {
}
kvstoreReleaseDictIterator(kvs_di);

server.server_del_keys_in_slot = 0;
serverAssert(server.execution_nesting == 0);
return j;
}

Expand Down
2 changes: 1 addition & 1 deletion src/networking.c
Original file line number Diff line number Diff line change
Expand Up @@ -4571,7 +4571,7 @@ static void pauseClientsByClient(mstime_t endTime, int isPauseClientAll) {
}

/* Pause actions up to the specified unixtime (in ms) for a given type of
* commands.
* purpose.
*
* A main use case of this function is to allow pausing replication traffic
* so that a failover without data loss to occur. Replicas will continue to receive
Expand Down
24 changes: 22 additions & 2 deletions src/server.c
Original file line number Diff line number Diff line change
Expand Up @@ -3315,8 +3315,28 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) {
if (!shouldPropagate(target)) return;

/* This needs to be unreachable since the dataset should be fixed during
* replica pause (otherwise data may be lost during a failover) */
serverAssert(!(isPausedActions(PAUSE_ACTION_REPLICA) && (!server.client_pause_in_transaction)));
* replica pause (otherwise data may be lost during a failover).
*
* Though, there are exceptions:
*
* 1. We allow write commands that were queued up before and after to
* execute, if a CLIENT PAUSE executed during a transaction, we will
* track the state, the CLIENT PAUSE takes effect only after a transaction
* has finished.
* 2. Primary loses a slot during the pause, deletes all keys and replicates
* DEL to its replicas. In this case, we will track the state, the dirty
* slots will be deleted in the end without affecting the data consistency.
*
* Note that case 2 can happen in one of the following scenarios:
* 1) The primary waits for the replica to replicate before exiting, see
* shutdown-timeout in conf for more details. In this case, primary lost
* a slot during the SIGTERM waiting.
* 2) The primary waits for the replica to replicate during a manual failover.
* In this case, primary lost a slot during the pausing.
* 3) The primary was paused by CLIENT PAUSE, and lost a slot during the
* pausing. */
serverAssert(!isPausedActions(PAUSE_ACTION_REPLICA) || server.client_pause_in_transaction ||
server.server_del_keys_in_slot);

if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF) feedAppendOnlyFile(dbid, argv, argc);
if (target & PROPAGATE_REPL) replicationFeedReplicas(dbid, argv, argc);
Expand Down
3 changes: 2 additions & 1 deletion src/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -1701,6 +1701,7 @@ struct valkeyServer {
const char *busy_module_yield_reply; /* When non-null, we are inside RM_Yield. */
char *ignore_warnings; /* Config: warnings that should be ignored. */
int client_pause_in_transaction; /* Was a client pause executed during this Exec? */
int server_del_keys_in_slot; /* The server is deleting the keys in the dirty slot. */
int thp_enabled; /* If true, THP is enabled. */
size_t page_size; /* The page size of OS. */
/* Modules */
Expand Down Expand Up @@ -2863,7 +2864,7 @@ void flushReplicasOutputBuffers(void);
void disconnectReplicas(void);
void evictClients(void);
int listenToPort(connListener *fds);
void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions_bitmask);
void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions);
void unpauseActions(pause_purpose purpose);
uint32_t isPausedActions(uint32_t action_bitmask);
uint32_t isPausedActionsWithUpdate(uint32_t action_bitmask);
Expand Down
85 changes: 85 additions & 0 deletions tests/unit/cluster/slot-ownership.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,88 @@ start_cluster 2 2 {tags {external:skip cluster}} {
}
}
}

start_cluster 3 1 {tags {external:skip cluster} overrides {shutdown-timeout 100}} {
test "Primary lost a slot during the shutdown waiting" {
R 0 set FOO 0

# Pause the replica.
pause_process [srv -3 pid]

# Incr the key and immediately shutdown the primary.
# The primary waits for the replica to replicate before exiting.
R 0 incr FOO
exec kill -SIGTERM [srv 0 pid]
wait_for_condition 50 100 {
[s 0 shutdown_in_milliseconds] > 0
} else {
fail "Primary not indicating ongoing shutdown."
}

# Move the slot to other primary
R 1 cluster bumpepoch
R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid]

# Waiting for dirty slot update.
wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10

# Resume the replica and make sure primary exits normally instead of crashing.
resume_process [srv -3 pid]
wait_for_log_messages 0 {"*Valkey is now ready to exit, bye bye*"} 0 1000 10

# Make sure that the replica will become the new primary and does not own the key.
wait_for_condition 1000 50 {
[s -3 role] eq {master}
} else {
fail "The replica was not converted into primary"
}
assert_error {ERR no such key} {R 3 debug object foo}
}
}

start_cluster 3 1 {tags {external:skip cluster}} {
test "Primary lost a slot during the manual failover pausing" {
R 0 set FOO 0

# Set primaries to drop the FAILOVER_AUTH_REQUEST packets, so that
# primary 0 will pause until the failover times out.
R 1 debug drop-cluster-packet-filter 5
R 2 debug drop-cluster-packet-filter 5

# Replica doing the manual failover.
R 3 cluster failover

# Move the slot to other primary
R 1 cluster bumpepoch
R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid]

# Waiting for dirty slot update.
wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10

# Make sure primary doesn't crash when deleting the keys.
R 0 ping

R 1 debug drop-cluster-packet-filter -1
R 2 debug drop-cluster-packet-filter -1
}
}

start_cluster 3 1 {tags {external:skip cluster}} {
test "Primary lost a slot during the client pause command" {
R 0 set FOO 0

R 0 client pause 1000000000 write

# Move the slot to other primary
R 1 cluster bumpepoch
R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid]

# Waiting for dirty slot update.
wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10

# Make sure primary doesn't crash when deleting the keys.
R 0 ping

R 0 client unpause
}
}
27 changes: 27 additions & 0 deletions tests/unit/pause.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,33 @@ start_server {tags {"pause network"}} {
r client unpause
}

test "Test eviction is skipped during client pause" {
r flushall
set evicted_keys [s 0 evicted_keys]

r multi
r set foo{t} bar
r config set maxmemory-policy allkeys-random
r config set maxmemory 1
r client PAUSE 50000 WRITE
r exec

# No keys should actually have been evicted.
assert_match $evicted_keys [s 0 evicted_keys]

# The previous config set triggers a time event, but due to the pause,
# no eviction has been made. After the unpause, a eviction will happen.
r client unpause
wait_for_condition 1000 10 {
[expr $evicted_keys + 1] eq [s 0 evicted_keys]
} else {
fail "Key is not evicted"
}

r config set maxmemory 0
r config set maxmemory-policy noeviction
}

test "Test both active and passive expires are skipped during client pause" {
set expired_keys [s 0 expired_keys]
r multi
Expand Down

0 comments on commit 92181b6

Please sign in to comment.