From 6f9d73744c46f74b15574d70d086dbf98c56c9e1 Mon Sep 17 00:00:00 2001
From: naglera
Date: Mon, 21 Oct 2024 16:45:32 +0000
Subject: [PATCH] Consistently reproduce crash and improve test reliability

- Add test to consistently reproduce rdb load callback crash
- Avoid checking close_asap when no data was processed

Signed-off-by: naglera
---
 src/rdb.c                                     |  8 +--
 .../integration/dual-channel-replication.tcl  | 66 ++++++++++++++++++-
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/src/rdb.c b/src/rdb.c
index 2c24d9b20b..13bd01aa36 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -2929,14 +2929,14 @@ int rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
         loadingAbsProgress(r->processed_bytes);
         processEventsWhileBlocked();
         processModuleLoadingProgressEvent(0);
+        if (server.repl_provisional_primary.close_asap == 1) {
+            serverLog(LL_WARNING, "Primary main connection dropped during RDB load callback");
+            return -1;
+        }
     }
     if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) {
         server.stat_net_repl_input_bytes += len;
     }
-    if (server.repl_provisional_primary.close_asap == 1) {
-        serverLog(LL_WARNING, "Primary main connection dropped during RDB load callback");
-        return -1;
-    }
     return 0;
 }
 
diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index 5302030db9..93dcbda9a7 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -1174,7 +1174,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
             }
 
             $primary debug log "killing replica main connection"
-            set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"]
+            set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
             assert {$replica_main_conn_id != ""}
             set loglines [count_log_lines -1]
             $primary client kill id $replica_main_conn_id
@@ -1197,3 +1197,67 @@ start_server {tags {"dual-channel-replication external:skip"}} {
         stop_write_load $load_handle
     }
 }
+
+
+start_server {tags {"dual-channel-replication external:skip"}} {
+    set primary [srv 0 client]
+    set primary_host [srv 0 host]
+    set primary_port [srv 0 port]
+    set loglines [count_log_lines 0]
+
+    $primary config set repl-diskless-sync yes
+    $primary config set dual-channel-replication-enabled yes
+    $primary config set loglevel debug
+    $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
+
+    # Generating RDB will cost 100 sec to generate
+    $primary debug populate 1000000 primary 1
+    $primary config set rdb-key-save-delay -1000
+
+    start_server {} {
+        set replica [srv 0 client]
+        set replica_host [srv 0 host]
+        set replica_port [srv 0 port]
+        set replica_log [srv 0 stdout]
+
+        $replica config set dual-channel-replication-enabled yes
+        $replica config set loglevel debug
+        $replica config set repl-timeout 10
+        $replica config set repl-diskless-load flush-before-load
+
+        test "Replica notice main-connection killed during rdb load callback" {; # https://github.com/valkey-io/valkey/issues/1152
+            set loglines [count_log_lines 0]
+            $replica replicaof $primary_host $primary_port
+            # Wait for sync session to start
+            wait_for_condition 500 1000 {
+                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
+                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
+                [s -1 rdb_bgsave_in_progress] eq 1
+            } else {
+                fail "replica didn't start sync session in time"
+            }
+            wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10
+            $primary set key val
+            set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
+            $primary debug log "killing replica main connection $replica_main_conn_id"
+            assert {$replica_main_conn_id != ""}
+            set loglines [count_log_lines 0]
+            $primary client kill id $replica_main_conn_id
+            # Wait for primary to abort the sync
+            wait_for_condition 50 1000 {
+                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
+            } else {
+                fail "Primary did not free repl buf block after sync failure"
+            }
+            wait_for_log_messages 0 {"*Primary main connection dropped during RDB load callback*"} $loglines 1000 10
+            # Replica should retry
+            wait_for_condition 500 1000 {
+                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
+                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
+                [s -1 rdb_bgsave_in_progress] eq 1
+            } else {
+                fail "replica didn't retry after connection close"
+            }
+        }
+    }
+}