From eff45f546762577bd1877dfeece41c53b39926ad Mon Sep 17 00:00:00 2001 From: Sankar <1890648+srgsanky@users.noreply.github.com> Date: Mon, 1 Jul 2024 22:27:38 -0700 Subject: [PATCH] Fix flakiness of cluster-multiple-meets and cluster-reliable-meet (#728) Tests in cluster-multiple-meets were flaky as reported by @madolson * https://github.com/valkey-io/valkey/actions/runs/9688455588/job/26776953320 * https://github.com/valkey-io/valkey/actions/runs/9688455588/job/26776953585 I wasn't able to reproduce this locally, but I suspect that the flakiness is coming from the fact that nodes are reported as "connected" as long as there is an outgoing link. An outgoing link is created before MEET is sent out. Signed-off-by: Sankar <1890648+srgsanky@users.noreply.github.com> --- tests/unit/cluster/cluster-multiple-meets.tcl | 8 +++++++- tests/unit/cluster/cluster-reliable-meet.tcl | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/unit/cluster/cluster-multiple-meets.tcl b/tests/unit/cluster/cluster-multiple-meets.tcl index 07a2582133..059f03fbe4 100644 --- a/tests/unit/cluster/cluster-multiple-meets.tcl +++ b/tests/unit/cluster/cluster-multiple-meets.tcl @@ -51,7 +51,13 @@ tags {tls:skip external:skip cluster} { } # 0 will be connected to 1, but 1 won't see that 0 is connected - assert {[llength [get_cluster_nodes 1 connected]] == 1} + # Use a wait condition here because a plain assert can be flaky - especially + # when CLUSTER NODES is processed while the outgoing link created to send MEET exists.
+ wait_for_condition 1000 50 { + [llength [get_cluster_nodes 1 connected]] == 1 + } else { + fail "Node 1 recognizes node 0 even though it drops PONGs from node 0" + } assert {[llength [get_cluster_nodes 0 connected]] == 2} # Drop incoming and outgoing links from/to 1 diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index 41da97ab9b..45f5a6dc89 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -50,7 +50,13 @@ tags {tls:skip external:skip cluster} { } # Make sure the nodes still don't know about each other - assert {[llength [get_cluster_nodes 1 connected]] == 1} + # Use a wait condition here because a plain assert can be flaky - especially + # when CLUSTER NODES is processed while the outgoing link created to send MEET exists. + wait_for_condition 1000 50 { + [llength [get_cluster_nodes 1 connected]] == 1 + } else { + fail "Node 1 recognizes node 0 even though node 0 drops MEETs from node 1" + } assert {[llength [get_cluster_nodes 0 connected]] == 1} R 0 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE