From c1bf0e6cc9239e8bdf55dbc9bdf82b518333175c Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 18 Oct 2024 13:00:29 +0800 Subject: [PATCH 1/5] Mark the node as FAIL when the node is marked as NOADDR Imagine we have a cluster, for example a three-shard cluster, if shard 1 doing a CLUSTER RESET HARD, it will change the node name, and then other nodes will mark it as NOADR since the node name received by PONG has changed. In the eyes of other nodes, there is one working primary node left but with no address, and in this case, the address report in MOVED will be invalid and will confuse the clients. And in the same time, the replica will not failover since its primary is not in the FAIL state. And the cluster looks OK to everyone. This leaves a cluster that appears OK, but with no coverage for shard 1, obviously we should do something like CLUSTER FORGET to remove the node and fix the cluster before using it. But the point in here, we can mark the NOADDR node as FAIL to advance the cluster state. If a node is NOADDR means it does not have a valid address, so we won't reconnect it, we won't send PING, we won't gossip it, it seems reasonable to mark it as FAIL. Signed-off-by: Binbin --- src/cluster_legacy.c | 13 +++++++++--- tests/unit/cluster/noaddr.tcl | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 tests/unit/cluster/noaddr.tcl diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 14f8a6bd1e..17e8fdde4c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -654,7 +654,7 @@ int clusterLoadConfig(char *filename) { } else if (!strcasecmp(s, "handshake")) { n->flags |= CLUSTER_NODE_HANDSHAKE; } else if (!strcasecmp(s, "noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; + n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); } else if (!strcasecmp(s, "nofailover")) { n->flags |= CLUSTER_NODE_NOFAILOVER; } else if (!strcasecmp(s, "noflags")) { @@ -3250,13 +3250,20 @@ int clusterProcessPacket(clusterLink *link) { "having flags %d", link->node->name, link->node->human_nodename, link->node->shard_id, (int)(now - (link->node->ctime)), link->node->flags); - link->node->flags |= CLUSTER_NODE_NOADDR; + /* We also mark the node as fail because we have disconnected from it, + * and will not reconnect, and obviously we will not gossip NOADDR nodes. + * Marking it as FAIL can help us advance the state, such as the cluster + * state becomes FAIL or the replica can do the failover. Otherwise, the + * NOADDR node will provide an invalid address in redirection and confuse + * the client, and the replica will not initiate a failover since the node + * is not actually in FAIL state. */ + link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); link->node->ip[0] = '\0'; link->node->tcp_port = 0; link->node->tls_port = 0; link->node->cport = 0; freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE); return 0; } } diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl new file mode 100644 index 0000000000..5516fc8303 --- /dev/null +++ b/tests/unit/cluster/noaddr.tcl @@ -0,0 +1,40 @@ +start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} { + test "NOADDR nodes will be marked as FAIL" { + set primary0_id [R 0 CLUSTER MYID] + + # R 0 is a primary, doing a CLUSTER RESET and the node name will be modified, + # and other nodes will set it to NOADDR. + R 0 cluster reset hard + wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10 + assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] 1 + + # Also we will set the node to FAIL, so the cluster will eventually be down. + assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] 1 + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} && + [CI 3 cluster_state] eq {fail} && + [CI 4 cluster_state] eq {fail} && + [CI 5 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + + # key_977613 belong to slot 0 and belong to R 0. + # Make sure we get a CLUSTER DOWN instead of an invalid MOVED. + assert_error {CLUSTERDOWN*} {R 1 set key_977613 bar} + + # Let the replica 3 do the failover. + R 3 config set cluster-replica-validity-factor 0 + R 3 config set cluster-replica-no-failover no + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} && + [CI 4 cluster_state] eq {ok} && + [CI 5 cluster_state] eq {ok} + } else { + fail "Cluster doesn't ok" + } + } +} ;# start_cluster From bc863655180f2e6b3b82ff137eb4b0798c42c526 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 19 Oct 2024 20:27:06 +0800 Subject: [PATCH 2/5] update comment Signed-off-by: Binbin --- src/cluster_legacy.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 17e8fdde4c..ebbf88f19a 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3244,19 +3244,21 @@ int clusterProcessPacket(clusterLink *link) { } else if (memcmp(link->node->name, hdr->sender, CLUSTER_NAMELEN) != 0) { /* If the reply has a non matching node ID we * disconnect this node and set it as not having an associated - * address. */ + * address. This can happen if the node did CLUSTER RESET and changed + * its node ID. In this case, the old node ID will not come back. + * + * We will also mark the node as fail because we have disconnected from it, + * and will not reconnect, and obviously we will not gossip NOADDR nodes. + * Marking it as FAIL can help us advance the state, such as the cluster + * state becomes FAIL or the replica can do the failover. Otherwise, the + * NOADDR node will provide an invalid address in redirection and confuse + * the clients, and the replica will never initiate a failover since the + * node is not actually in FAIL state. */ serverLog(LL_NOTICE, "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, " "having flags %d", link->node->name, link->node->human_nodename, link->node->shard_id, (int)(now - (link->node->ctime)), link->node->flags); - /* We also mark the node as fail because we have disconnected from it, - * and will not reconnect, and obviously we will not gossip NOADDR nodes. - * Marking it as FAIL can help us advance the state, such as the cluster - * state becomes FAIL or the replica can do the failover. Otherwise, the - * NOADDR node will provide an invalid address in redirection and confuse - * the client, and the replica will not initiate a failover since the node - * is not actually in FAIL state. */ link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); link->node->ip[0] = '\0'; link->node->tcp_port = 0; From 4d2780abc11c3a6bd691973daf7d6aed7b876591 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 20 Oct 2024 00:22:33 +0800 Subject: [PATCH 3/5] fix timing issue Signed-off-by: Binbin --- tests/unit/cluster/noaddr.tcl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl index 5516fc8303..8ad2effbfa 100644 --- a/tests/unit/cluster/noaddr.tcl +++ b/tests/unit/cluster/noaddr.tcl @@ -6,10 +6,14 @@ start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-fa # and other nodes will set it to NOADDR. R 0 cluster reset hard wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10 - assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] 1 + wait_for_condition 1000 10 { + [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 && + [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1 + } else { + fail "The node is not marked with the correct flag" + } # Also we will set the node to FAIL, so the cluster will eventually be down. - assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] 1 wait_for_condition 1000 50 { [CI 1 cluster_state] eq {fail} && [CI 2 cluster_state] eq {fail} && @@ -34,7 +38,7 @@ start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-fa [CI 4 cluster_state] eq {ok} && [CI 5 cluster_state] eq {ok} } else { - fail "Cluster doesn't ok" + fail "Cluster doesn't stabilize" } } } ;# start_cluster From a594b86a006824b5349296e7c16be4326cb34758 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 22 Oct 2024 15:08:10 +0800 Subject: [PATCH 4/5] Broadcast the node when marked with NOADDR Signed-off-by: Binbin --- src/cluster_legacy.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ebbf88f19a..863fb9dc5c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3245,26 +3245,31 @@ int clusterProcessPacket(clusterLink *link) { /* If the reply has a non matching node ID we * disconnect this node and set it as not having an associated * address. This can happen if the node did CLUSTER RESET and changed - * its node ID. In this case, the old node ID will not come back. - * - * We will also mark the node as fail because we have disconnected from it, - * and will not reconnect, and obviously we will not gossip NOADDR nodes. - * Marking it as FAIL can help us advance the state, such as the cluster - * state becomes FAIL or the replica can do the failover. Otherwise, the - * NOADDR node will provide an invalid address in redirection and confuse - * the clients, and the replica will never initiate a failover since the - * node is not actually in FAIL state. */ + * its node ID. In this case, the old node ID will not come back. */ + clusterNode *noaddr_node = link->node; serverLog(LL_NOTICE, "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, " "having flags %d", link->node->name, link->node->human_nodename, link->node->shard_id, (int)(now - (link->node->ctime)), link->node->flags); - link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); + link->node->flags |= CLUSTER_NODE_NOADDR; link->node->ip[0] = '\0'; link->node->tcp_port = 0; link->node->tls_port = 0; link->node->cport = 0; freeClusterLink(link); + /* We will also mark the node as fail because we have disconnected from it, + * and will not reconnect, and obviously we will not gossip NOADDR nodes. + * Marking it as FAIL can help us advance the state, such as the cluster + * state becomes FAIL or the replica can do the failover. Otherwise, the + * NOADDR node will provide an invalid address in redirection and confuse + * the clients, and the replica will never initiate a failover since the + * node is not actually in FAIL state. */ + if (!nodeFailed(noaddr_node)) { + noaddr_node->flags |= CLUSTER_NODE_FAIL; + noaddr_node->fail_time = now; + clusterSendFail(noaddr_node->name); + } clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE); return 0; } From 80651b3481b2bc2f5c604b099aedc3122be68a52 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 17 Nov 2024 17:34:36 +0800 Subject: [PATCH 5/5] set fail_time when maring FAIL and cleanup the test code Signed-off-by: Binbin --- src/cluster_legacy.c | 1 + tests/unit/cluster/noaddr.tcl | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 700ebfa6c3..6ed8cacb0a 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -655,6 +655,7 @@ int clusterLoadConfig(char *filename) { n->flags |= CLUSTER_NODE_HANDSHAKE; } else if (!strcasecmp(s, "noaddr")) { n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); + n->fail_time = mstime(); } else if (!strcasecmp(s, "nofailover")) { n->flags |= CLUSTER_NODE_NOFAILOVER; } else if (!strcasecmp(s, "noflags")) { diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl index 8ad2effbfa..fb4e501809 100644 --- a/tests/unit/cluster/noaddr.tcl +++ b/tests/unit/cluster/noaddr.tcl @@ -2,13 +2,16 @@ start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-fa test "NOADDR nodes will be marked as FAIL" { set primary0_id [R 0 CLUSTER MYID] - # R 0 is a primary, doing a CLUSTER RESET and the node name will be modified, - # and other nodes will set it to NOADDR. + # R 0 is a primary, after doing a CLUSTER RESET, the node name will be modified, + # and other nodes will set it to NOADDR and FAIL. R 0 cluster reset hard wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10 + wait_for_log_messages -2 {"*PONG contains mismatching sender ID*"} 0 1000 10 wait_for_condition 1000 10 { [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 && - [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1 + [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1 && + [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] noaddr] eq 1 && + [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] fail] eq 1 } else { fail "The node is not marked with the correct flag" }