From c1bf0e6cc9239e8bdf55dbc9bdf82b518333175c Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Fri, 18 Oct 2024 13:00:29 +0800
Subject: [PATCH 1/5] Mark the node as FAIL when the node is marked as NOADDR

Imagine we have a cluster, for example a three-shard cluster,
if shard 1 doing a CLUSTER RESET HARD, it will change the node
name, and then other nodes will mark it as NOADR since the node
name received by PONG has changed.

In the eyes of other nodes, there is one working primary node
left but with no address, and in this case, the address report
in MOVED will be invalid and will confuse the clients. And in
the same time, the replica will not failover since its primary
is not in the FAIL state. And the cluster looks OK to everyone.

This leaves a cluster that appears OK, but with no coverage for
shard 1, obviously we should do something like CLUSTER FORGET
to remove the node and fix the cluster before using it.

But the point in here, we can mark the NOADDR node as FAIL to
advance the cluster state. If a node is NOADDR means it does
not have a valid address, so we won't reconnect it, we won't
send PING, we won't gossip it, it seems reasonable to mark it
as FAIL.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c          | 13 +++++++++---
 tests/unit/cluster/noaddr.tcl | 40 +++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit/cluster/noaddr.tcl

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 14f8a6bd1e..17e8fdde4c 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -654,7 +654,7 @@ int clusterLoadConfig(char *filename) {
             } else if (!strcasecmp(s, "handshake")) {
                 n->flags |= CLUSTER_NODE_HANDSHAKE;
             } else if (!strcasecmp(s, "noaddr")) {
-                n->flags |= CLUSTER_NODE_NOADDR;
+                n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
             } else if (!strcasecmp(s, "nofailover")) {
                 n->flags |= CLUSTER_NODE_NOFAILOVER;
             } else if (!strcasecmp(s, "noflags")) {
@@ -3250,13 +3250,20 @@ int clusterProcessPacket(clusterLink *link) {
                           "having flags %d",
                           link->node->name, link->node->human_nodename, link->node->shard_id,
                           (int)(now - (link->node->ctime)), link->node->flags);
-                link->node->flags |= CLUSTER_NODE_NOADDR;
+                /* We also mark the node as fail because we have disconnected from it,
+                 * and will not reconnect, and obviously we will not gossip NOADDR nodes.
+                 * Marking it as FAIL can help us advance the state, such as the cluster
+                 * state becomes FAIL or the replica can do the failover. Otherwise, the
+                 * NOADDR node will provide an invalid address in redirection and confuse
+                 * the client, and the replica will not initiate a failover since the node
+                 * is not actually in FAIL state. */
+                link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
                 link->node->ip[0] = '\0';
                 link->node->tcp_port = 0;
                 link->node->tls_port = 0;
                 link->node->cport = 0;
                 freeClusterLink(link);
-                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
+                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
                 return 0;
             }
         }
diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl
new file mode 100644
index 0000000000..5516fc8303
--- /dev/null
+++ b/tests/unit/cluster/noaddr.tcl
@@ -0,0 +1,40 @@
+start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} {
+    test "NOADDR nodes will be marked as FAIL" {
+        set primary0_id [R 0 CLUSTER MYID]
+
+        # R 0 is a primary, doing a CLUSTER RESET and the node name will be modified,
+        # and other nodes will set it to NOADDR.
+        R 0 cluster reset hard
+        wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10
+        assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] 1
+
+        # Also we will set the node to FAIL, so the cluster will eventually be down.
+        assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] 1
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] eq {fail} &&
+            [CI 2 cluster_state] eq {fail} &&
+            [CI 3 cluster_state] eq {fail} &&
+            [CI 4 cluster_state] eq {fail} &&
+            [CI 5 cluster_state] eq {fail}
+        } else {
+            fail "Cluster doesn't fail"
+        }
+
+        # key_977613 belong to slot 0 and belong to R 0.
+        # Make sure we get a CLUSTER DOWN instead of an invalid MOVED.
+        assert_error {CLUSTERDOWN*} {R 1 set key_977613 bar}
+
+        # Let the replica 3 do the failover.
+        R 3 config set cluster-replica-validity-factor 0
+        R 3 config set cluster-replica-no-failover no
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] eq {ok} &&
+            [CI 2 cluster_state] eq {ok} &&
+            [CI 3 cluster_state] eq {ok} &&
+            [CI 4 cluster_state] eq {ok} &&
+            [CI 5 cluster_state] eq {ok}
+        } else {
+            fail "Cluster doesn't ok"
+        }
+    }
+} ;# start_cluster

From bc863655180f2e6b3b82ff137eb4b0798c42c526 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sat, 19 Oct 2024 20:27:06 +0800
Subject: [PATCH 2/5] update comment

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 17e8fdde4c..ebbf88f19a 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3244,19 +3244,21 @@ int clusterProcessPacket(clusterLink *link) {
             } else if (memcmp(link->node->name, hdr->sender, CLUSTER_NAMELEN) != 0) {
                 /* If the reply has a non matching node ID we
                  * disconnect this node and set it as not having an associated
-                 * address. */
+                 * address. This can happen if the node did CLUSTER RESET and changed
+                 * its node ID. In this case, the old node ID will not come back.
+                 *
+                 * We will also mark the node as fail because we have disconnected from it,
+                 * and will not reconnect, and obviously we will not gossip NOADDR nodes.
+                 * Marking it as FAIL can help us advance the state, such as the cluster
+                 * state becomes FAIL or the replica can do the failover. Otherwise, the
+                 * NOADDR node will provide an invalid address in redirection and confuse
+                 * the clients, and the replica will never initiate a failover since the
+                 * node is not actually in FAIL state. */
                 serverLog(LL_NOTICE,
                           "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, "
                           "having flags %d",
                           link->node->name, link->node->human_nodename, link->node->shard_id,
                           (int)(now - (link->node->ctime)), link->node->flags);
-                /* We also mark the node as fail because we have disconnected from it,
-                 * and will not reconnect, and obviously we will not gossip NOADDR nodes.
-                 * Marking it as FAIL can help us advance the state, such as the cluster
-                 * state becomes FAIL or the replica can do the failover. Otherwise, the
-                 * NOADDR node will provide an invalid address in redirection and confuse
-                 * the client, and the replica will not initiate a failover since the node
-                 * is not actually in FAIL state. */
                 link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
                 link->node->ip[0] = '\0';
                 link->node->tcp_port = 0;

From 4d2780abc11c3a6bd691973daf7d6aed7b876591 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sun, 20 Oct 2024 00:22:33 +0800
Subject: [PATCH 3/5] fix timing issue

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 tests/unit/cluster/noaddr.tcl | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl
index 5516fc8303..8ad2effbfa 100644
--- a/tests/unit/cluster/noaddr.tcl
+++ b/tests/unit/cluster/noaddr.tcl
@@ -6,10 +6,14 @@ start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-fa
         # and other nodes will set it to NOADDR.
         R 0 cluster reset hard
         wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10
-        assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] 1
+        wait_for_condition 1000 10 {
+            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 &&
+            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1
+        } else {
+            fail "The node is not marked with the correct flag"
+        }
 
         # Also we will set the node to FAIL, so the cluster will eventually be down.
-        assert_equal [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] 1
         wait_for_condition 1000 50 {
             [CI 1 cluster_state] eq {fail} &&
             [CI 2 cluster_state] eq {fail} &&
@@ -34,7 +38,7 @@ start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-fa
             [CI 4 cluster_state] eq {ok} &&
             [CI 5 cluster_state] eq {ok}
         } else {
-            fail "Cluster doesn't ok"
+            fail "Cluster doesn't stabilize"
         }
     }
 } ;# start_cluster

From a594b86a006824b5349296e7c16be4326cb34758 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 22 Oct 2024 15:08:10 +0800
Subject: [PATCH 4/5] Broadcast the node when marked with NOADDR

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index ebbf88f19a..863fb9dc5c 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3245,26 +3245,31 @@ int clusterProcessPacket(clusterLink *link) {
                 /* If the reply has a non matching node ID we
                  * disconnect this node and set it as not having an associated
                  * address. This can happen if the node did CLUSTER RESET and changed
-                 * its node ID. In this case, the old node ID will not come back.
-                 *
-                 * We will also mark the node as fail because we have disconnected from it,
-                 * and will not reconnect, and obviously we will not gossip NOADDR nodes.
-                 * Marking it as FAIL can help us advance the state, such as the cluster
-                 * state becomes FAIL or the replica can do the failover. Otherwise, the
-                 * NOADDR node will provide an invalid address in redirection and confuse
-                 * the clients, and the replica will never initiate a failover since the
-                 * node is not actually in FAIL state. */
+                 * its node ID. In this case, the old node ID will not come back. */
+                clusterNode *noaddr_node = link->node;
                 serverLog(LL_NOTICE,
                           "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, "
                           "having flags %d",
                           link->node->name, link->node->human_nodename, link->node->shard_id,
                           (int)(now - (link->node->ctime)), link->node->flags);
-                link->node->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
+                link->node->flags |= CLUSTER_NODE_NOADDR;
                 link->node->ip[0] = '\0';
                 link->node->tcp_port = 0;
                 link->node->tls_port = 0;
                 link->node->cport = 0;
                 freeClusterLink(link);
+                /* We will also mark the node as fail because we have disconnected from it,
+                 * and will not reconnect, and obviously we will not gossip NOADDR nodes.
+                 * Marking it as FAIL can help us advance the state, such as the cluster
+                 * state becomes FAIL or the replica can do the failover. Otherwise, the
+                 * NOADDR node will provide an invalid address in redirection and confuse
+                 * the clients, and the replica will never initiate a failover since the
+                 * node is not actually in FAIL state. */
+                if (!nodeFailed(noaddr_node)) {
+                    noaddr_node->flags |= CLUSTER_NODE_FAIL;
+                    noaddr_node->fail_time = now;
+                    clusterSendFail(noaddr_node->name);
+                }
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
                 return 0;
             }

From 80651b3481b2bc2f5c604b099aedc3122be68a52 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sun, 17 Nov 2024 17:34:36 +0800
Subject: [PATCH 5/5] set fail_time when maring FAIL and cleanup the test code

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c          | 1 +
 tests/unit/cluster/noaddr.tcl | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 700ebfa6c3..6ed8cacb0a 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -655,6 +655,7 @@ int clusterLoadConfig(char *filename) {
                 n->flags |= CLUSTER_NODE_HANDSHAKE;
             } else if (!strcasecmp(s, "noaddr")) {
                 n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL);
+                n->fail_time = mstime();
             } else if (!strcasecmp(s, "nofailover")) {
                 n->flags |= CLUSTER_NODE_NOFAILOVER;
             } else if (!strcasecmp(s, "noflags")) {
diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl
index 8ad2effbfa..fb4e501809 100644
--- a/tests/unit/cluster/noaddr.tcl
+++ b/tests/unit/cluster/noaddr.tcl
@@ -2,13 +2,16 @@ start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-fa
     test "NOADDR nodes will be marked as FAIL" {
         set primary0_id [R 0 CLUSTER MYID]
 
-        # R 0 is a primary, doing a CLUSTER RESET and the node name will be modified,
-        # and other nodes will set it to NOADDR.
+        # R 0 is a primary, after doing a CLUSTER RESET, the node name will be modified,
+        # and other nodes will set it to NOADDR and FAIL.
         R 0 cluster reset hard
         wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10
+        wait_for_log_messages -2 {"*PONG contains mismatching sender ID*"} 0 1000 10
         wait_for_condition 1000 10 {
             [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 &&
-            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1
+            [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1 &&
+            [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] noaddr] eq 1 &&
+            [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] fail] eq 1
         } else {
             fail "The node is not marked with the correct flag"
         }