From 96a30728f36e6ef0b90459725584836f07af65b0 Mon Sep 17 00:00:00 2001 From: narvikd <84069271+narvikd@users.noreply.github.com> Date: Mon, 27 Feb 2023 20:50:04 +0100 Subject: [PATCH 1/2] feat: Add resilience in case the cluster falls below 1 node Signed-off-by: narvikd <84069271+narvikd@users.noreply.github.com> --- cluster/consensus/consensus.go | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cluster/consensus/consensus.go b/cluster/consensus/consensus.go index a4173f1..8bc3ded 100644 --- a/cluster/consensus/consensus.go +++ b/cluster/consensus/consensus.go @@ -2,6 +2,7 @@ package consensus import ( "errors" + "fmt" "github.com/hashicorp/go-hclog" "github.com/hashicorp/raft" "github.com/hashicorp/raft-boltdb/v2" @@ -267,11 +268,32 @@ func (n *Node) waitForClusterReadiness() error { if currentTry > maxRetryCount { return errors.New("quorum retry max reached") } + if n.IsQuorumPossible(true) { n.logger.Info("quorum possible.") break } - n.logger.Error("it is not possible to reach Quorum due to lack of nodes. Retrying...") + + leader, errLeader := discover.SearchLeader(n.ID) + if errLeader != nil { + msg := fmt.Sprintf("it isn't possible to reach Quorum due to lack of nodes. "+ + "Tried to search for a leader to join an existent consensus. "+ + "Leader not found: %v", + errLeader.Error(), + ) + n.logger.Error(msg) + n.logger.Error("it isn't possible to reach Quorum due to lack of nodes and leader not available. 
Retrying...") + continue + } + + errJoin := cluster.ConsensusJoin(n.ID, config.MakeConsensusAddr(n.ID), config.MakeGrpcAddress(leader)) + if errJoin != nil { + n.logger.Error("couldn't join existing consensus: " + errJoin.Error()) + } else { + n.logger.Info("joined existing consensus @ " + leader) + break + } + time.Sleep(sleepTime) } return nil From b9d33594592c4757ac0d2a170cc09ef20b873768 Mon Sep 17 00:00:00 2001 From: narvikd <84069271+narvikd@users.noreply.github.com> Date: Mon, 27 Feb 2023 21:29:39 +0100 Subject: [PATCH 2/2] Call reinstall Signed-off-by: narvikd <84069271+narvikd@users.noreply.github.com> --- cluster/consensus/consensus.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/cluster/consensus/consensus.go b/cluster/consensus/consensus.go index 8bc3ded..4b47b33 100644 --- a/cluster/consensus/consensus.go +++ b/cluster/consensus/consensus.go @@ -274,7 +274,7 @@ func (n *Node) waitForClusterReadiness() error { break } - leader, errLeader := discover.SearchLeader(n.ID) + _, errLeader := discover.SearchLeader(n.ID) if errLeader != nil { msg := fmt.Sprintf("it isn't possible to reach Quorum due to lack of nodes. "+ "Tried to search for a leader to join an existent consensus. "+ @@ -283,18 +283,12 @@ func (n *Node) waitForClusterReadiness() error { ) n.logger.Error(msg) n.logger.Error("it isn't possible to reach Quorum due to lack of nodes and leader not available. Retrying...") + time.Sleep(sleepTime) continue } - errJoin := cluster.ConsensusJoin(n.ID, config.MakeConsensusAddr(n.ID), config.MakeGrpcAddress(leader)) - if errJoin != nil { - n.logger.Error("couldn't join existing consensus: " + errJoin.Error()) - } else { - n.logger.Info("joined existing consensus @ " + leader) - break - } - - time.Sleep(sleepTime) + n.logger.Warn("existent leader found, reinstalling....") + n.ReinstallNode() } return nil }