m/n/c/consensus: fix startup after removing a cluster node
The consensus service was waiting for all initial peers to be DNS
resolvable before starting etcd. However, the list of initial peers is
never updated. If an etcd member is removed from the cluster, it is no
longer resolvable, but may still be contained in initial peer lists. The
consensus service then fails to start, as it is blocked forever waiting
for the removed peer to become resolvable.
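
For illustration only, a minimal self-contained sketch of the failure mode described above (the actual removed loop is in the consensus.go diff below; the hostname here is made up): net.LookupIP keeps failing for a member whose DNS record is gone, so an unbounded retry loop never exits.

package main

import (
	"fmt"
	"net"
	"time"
)

func main() {
	// Hypothetical hostname of an etcd member that was removed from the
	// cluster; its DNS record no longer exists, so LookupIP keeps failing.
	peer := "removed-node.cluster.internal"
	for {
		if _, err := net.LookupIP(peer); err == nil {
			break // never reached once the record is permanently gone
		}
		fmt.Printf("still waiting for %s to resolve...\n", peer)
		time.Sleep(100 * time.Millisecond)
	}
	// Never reached: etcd startup stays blocked, which is the bug fixed here.
	fmt.Println("starting etcd")
}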

The wait for resolvability was added in c1cb37c with this
explanation:

> It also makes the consensus service wait for DNS resolvability before
> attempting to join an existing cluster, which makes etcd startup much
> cleaner (as etcd will itself crash if it cannot immediately resolve
> its ExistingPeers in startup).

This does not appear to be needed anymore. I did not observe etcd
crashes after removing the wait for resolvability.
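
As a hedged sketch of what startup looks like without the pre-check, using etcd's embedded server API (the member names, URLs and paths below are made up, and this is not the repository's actual config builder): the peer list is handed straight to etcd, and resolvability is left to etcd itself.

package main

import (
	"log"
	"strings"

	"go.etcd.io/etcd/server/v3/embed"
)

func main() {
	// Hypothetical initial cluster string: the joining member itself plus the
	// existing peers, some of which may be temporarily unresolvable.
	initialCluster := strings.Join([]string{
		"node-a=https://node-a.cluster.internal:2380",
		"node-b=https://node-b.cluster.internal:2380",
		"node-c=https://node-c.cluster.internal:2380",
	}, ",")

	cfg := embed.NewConfig()
	cfg.Name = "node-c"
	cfg.Dir = "/tmp/etcd-data"
	cfg.InitialCluster = initialCluster
	cfg.ClusterState = embed.ClusterStateFlagExisting

	// No separate DNS pre-check before this call; per the observation above,
	// etcd does not simply crash if a peer cannot be resolved right away.
	e, err := embed.StartEtcd(cfg)
	if err != nil {
		log.Fatalf("failed to start etcd: %v", err)
	}
	defer e.Close()
}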

I extended the e2e test to cover this scenario. After removing the
consensus role, it also deletes the node and reboots the remaining
nodes. I moved these tests to the ha_cold suite, because with encryption
enabled, we currently cannot reboot a node in a 2-node cluster.

Change-Id: If811c79ea127550fa9ca750014272fa885767c77
Reviewed-on: https://review.monogon.dev/c/monogon/+/3454
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <[email protected]>
jscissr committed Sep 26, 2024
1 parent 1dcede9 commit d5538b5
Showing 5 changed files with 74 additions and 85 deletions.
37 changes: 0 additions & 37 deletions metropolis/node/core/consensus/consensus.go
@@ -258,43 +258,6 @@ func (s *Service) Run(ctx context.Context) error {
}
}

// If we're joining a cluster, make sure that our peers are actually DNS
// resolvable. This prevents us from immediately failing due to transient DNS
// issues.
if jc := s.config.JoinCluster; jc != nil {
supervisor.Logger(ctx).Infof("Waiting for initial peers to be DNS resolvable...")
startLogging := time.Now().Add(5 * time.Second)
for {
allOkay := true
shouldLog := time.Now().After(startLogging)
for _, node := range jc.ExistingNodes {
u, err := url.Parse(node.URL)
if err != nil {
// Just pretend this node is up. If the URL is really bad, etcd will complain
// more clearly than us. This shouldn't happen, anyway.
continue
}
host := u.Hostname()
if _, err := net.LookupIP(host); err == nil {
continue
}
if shouldLog {
supervisor.Logger(ctx).Errorf("Still can't resolve peer %s (%s): %v", node.Name, host, err)
}
allOkay = false
}
if allOkay {
supervisor.Logger(ctx).Infof("All peers resolvable, continuing startup.")
break
}

time.Sleep(100 * time.Millisecond)
if shouldLog {
startLogging = time.Now().Add(5 * time.Second)
}
}
}

// Start etcd ...
supervisor.Logger(ctx).Infof("Starting etcd...")
cfg := s.config.build(true)
2 changes: 0 additions & 2 deletions metropolis/test/e2e/suites/ha/BUILD.bazel
@@ -16,8 +16,6 @@ go_test(
"xTestImagesManifestPath": "$(rlocationpath //metropolis/test/e2e:testimages_manifest )",
},
deps = [
"//metropolis/node/core/curator/proto/api",
"//metropolis/proto/api",
"//metropolis/test/launch",
"//metropolis/test/localregistry",
"//metropolis/test/util",
46 changes: 0 additions & 46 deletions metropolis/test/e2e/suites/ha/run_test.go
@@ -13,9 +13,6 @@ import (
"source.monogon.dev/metropolis/test/localregistry"
"source.monogon.dev/metropolis/test/util"
"source.monogon.dev/osbase/test/launch"

cpb "source.monogon.dev/metropolis/node/core/curator/proto/api"
apb "source.monogon.dev/metropolis/proto/api"
)

var (
@@ -108,47 +105,4 @@ func TestE2ECoreHA(t *testing.T) {
return nil
})
}

// Test node role removal.
curC, err := cluster.CuratorClient()
if err != nil {
t.Fatalf("Could not get CuratorClient: %v", err)
}
mgmt := apb.NewManagementClient(curC)
cur := cpb.NewCuratorClient(curC)

util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
fa := false
_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
Node: &apb.UpdateNodeRolesRequest_Id{
Id: cluster.NodeIDs[0],
},
KubernetesController: &fa,
})
return err
})
util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
fa := false
_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
Node: &apb.UpdateNodeRolesRequest_Id{
Id: cluster.NodeIDs[0],
},
ConsensusMember: &fa,
})
return err
})

// Test that removing the ConsensusMember role from a node removed the
// corresponding etcd member from the cluster.
var st *cpb.GetConsensusStatusResponse
util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
st, err = cur.GetConsensusStatus(ctx, &cpb.GetConsensusStatusRequest{})
return err
})

for _, member := range st.EtcdMember {
if member.Id == cluster.NodeIDs[0] {
t.Errorf("member still present in etcd")
}
}
}
2 changes: 2 additions & 0 deletions metropolis/test/e2e/suites/ha_cold/BUILD.bazel
@@ -15,6 +15,8 @@ go_test(
"resources:ram:7000",
],
deps = [
"//metropolis/node/core/curator/proto/api",
"//metropolis/proto/api",
"//metropolis/proto/common",
"//metropolis/test/launch",
"//metropolis/test/util",
72 changes: 72 additions & 0 deletions metropolis/test/e2e/suites/ha_cold/run_test.go
@@ -10,6 +10,8 @@ import (
"source.monogon.dev/metropolis/test/util"
"source.monogon.dev/osbase/test/launch"

ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
apb "source.monogon.dev/metropolis/proto/api"
cpb "source.monogon.dev/metropolis/proto/common"
)

@@ -83,4 +85,74 @@ func TestE2EColdStartHA(t *testing.T) {
}
// Check if the cluster comes back up.
util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)

// Test node role removal.
curC, err := cluster.CuratorClient()
if err != nil {
t.Fatalf("Could not get CuratorClient: %v", err)
}
mgmt := apb.NewManagementClient(curC)
cur := ipb.NewCuratorClient(curC)

util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
fa := false
_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
Node: &apb.UpdateNodeRolesRequest_Id{
Id: cluster.NodeIDs[0],
},
KubernetesController: &fa,
})
return err
})
util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
fa := false
_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
Node: &apb.UpdateNodeRolesRequest_Id{
Id: cluster.NodeIDs[0],
},
ConsensusMember: &fa,
})
return err
})

// Test that removing the ConsensusMember role from a node removed the
// corresponding etcd member from the cluster.
var st *ipb.GetConsensusStatusResponse
util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
st, err = cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
return err
})

for _, member := range st.EtcdMember {
if member.Id == cluster.NodeIDs[0] {
t.Errorf("member still present in etcd")
}
}

// Test that the cluster still works after deleting the first node and
// restarting the remaining nodes.
util.MustTestEventual(t, "Delete first node", ctx, 10*time.Second, func(ctx context.Context) error {
_, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
Node: &apb.DeleteNodeRequest_Id{
Id: cluster.NodeIDs[0],
},
SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
})
return err
})

// Shut every remaining node down.
for i := 1; i < clusterOptions.NumNodes; i++ {
if err := cluster.ShutdownNode(i); err != nil {
t.Fatalf("Could not shutdown node %d", i)
}
}
// Start every remaining node back up.
for i := 1; i < clusterOptions.NumNodes; i++ {
if err := cluster.StartNode(i); err != nil {
t.Fatalf("Could not shutdown node %d", i)
}
}
// Check if the cluster comes back up.
util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
}
