Merge pull request rancher#43753 from slickwarren/cwarren/2.8/unhealthy-node-timeout

[v2.8] adding machineConfig and test for AutoReplace RKE2/K3s
slickwarren authored Mar 20, 2024
2 parents 5348f98 + a8ea25a commit 69990ef
Showing 12 changed files with 255 additions and 70 deletions.
2 changes: 1 addition & 1 deletion go.mod
@@ -166,7 +166,7 @@ require (
github.com/containers/image/v5 v5.25.0
github.com/google/gnostic-models v0.6.8
github.com/rancher/rancher/pkg/apis v0.0.0-20240213233515-935d309ebad4
-github.com/rancher/shepherd v0.0.0-20240307235216-45526fb052a5
+github.com/rancher/shepherd v0.0.0-20240319182018-6c29a53b0a23
go.qase.io/client v0.0.0-20231114201952-65195ec001fa
)

4 changes: 2 additions & 2 deletions go.sum
@@ -1641,8 +1641,8 @@ github.com/rancher/remotedialer v0.3.0 h1:y1EO8JCsgZo0RcqTUp6U8FXcBAv27R+TLnWRcp
github.com/rancher/remotedialer v0.3.0/go.mod h1:BwwztuvViX2JrLLUwDlsYt5DiyUwHLlzynRwkZLAY0Q=
github.com/rancher/rke v1.5.7-rc3 h1:UyPGHCE3m69A2UoIQ4VKoXIKYrM593XCgw5USVL741Y=
github.com/rancher/rke v1.5.7-rc3/go.mod h1:+lcRKCxBLtfaSZQ9Q+BA82cHhSImF62mfElqJvHJUls=
-github.com/rancher/shepherd v0.0.0-20240307235216-45526fb052a5 h1:iONiMurJulhfFufIUwOYqTnuBlVffa9DHvaeA4ZEjl8=
-github.com/rancher/shepherd v0.0.0-20240307235216-45526fb052a5/go.mod h1:RZBKxW7aL5Pio4F7KLqVVBWoaR20QzvTf3DFWrJzmjU=
+github.com/rancher/shepherd v0.0.0-20240319182018-6c29a53b0a23 h1:UryN2bkdEsFJRYKJAGtxD795zpo9CkN9Pk627gi8lbM=
+github.com/rancher/shepherd v0.0.0-20240319182018-6c29a53b0a23/go.mod h1:RZBKxW7aL5Pio4F7KLqVVBWoaR20QzvTf3DFWrJzmjU=
github.com/rancher/steve v0.0.0-20240305150728-3943409601f1 h1:6wNYy3q9jget45syTN6K2uOLSYaptLYCHscY2WRmhDI=
github.com/rancher/steve v0.0.0-20240305150728-3943409601f1/go.mod h1:o4vLBzMTKbHHhIiAcbgOiaN3aK1vIjL6ZTgaGxQYpsY=
github.com/rancher/system-upgrade-controller/pkg/apis v0.0.0-20210727200656-10b094e30007 h1:ru+mqGnxMmKeU0Q3XIDxkARvInDIqT1hH2amTcsjxI4=
13 changes: 12 additions & 1 deletion tests/v2/validation/nodescaling/README.md
@@ -106,4 +106,15 @@ These tests utilize Go build tags. Due to this, see the below examples on how to
`gotestsum --format standard-verbose --packages=github.com/rancher/rancher/tests/v2/validation/nodescaling --junitfile results.xml -- -timeout=60m -tags=validation -v -run "TestGKENodeScalingTestSuite/TestScalingGKENodePools"` \
`gotestsum --format standard-verbose --packages=github.com/rancher/rancher/tests/v2/validation/nodescaling --junitfile results.xml -- -timeout=60m -tags=validation -v -run "TestGKENodeScalingTestSuite/TestScalingGKENodePoolsDynamicInput"`

If the specified test passes immediately without warning, add the `-count=1` flag to force a fresh run; this prevents cached results from a previous run from interfering with the new test run.


## Auto Replacing Nodes
If UnhealthyNodeTimeout is set on your machine pools, auto_replace_test.go will replace a single node with the given role. There are static tests for the etcd, control plane, and worker roles.

If UnhealthyNodeTimeout is not set, the test(s) in this suite will wait until the default cluster upgrade timeout (30 minutes) is reached, expecting the node to remain in an error state; this serves as a negative test.

Each test requires 2 or more nodes in the pool for the specified role, i.e. running the entire suite requires a minimum of 3 etcd, 2 control plane, and 2 worker nodes. A machine pool configuration with these settings is sketched below.
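
For reference, here is a minimal sketch (not part of this diff) of a machine pool that opts into auto-replace, using the provisioningInput format from the RKE2/K3s provisioning READMEs; the quantities and timeout values below are illustrative only:

```yaml
provisioningInput:
  machinePools:
  - machinePoolConfig:
      etcd: true
      controlplane: true
      worker: true
      quantity: 3
      # nodes that stay unreachable past unhealthyNodeTimeout are expected to be deleted and recreated
      nodeStartupTimeout: "600s"
      unhealthyNodeTimeout: "300s"
```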

### RKE2 | K3S
`gotestsum --format standard-verbose --packages=github.com/rancher/rancher/tests/v2/validation/nodescaling --junitfile results.xml -- -timeout=60m -tags=validation -v -run "TestAutoReplaceSuite/TestEtcdAutoReplaceRKE2K3S"`
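
The control plane and worker variants of the suite should be runnable the same way, substituting the subtest name (suite and subtest names taken from auto_replace_test.go below):

`gotestsum --format standard-verbose --packages=github.com/rancher/rancher/tests/v2/validation/nodescaling --junitfile results.xml -- -timeout=60m -tags=validation -v -run "TestAutoReplaceSuite/TestControlPlaneAutoReplaceRKE2K3S"` \
`gotestsum --format standard-verbose --packages=github.com/rancher/rancher/tests/v2/validation/nodescaling --junitfile results.xml -- -timeout=60m -tags=validation -v -run "TestAutoReplaceSuite/TestWorkerAutoReplaceRKE2K3S"`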
61 changes: 61 additions & 0 deletions tests/v2/validation/nodescaling/auto_replace_test.go
@@ -0,0 +1,61 @@
//go:build (validation || extended) && !infra.any && !infra.aks && !infra.eks && !infra.gke && !infra.rke2k3s && !cluster.any && !cluster.custom && !cluster.nodedriver && !sanity && !stress

package nodescaling

import (
"testing"

"github.com/rancher/shepherd/clients/rancher"
"github.com/rancher/shepherd/extensions/rancherversion"
"github.com/rancher/shepherd/pkg/config"
"github.com/rancher/shepherd/pkg/session"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
)

const (
fleetNamespace = "fleet-default"
deletingState = "deleting"
machineNameAnnotation = "cluster.x-k8s.io/machine"
)

type AutoReplaceSuite struct {
suite.Suite
client *rancher.Client
session *session.Session
rancherConfig *rancherversion.Config
}

func (s *AutoReplaceSuite) TearDownSuite() {
s.session.Cleanup()
}

func (s *AutoReplaceSuite) SetupSuite() {
testSession := session.NewSession()
s.session = testSession

rancherConfig := new(rancherversion.Config)
config.LoadConfig(rancherversion.ConfigurationFileKey, rancherConfig)
s.rancherConfig = rancherConfig

client, err := rancher.NewClient("", testSession)
require.NoError(s.T(), err)

s.client = client
}

func (s *AutoReplaceSuite) TestEtcdAutoReplaceRKE2K3S() {
AutoReplaceFirstNodeWithRole(s.T(), s.client, s.client.RancherConfig.ClusterName, "etcd")
}

func (s *AutoReplaceSuite) TestControlPlaneAutoReplaceRKE2K3S() {
AutoReplaceFirstNodeWithRole(s.T(), s.client, s.client.RancherConfig.ClusterName, "control-plane")
}

func (s *AutoReplaceSuite) TestWorkerAutoReplaceRKE2K3S() {
AutoReplaceFirstNodeWithRole(s.T(), s.client, s.client.RancherConfig.ClusterName, "worker")
}

func TestAutoReplaceSuite(t *testing.T) {
suite.Run(t, new(AutoReplaceSuite))
}
125 changes: 125 additions & 0 deletions tests/v2/validation/nodescaling/replace.go
@@ -1,21 +1,37 @@
package nodescaling

import (
"errors"
"net/url"
"strings"
"testing"
"time"

"github.com/rancher/norman/types"
provv1 "github.com/rancher/rancher/pkg/apis/provisioning.cattle.io/v1"
"github.com/rancher/shepherd/clients/rancher"
management "github.com/rancher/shepherd/clients/rancher/generated/management/v3"
steveV1 "github.com/rancher/shepherd/clients/rancher/v1"
"github.com/rancher/shepherd/extensions/clusters"
"github.com/rancher/shepherd/extensions/defaults"
"github.com/rancher/shepherd/extensions/nodes"
nodestat "github.com/rancher/shepherd/extensions/nodes"
"github.com/rancher/shepherd/extensions/provisioninginput"
"github.com/rancher/shepherd/extensions/sshkeys"
"github.com/rancher/shepherd/extensions/workloads/pods"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/crypto/ssh"
)

const (
shutdownCommand = "sudo shutdown -h now"
controlPlane = "control-plane"
etcd = "etcd"
worker = "worker"

unreachableCondition = "NodeStatusUnknown"
namespace = "fleet-default"
ProvisioningSteveResouceType = "provisioning.cattle.io.cluster"
machineSteveResourceType = "cluster.x-k8s.io.machine"
@@ -95,3 +111,112 @@ func ReplaceRKE1Nodes(t *testing.T, client *rancher.Client, clusterName string,
assert.Empty(t, podErrors)
}
}

// shutdownFirstNodeWithRole uses SSH to shut down the first node matching the specified role in the given cluster.
func shutdownFirstNodeWithRole(client *rancher.Client, stevecluster *steveV1.SteveAPIObject, clusterID, nodeRole string) (*steveV1.SteveAPIObject, error) {
steveclient, err := client.Steve.ProxyDownstream(clusterID)
if err != nil {
return nil, err
}

query, err := url.ParseQuery("labelSelector=node-role.kubernetes.io/" + nodeRole + "=true")
if err != nil {
return nil, err
}

nodeList, err := steveclient.SteveType("node").List(query)
if err != nil {
return nil, err
}

firstMachine := nodeList.Data[0]

sshUser, err := sshkeys.GetSSHUser(client, stevecluster)
if err != nil {
return nil, err
}

if sshUser == "" {
return nil, errors.New("sshUser does not exist")
}

sshNode, err := sshkeys.GetSSHNodeFromMachine(client, sshUser, &firstMachine)
if err != nil {
return nil, err
}

logrus.Infof("Running node auto-replace on node %s", firstMachine.Name)

// Shut down the node over SSH, outside of Rancher, to simulate an unhealthy node.
// The shutdown kills the SSH session before an exit status is returned, so an
// ExitMissingError is expected here; errors.As is used because errors.Is would
// never match a freshly constructed *ssh.ExitMissingError.
_, err = sshNode.ExecuteCommand(shutdownCommand)
var exitMissingErr *ssh.ExitMissingError
if err != nil && !errors.As(err, &exitMissingErr) {
return nil, err
}

return &firstMachine, nil
}

// matchNodeToMachinePool takes a given node name and returns the cluster's first matching machinePool from its RKEConfig, if any.
func matchNodeToMachinePool(client *rancher.Client, clusterObject *steveV1.SteveAPIObject, nodeName string) (*provv1.RKEMachinePool, error) {
clusterSpec := &provv1.ClusterSpec{}
err := steveV1.ConvertToK8sType(clusterObject.Spec, clusterSpec)
if err != nil {
return nil, err
}

for _, pool := range clusterSpec.RKEConfig.MachinePools {
if strings.Contains(nodeName, "-"+pool.Name+"-") {
return &pool, nil
}
}

return nil, errors.New("could not find matching machine pool for this node")
}

// AutoReplaceFirstNodeWithRole sshes into the first node with the specified role and shuts it down. If the node is replaceable,
// it waits for the cluster to return to a healthy state. Otherwise, the cluster is expected to never return to active, as the node will remain unreachable.
func AutoReplaceFirstNodeWithRole(t *testing.T, client *rancher.Client, clusterName, nodeRole string) {
clusterID, err := clusters.GetClusterIDByName(client, clusterName)
require.NoError(t, err)

_, stevecluster, err := clusters.GetProvisioningClusterByName(client, clusterName, provisioninginput.Namespace)
require.NoError(t, err)

machine, err := shutdownFirstNodeWithRole(client, stevecluster, clusterID, nodeRole)
require.NoError(t, err)

machinePool, err := matchNodeToMachinePool(client, stevecluster, machine.Name)
require.NoError(t, err)

if nodeRole == controlPlane || nodeRole == etcd {
err = clusters.WaitClusterToBeUpgraded(client, clusterID)
if machinePool.UnhealthyNodeTimeout.String() == "0s" {
require.Error(t, err, "UnhealthyNodeTimeout set to 0s, but node was replaced!")
return
}
require.NoError(t, err)
}

err = nodes.Isv1NodeConditionMet(client, machine.ID, clusterID, unreachableCondition)
if machinePool.UnhealthyNodeTimeout.String() == "0s" {
require.Error(t, err, "UnhealthyNodeTimeout set to 0s, but node was replaced!")
return
}
require.NoError(t, err)

steveclient, err := client.Steve.ProxyDownstream(clusterID)
require.NoError(t, err)

v1NodeList, err := steveclient.SteveType("node").List(nil)
require.NoError(t, err)

_, err = nodes.IsNodeReplaced(client, machine.Name, clusterID, len(v1NodeList.Data))
require.NoError(t, err)

// allow the pool's unhealthy-node timeout plus a 30-minute buffer for the replacement machine to become ready
err = nodes.AllMachineReady(client, clusterID, machinePool.UnhealthyNodeTimeout.Duration+(1800*time.Second))
require.NoError(t, err)

err = clusters.WaitClusterToBeUpgraded(client, clusterID)
require.NoError(t, err)
}
3 changes: 2 additions & 1 deletion tests/v2/validation/provisioning/README.md
@@ -22,4 +22,5 @@ From there, your config should contain the tests you want to run (provisioningIn
1. [RKE1 Provisioning](rke1/README.md)
2. [RKE2 Provisioning](rke2/README.md)
-3. [Hosted Provider Provisioning](hosted/README.md)
+3. [K3s Provisioning](k3s/README.md)
+4. [Hosted Provider Provisioning](hosted/README.md)
13 changes: 10 additions & 3 deletions tests/v2/validation/provisioning/k3s/README.md
@@ -23,16 +23,23 @@ provisioningInput is needed to run the K3S tests, specifically kubernetesVer
```yaml
provisioningInput:
  machinePools:
-  - nodeRoles:
+  - machinePoolConfig:
      etcd: true
      controlplane: true
      worker: true
      quantity: 1
-  - nodeRoles:
+      # the following are optional parameters
+      drainBeforeDelete: true
+      hostnameLengthLimit: 29
+      nodeStartupTimeout: "600s"
+      unhealthyNodeTimeout: "300s"
+      maxUnhealthy: "2"
+      unhealthyRange: "2-4"
+  - machinePoolConfig:
      worker: true
      quantity: 2
      drainBeforeDelete: true
-  - nodeRoles:
+  - machinePoolConfig:
      windows: true
      quantity: 1
  flags:
@@ -102,7 +102,7 @@ func (c *CustomClusterProvisioningTestSuite) TestProvisioningK3SCustomCluster()
func (c *CustomClusterProvisioningTestSuite) TestProvisioningK3SCustomClusterDynamicInput() {
isWindows := false
for _, pool := range c.provisioningConfig.MachinePools {
-if pool.NodeRoles.Windows {
+if pool.MachinePoolConfig.Windows {
isWindows = true
break
}
15 changes: 11 additions & 4 deletions tests/v2/validation/provisioning/rke2/README.md
@@ -23,16 +23,23 @@ provisioningInput is needed to run the RKE2 tests, specifically kubernetesVe
```yaml
provisioningInput:
  machinePools:
-  - nodeRoles:
+  - machinePoolConfig:
      etcd: true
      controlplane: true
      worker: true
      quantity: 1
-  - nodeRoles:
-      quantity: 5
+      # the following are optional parameters
+      drainBeforeDelete: true
+      hostnameLengthLimit: 29
+      nodeStartupTimeout: "600s"
+      unhealthyNodeTimeout: "300s"
+      maxUnhealthy: "2"
+      unhealthyRange: "2-4"
+  - machinePoolConfig:
      worker: true
+      quantity: 2
      drainBeforeDelete: true
-  - nodeRoles:
+  - machinePoolConfig:
      windows: true
      quantity: 1
  flags:
36 changes: 21 additions & 15 deletions tests/v2/validation/provisioning/rke2/ace_test.go
@@ -66,27 +66,33 @@ func (r *RKE2ACETestSuite) SetupSuite()
func (r *RKE2ACETestSuite) TestProvisioningRKE2ClusterACE() {
nodeRoles0 := []provisioninginput.MachinePools{
{
-NodeRoles: machinepools.NodeRoles{
-ControlPlane: true,
-Etcd: false,
-Worker: false,
-Quantity: 3,
+MachinePoolConfig: machinepools.MachinePoolConfig{
+NodeRoles: machinepools.NodeRoles{
+ControlPlane: true,
+Etcd: false,
+Worker: false,
+Quantity: 3,
+},
},
},
{
-NodeRoles: machinepools.NodeRoles{
-ControlPlane: false,
-Etcd: true,
-Worker: false,
-Quantity: 1,
+MachinePoolConfig: machinepools.MachinePoolConfig{
+NodeRoles: machinepools.NodeRoles{
+ControlPlane: false,
+Etcd: true,
+Worker: false,
+Quantity: 1,
+},
},
},
{
-NodeRoles: machinepools.NodeRoles{
-ControlPlane: false,
-Etcd: false,
-Worker: true,
-Quantity: 1,
+MachinePoolConfig: machinepools.MachinePoolConfig{
+NodeRoles: machinepools.NodeRoles{
+ControlPlane: false,
+Etcd: false,
+Worker: true,
+Quantity: 1,
+},
},
},
}
