Make maxUnhealthy count configurable for control plane and worker machines
aws · Jan 16, 2024 · 0e30882 · 0e30882
1 parent 9bb82e1
commit 0e30882
Show file tree

Hide file tree

Showing 21 changed files with 433 additions and 38 deletions.
diff --git a/cmd/eksctl-anywhere/cmd/constants.go b/cmd/eksctl-anywhere/cmd/constants.go
@@ -9,6 +9,7 @@ const (
 	unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout"
 	nodeStartupTimeoutFlag      = "node-startup-timeout"
 	noTimeoutsFlag              = "no-timeouts"
+	maxUnhealthyFlag            = "max-unhealthy"
 )
 
 type Operation int

diff --git a/cmd/eksctl-anywhere/cmd/createcluster.go b/cmd/eksctl-anywhere/cmd/createcluster.go
@@ -12,6 +12,7 @@ import (
 	"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
 	"github.com/aws/eks-anywhere/pkg/awsiamauth"
 	"github.com/aws/eks-anywhere/pkg/clustermanager"
+	"github.com/aws/eks-anywhere/pkg/constants"
 	"github.com/aws/eks-anywhere/pkg/dependencies"
 	"github.com/aws/eks-anywhere/pkg/executables"
 	"github.com/aws/eks-anywhere/pkg/features"
@@ -29,6 +30,7 @@ import (
 type createClusterOptions struct {
 	clusterOptions
 	timeoutOptions
+	maxUnhealthy          string
 	forceClean            bool
 	skipIpCheck           bool
 	hardwareCSVPath       string
@@ -61,6 +63,7 @@ func init() {
 	createCmd.AddCommand(createClusterCmd)
 	applyClusterOptionFlags(createClusterCmd.Flags(), &cc.clusterOptions)
 	applyTimeoutFlags(createClusterCmd.Flags(), &cc.timeoutOptions)
+	createClusterCmd.Flags().StringVar(&cc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage")
 	applyTinkerbellHardwareFlag(createClusterCmd.Flags(), &cc.hardwareCSVPath)
 	aflag.String(aflag.TinkerbellBootstrapIP, &cc.tinkerbellBootstrapIP, createClusterCmd.Flags())
 	createClusterCmd.Flags().BoolVar(&cc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")

diff --git a/cmd/eksctl-anywhere/cmd/options.go b/cmd/eksctl-anywhere/cmd/options.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/spf13/pflag"
+	"k8s.io/apimachinery/pkg/util/intstr"
 
 	"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
 	"github.com/aws/eks-anywhere/pkg/cluster"
@@ -195,6 +196,7 @@ func buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC
 
 	createCliConfig.NodeStartupTimeout = nodeStartupTimeout
 	createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
+	createCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy)
 
 	return createCliConfig, nil
 }
@@ -221,6 +223,7 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra
 
 	upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout
 	upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
+	upgradeCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy)
 
 	return &upgradeCliConfig, nil
 }

diff --git a/cmd/eksctl-anywhere/cmd/upgradecluster.go b/cmd/eksctl-anywhere/cmd/upgradecluster.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/aws/eks-anywhere/cmd/eksctl-anywhere/cmd/aflag"
 	"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
+	"github.com/aws/eks-anywhere/pkg/constants"
 	"github.com/aws/eks-anywhere/pkg/dependencies"
 	"github.com/aws/eks-anywhere/pkg/features"
 	"github.com/aws/eks-anywhere/pkg/kubeconfig"
@@ -26,6 +27,7 @@ import (
 type upgradeClusterOptions struct {
 	clusterOptions
 	timeoutOptions
+	maxUnhealthy          string
 	wConfig               string
 	forceClean            bool
 	hardwareCSVPath       string
@@ -68,12 +70,12 @@ func init() {
 	upgradeCmd.AddCommand(upgradeClusterCmd)
 	applyClusterOptionFlags(upgradeClusterCmd.Flags(), &uc.clusterOptions)
 	applyTimeoutFlags(upgradeClusterCmd.Flags(), &uc.timeoutOptions)
+	upgradeClusterCmd.Flags().StringVar(&uc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage")
 	applyTinkerbellHardwareFlag(upgradeClusterCmd.Flags(), &uc.hardwareCSVPath)
 	upgradeClusterCmd.Flags().StringVarP(&uc.wConfig, "w-config", "w", "", "Kubeconfig file to use when upgrading a workload cluster")
 	upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
 	hideForceCleanup(upgradeClusterCmd.Flags())
 	upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ",")))
-
 	aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name)
 	tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC)
 }

diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
@@ -186,6 +186,41 @@ spec:
                       name:
                         type: string
                     type: object
+                  machineHealthCheck:
+                    description: MachineHealthCheck is a control-plane level override
+                      for the timeouts and maxUnhealthy specified in the top-level
+                      MHC configuration. If not configured, the defaults in the top-level
+                      MHC configuration are used.
+                    properties:
+                      maxUnhealthy:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: MaxUnhealthy is used to configure the maximum
+                          number of unhealthy machines in machine health checks. This
+                          setting applies to both control plane and worker machines.
+                          If the number of unhealthy machines exceeds the limit set
+                          by maxUnhealthy, further remediation will not be performed.
+                          If not configured, the default value is set to "100%".
+                        x-kubernetes-int-or-string: true
+                      nodeStartupTimeout:
+                        description: NodeStartupTimeout is used to configure the node
+                          startup timeout in machine health checks. It determines
+                          how long a MachineHealthCheck should wait for a Node to
+                          join the cluster, before considering a Machine unhealthy.
+                          If not configured, the default value is set to "10m0s" (10
+                          minutes) for all providers. For Tinkerbell provider the
+                          default is "20m0s".
+                        type: string
+                      unhealthyMachineTimeout:
+                        description: UnhealthyMachineTimeout is used to configure
+                          the unhealthy machine timeout in machine health checks.
+                          If any unhealthy conditions are met for the amount of time
+                          specified as the timeout, the machines are considered unhealthy.
+                          If not configured, the default value is set to "5m0s" (5
+                          minutes).
+                        type: string
+                    type: object
                   skipLoadBalancerDeployment:
                     description: SkipLoadBalancerDeployment skip deploying control
                       plane load balancer. Make sure your infrastructure can handle
@@ -342,6 +377,17 @@ spec:
                   to wait to remediate unhealthy machine or determine health of nodes'
                   machines.
                 properties:
+                  maxUnhealthy:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    description: MaxUnhealthy is used to configure the maximum number
+                      of unhealthy machines in machine health checks. This setting
+                      applies to both control plane and worker machines. If the number
+                      of unhealthy machines exceeds the limit set by maxUnhealthy,
+                      further remediation will not be performed. If not configured,
+                      the default value is set to "100%".
+                    x-kubernetes-int-or-string: true
                   nodeStartupTimeout:
                     description: NodeStartupTimeout is used to configure the node
                       startup timeout in machine health checks. It determines how
@@ -534,6 +580,42 @@ spec:
                         name:
                           type: string
                       type: object
+                    machineHealthCheck:
+                      description: MachineHealthCheck is a control-plane level override
+                        for the timeouts and maxUnhealthy specified in the top-level
+                        MHC configuration. If not configured, the defaults in the
+                        top-level MHC configuration are used.
+                      properties:
+                        maxUnhealthy:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          description: MaxUnhealthy is used to configure the maximum
+                            number of unhealthy machines in machine health checks.
+                            This setting applies to both control plane and worker
+                            machines. If the number of unhealthy machines exceeds
+                            the limit set by maxUnhealthy, further remediation will
+                            not be performed. If not configured, the default value
+                            is set to "100%".
+                          x-kubernetes-int-or-string: true
+                        nodeStartupTimeout:
+                          description: NodeStartupTimeout is used to configure the
+                            node startup timeout in machine health checks. It determines
+                            how long a MachineHealthCheck should wait for a Node to
+                            join the cluster, before considering a Machine unhealthy.
+                            If not configured, the default value is set to "10m0s"
+                            (10 minutes) for all providers. For Tinkerbell provider
+                            the default is "20m0s".
+                          type: string
+                        unhealthyMachineTimeout:
+                          description: UnhealthyMachineTimeout is used to configure
+                            the unhealthy machine timeout in machine health checks.
+                            If any unhealthy conditions are met for the amount of
+                            time specified as the timeout, the machines are considered
+                            unhealthy. If not configured, the default value is set
+                            to "5m0s" (5 minutes).
+                          type: string
+                      type: object
                     name:
                       description: Name refers to the name of the worker node group
                       type: string

diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml
@@ -46,8 +46,6 @@ spec:
                   nodes and ignored for worker nodes.
                 type: boolean
               kubernetesVersion:
-                description: 'TODO(in-place): Determine if there''s a way to get these
-                  dynamically instead of expecting it from the CRD.'
                 type: string
               machine:
                 description: Machine is a reference to the CAPI Machine that needs