Make maxUnhealthy count configurable for control plane and worker machines
aws · Feb 15, 2024 · e540c2e · e540c2e
1 parent a915a47
commit e540c2e
Show file tree

Hide file tree

Showing 18 changed files with 437 additions and 32 deletions.
diff --git a/cmd/eksctl-anywhere/cmd/options.go b/cmd/eksctl-anywhere/cmd/options.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/spf13/pflag"
+	"k8s.io/apimachinery/pkg/util/intstr"
 
 	"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
 	"github.com/aws/eks-anywhere/pkg/cluster"
@@ -195,6 +196,8 @@ func buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC
 
 	createCliConfig.NodeStartupTimeout = nodeStartupTimeout
 	createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
+	createCliConfig.MaxUnhealthy = intstr.Parse(constants.DefaultMaxUnhealthy)
+	createCliConfig.WorkerMaxUnhealthy = intstr.Parse(constants.DefaultWorkerMaxUnhealthy)
 
 	return createCliConfig, nil
 }
@@ -221,6 +224,8 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra
 
 	upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout
 	upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
+	upgradeCliConfig.MaxUnhealthy = intstr.Parse(constants.DefaultMaxUnhealthy)
+	upgradeCliConfig.WorkerMaxUnhealthy = intstr.Parse(constants.DefaultWorkerMaxUnhealthy)
 
 	return &upgradeCliConfig, nil
 }

diff --git a/cmd/eksctl-anywhere/cmd/upgradecluster.go b/cmd/eksctl-anywhere/cmd/upgradecluster.go
@@ -71,7 +71,6 @@ func init() {
 	upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
 	hideForceCleanup(upgradeClusterCmd.Flags())
 	upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ",")))
-
 	aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name)
 	tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC)
 }

diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
@@ -186,6 +186,42 @@ spec:
                       name:
                         type: string
                     type: object
+                  machineHealthCheck:
+                    description: MachineHealthCheck is a control-plane level override
+                      for the timeouts and maxUnhealthy specified in the top-level
+                      MHC configuration. If not configured, the defaults in the top-level
+                      MHC configuration are used.
+                    properties:
+                      maxUnhealthy:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: MaxUnhealthy is used to configure the maximum
+                          number of unhealthy machines in machine health checks. This
+                          setting applies to both control plane and worker machines.
+                          If the number of unhealthy machines exceeds the limit set
+                          by maxUnhealthy, further remediation will not be performed.
+                          If not configured, the default value is set to "100%" for
+                          controlplane machines and "40%" for worker machines.
+                        x-kubernetes-int-or-string: true
+                      nodeStartupTimeout:
+                        description: NodeStartupTimeout is used to configure the node
+                          startup timeout in machine health checks. It determines
+                          how long a MachineHealthCheck should wait for a Node to
+                          join the cluster, before considering a Machine unhealthy.
+                          If not configured, the default value is set to "10m0s" (10
+                          minutes) for all providers. For Tinkerbell provider the
+                          default is "20m0s".
+                        type: string
+                      unhealthyMachineTimeout:
+                        description: UnhealthyMachineTimeout is used to configure
+                          the unhealthy machine timeout in machine health checks.
+                          If any unhealthy conditions are met for the amount of time
+                          specified as the timeout, the machines are considered unhealthy.
+                          If not configured, the default value is set to "5m0s" (5
+                          minutes).
+                        type: string
+                    type: object
                   skipLoadBalancerDeployment:
                     description: SkipLoadBalancerDeployment skip deploying control
                       plane load balancer. Make sure your infrastructure can handle
@@ -344,6 +380,18 @@ spec:
                   to wait to remediate unhealthy machine or determine health of nodes'
                   machines.
                 properties:
+                  maxUnhealthy:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    description: MaxUnhealthy is used to configure the maximum number
+                      of unhealthy machines in machine health checks. This setting
+                      applies to both control plane and worker machines. If the number
+                      of unhealthy machines exceeds the limit set by maxUnhealthy,
+                      further remediation will not be performed. If not configured,
+                      the default value is set to "100%" for controlplane machines
+                      and "40%" for worker machines.
+                    x-kubernetes-int-or-string: true
                   nodeStartupTimeout:
                     description: NodeStartupTimeout is used to configure the node
                       startup timeout in machine health checks. It determines how
@@ -536,6 +584,43 @@ spec:
                         name:
                           type: string
                       type: object
+                    machineHealthCheck:
+                      description: MachineHealthCheck is a worker node level override
+                        for the timeouts and maxUnhealthy specified in the top-level
+                        MHC configuration. If not configured, the defaults in the
+                        top-level MHC configuration are used.
+                      properties:
+                        maxUnhealthy:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          description: MaxUnhealthy is used to configure the maximum
+                            number of unhealthy machines in machine health checks.
+                            This setting applies to both control plane and worker
+                            machines. If the number of unhealthy machines exceeds
+                            the limit set by maxUnhealthy, further remediation will
+                            not be performed. If not configured, the default value
+                            is set to "100%" for controlplane machines and "40%" for
+                            worker machines.
+                          x-kubernetes-int-or-string: true
+                        nodeStartupTimeout:
+                          description: NodeStartupTimeout is used to configure the
+                            node startup timeout in machine health checks. It determines
+                            how long a MachineHealthCheck should wait for a Node to
+                            join the cluster, before considering a Machine unhealthy.
+                            If not configured, the default value is set to "10m0s"
+                            (10 minutes) for all providers. For Tinkerbell provider
+                            the default is "20m0s".
+                          type: string
+                        unhealthyMachineTimeout:
+                          description: UnhealthyMachineTimeout is used to configure
+                            the unhealthy machine timeout in machine health checks.
+                            If any unhealthy conditions are met for the amount of
+                            time specified as the timeout, the machines are considered
+                            unhealthy. If not configured, the default value is set
+                            to "5m0s" (5 minutes).
+                          type: string
+                      type: object
                     name:
                       description: Name refers to the name of the worker node group
                       type: string

diff --git a/config/manifest/eksa-components.yaml b/config/manifest/eksa-components.yaml
@@ -3889,6 +3889,42 @@ spec:
                       name:
                         type: string
                     type: object
+                  machineHealthCheck:
+                    description: MachineHealthCheck is a control-plane level override
+                      for the timeouts and maxUnhealthy specified in the top-level
+                      MHC configuration. If not configured, the defaults in the top-level
+                      MHC configuration are used.
+                    properties:
+                      maxUnhealthy:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: MaxUnhealthy is used to configure the maximum
+                          number of unhealthy machines in machine health checks. This
+                          setting applies to both control plane and worker machines.
+                          If the number of unhealthy machines exceeds the limit set
+                          by maxUnhealthy, further remediation will not be performed.
+                          If not configured, the default value is set to "100%" for
+                          controlplane machines and "40%" for worker machines.
+                        x-kubernetes-int-or-string: true
+                      nodeStartupTimeout:
+                        description: NodeStartupTimeout is used to configure the node
+                          startup timeout in machine health checks. It determines
+                          how long a MachineHealthCheck should wait for a Node to
+                          join the cluster, before considering a Machine unhealthy.
+                          If not configured, the default value is set to "10m0s" (10
+                          minutes) for all providers. For Tinkerbell provider the
+                          default is "20m0s".
+                        type: string
+                      unhealthyMachineTimeout:
+                        description: UnhealthyMachineTimeout is used to configure
+                          the unhealthy machine timeout in machine health checks.
+                          If any unhealthy conditions are met for the amount of time
+                          specified as the timeout, the machines are considered unhealthy.
+                          If not configured, the default value is set to "5m0s" (5
+                          minutes).
+                        type: string
+                    type: object
                   skipLoadBalancerDeployment:
                     description: SkipLoadBalancerDeployment skip deploying control
                       plane load balancer. Make sure your infrastructure can handle
@@ -4047,6 +4083,18 @@ spec:
                   to wait to remediate unhealthy machine or determine health of nodes'
                   machines.
                 properties:
+                  maxUnhealthy:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    description: MaxUnhealthy is used to configure the maximum number
+                      of unhealthy machines in machine health checks. This setting
+                      applies to both control plane and worker machines. If the number
+                      of unhealthy machines exceeds the limit set by maxUnhealthy,
+                      further remediation will not be performed. If not configured,
+                      the default value is set to "100%" for controlplane machines
+                      and "40%" for worker machines.
+                    x-kubernetes-int-or-string: true
                   nodeStartupTimeout:
                     description: NodeStartupTimeout is used to configure the node
                       startup timeout in machine health checks. It determines how
@@ -4239,6 +4287,43 @@ spec:
                         name:
                           type: string
                       type: object
+                    machineHealthCheck:
+                      description: MachineHealthCheck is a worker node level override
+                        for the timeouts and maxUnhealthy specified in the top-level
+                        MHC configuration. If not configured, the defaults in the
+                        top-level MHC configuration are used.
+                      properties:
+                        maxUnhealthy:
+                          anyOf:
+                          - type: integer
+                          - type: string
+                          description: MaxUnhealthy is used to configure the maximum
+                            number of unhealthy machines in machine health checks.
+                            This setting applies to both control plane and worker
+                            machines. If the number of unhealthy machines exceeds
+                            the limit set by maxUnhealthy, further remediation will
+                            not be performed. If not configured, the default value
+                            is set to "100%" for controlplane machines and "40%" for
+                            worker machines.
+                          x-kubernetes-int-or-string: true
+                        nodeStartupTimeout:
+                          description: NodeStartupTimeout is used to configure the
+                            node startup timeout in machine health checks. It determines
+                            how long a MachineHealthCheck should wait for a Node to
+                            join the cluster, before considering a Machine unhealthy.
+                            If not configured, the default value is set to "10m0s"
+                            (10 minutes) for all providers. For Tinkerbell provider
+                            the default is "20m0s".
+                          type: string
+                        unhealthyMachineTimeout:
+                          description: UnhealthyMachineTimeout is used to configure
+                            the unhealthy machine timeout in machine health checks.
+                            If any unhealthy conditions are met for the amount of
+                            time specified as the timeout, the machines are considered
+                            unhealthy. If not configured, the default value is set
+                            to "5m0s" (5 minutes).
+                          type: string
+                      type: object
                     name:
                       description: Name refers to the name of the worker node group
                       type: string

diff --git a/controllers/factory.go b/controllers/factory.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/go-logr/logr"
 	"github.com/google/uuid"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	clusterctlv1 "sigs.k8s.io/cluster-api/cmd/clusterctl/api/v1alpha3"
 	"sigs.k8s.io/cluster-api/controllers/remote"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -579,7 +580,7 @@ func (f *Factory) withMachineHealthCheckReconciler() *Factory {
 			return nil
 		}
 
-		machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout)
+		machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy))
 
 		f.machineHealthCheckReconciler = mhcreconciler.New(
 			f.manager.GetClient(),

diff --git a/pkg/api/v1alpha1/cluster_types.go b/pkg/api/v1alpha1/cluster_types.go
@@ -8,6 +8,7 @@ import (
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 
 	"github.com/aws/eks-anywhere/pkg/logger"
@@ -296,6 +297,8 @@ type ControlPlaneConfiguration struct {
 	// CertSANs is a slice of domain names or IPs to be added as Subject Name Alternatives of the
 	// Kube API Servers Certificate.
 	CertSANs []string `json:"certSans,omitempty"`
+	// MachineHealthCheck is a control-plane level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used.
+	MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"`
 }
 
 // MachineHealthCheck allows configuring timeouts and maxUnhealthy for machine health checks. Machine Health Checks are responsible for remediating unhealthy Machines.
@@ -305,6 +308,8 @@ type MachineHealthCheck struct {
 	NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
 	// UnhealthyMachineTimeout is used to configure the unhealthy machine timeout in machine health checks. If any unhealthy conditions are met for the amount of time specified as the timeout, the machines are considered unhealthy. If not configured, the default value is set to "5m0s" (5 minutes).
 	UnhealthyMachineTimeout *metav1.Duration `json:"unhealthyMachineTimeout,omitempty"`
+	// MaxUnhealthy is used to configure the maximum number of unhealthy machines in machine health checks. This setting applies to both control plane and worker machines. If the number of unhealthy machines exceeds the limit set by maxUnhealthy, further remediation will not be performed. If not configured, the default value is set to "100%" for controlplane machines and "40%" for worker machines.
+	MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
 }
 
 func TaintsSliceEqual(s1, s2 []corev1.Taint) bool {
@@ -440,6 +445,8 @@ type WorkerNodeGroupConfiguration struct {
 	UpgradeRolloutStrategy *WorkerNodesUpgradeRolloutStrategy `json:"upgradeRolloutStrategy,omitempty"`
 	// KubernetesVersion defines the version for worker nodes. If not set, the top level spec kubernetesVersion will be used.
 	KubernetesVersion *KubernetesVersion `json:"kubernetesVersion,omitempty"`
+	// MachineHealthCheck is a worker node level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used.
+	MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"`
 }
 
 // Equal compares two WorkerNodeGroupConfigurations.

diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go