From 0e30882978d62f50d67730ab7da1dc93b9b9c387 Mon Sep 17 00:00:00 2001 From: Abhay Krishna Arunachalam Date: Wed, 10 Jan 2024 00:52:28 -0800 Subject: [PATCH] Make maxUnhealthy count configurable for control plane and worker machines --- cmd/eksctl-anywhere/cmd/constants.go | 1 + cmd/eksctl-anywhere/cmd/createcluster.go | 3 + cmd/eksctl-anywhere/cmd/options.go | 3 + cmd/eksctl-anywhere/cmd/upgradecluster.go | 4 +- .../anywhere.eks.amazonaws.com_clusters.yaml | 82 +++++++++++++ ...ywhere.eks.amazonaws.com_nodeupgrades.yaml | 2 - config/manifest/eksa-components.yaml | 113 ++++++++++++++++- controllers/factory.go | 3 +- pkg/api/v1alpha1/cluster_types.go | 7 ++ pkg/api/v1alpha1/zz_generated.deepcopy.go | 16 +++ pkg/cli/createclusterdefaulter_test.go | 7 +- pkg/cli/upgradeclusterdefaulter_test.go | 3 +- pkg/cluster/defaults.go | 30 ++++- pkg/cluster/defaults_test.go | 14 ++- pkg/clusterapi/machine_health_check.go | 39 ++++-- pkg/clusterapi/machine_health_check_test.go | 114 +++++++++++++++++- .../reconciler/reconciler_test.go | 3 +- pkg/clustermanager/cluster_manager_test.go | 11 +- pkg/config/config.go | 10 +- pkg/constants/constants.go | 2 + pkg/dependencies/factory.go | 4 +- 21 files changed, 433 insertions(+), 38 deletions(-) diff --git a/cmd/eksctl-anywhere/cmd/constants.go b/cmd/eksctl-anywhere/cmd/constants.go index 9f934eb3d9102..f3e9e12e29711 100644 --- a/cmd/eksctl-anywhere/cmd/constants.go +++ b/cmd/eksctl-anywhere/cmd/constants.go @@ -9,6 +9,7 @@ const ( unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout" nodeStartupTimeoutFlag = "node-startup-timeout" noTimeoutsFlag = "no-timeouts" + maxUnhealthyFlag = "max-unhealthy" ) type Operation int diff --git a/cmd/eksctl-anywhere/cmd/createcluster.go b/cmd/eksctl-anywhere/cmd/createcluster.go index 5f8735e8944c5..6815047ec9c52 100644 --- a/cmd/eksctl-anywhere/cmd/createcluster.go +++ b/cmd/eksctl-anywhere/cmd/createcluster.go @@ -12,6 +12,7 @@ import ( "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/awsiamauth" "github.com/aws/eks-anywhere/pkg/clustermanager" + "github.com/aws/eks-anywhere/pkg/constants" "github.com/aws/eks-anywhere/pkg/dependencies" "github.com/aws/eks-anywhere/pkg/executables" "github.com/aws/eks-anywhere/pkg/features" @@ -29,6 +30,7 @@ import ( type createClusterOptions struct { clusterOptions timeoutOptions + maxUnhealthy string forceClean bool skipIpCheck bool hardwareCSVPath string @@ -61,6 +63,7 @@ func init() { createCmd.AddCommand(createClusterCmd) applyClusterOptionFlags(createClusterCmd.Flags(), &cc.clusterOptions) applyTimeoutFlags(createClusterCmd.Flags(), &cc.timeoutOptions) + createClusterCmd.Flags().StringVar(&cc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage") applyTinkerbellHardwareFlag(createClusterCmd.Flags(), &cc.hardwareCSVPath) aflag.String(aflag.TinkerbellBootstrapIP, &cc.tinkerbellBootstrapIP, createClusterCmd.Flags()) createClusterCmd.Flags().BoolVar(&cc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster") diff --git a/cmd/eksctl-anywhere/cmd/options.go b/cmd/eksctl-anywhere/cmd/options.go index 5c2a794254a21..7089b0b06c137 100644 --- a/cmd/eksctl-anywhere/cmd/options.go +++ b/cmd/eksctl-anywhere/cmd/options.go @@ -9,6 +9,7 @@ import ( "time" "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/util/intstr" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" @@ -195,6 +196,7 @@ func 
buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC createCliConfig.NodeStartupTimeout = nodeStartupTimeout createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout + createCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy) return createCliConfig, nil } @@ -221,6 +223,7 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout + upgradeCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy) return &upgradeCliConfig, nil } diff --git a/cmd/eksctl-anywhere/cmd/upgradecluster.go b/cmd/eksctl-anywhere/cmd/upgradecluster.go index 34ce201beeaa4..553d58e3551bc 100644 --- a/cmd/eksctl-anywhere/cmd/upgradecluster.go +++ b/cmd/eksctl-anywhere/cmd/upgradecluster.go @@ -10,6 +10,7 @@ import ( "github.com/aws/eks-anywhere/cmd/eksctl-anywhere/cmd/aflag" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" + "github.com/aws/eks-anywhere/pkg/constants" "github.com/aws/eks-anywhere/pkg/dependencies" "github.com/aws/eks-anywhere/pkg/features" "github.com/aws/eks-anywhere/pkg/kubeconfig" @@ -26,6 +27,7 @@ import ( type upgradeClusterOptions struct { clusterOptions timeoutOptions + maxUnhealthy string wConfig string forceClean bool hardwareCSVPath string @@ -68,12 +70,12 @@ func init() { upgradeCmd.AddCommand(upgradeClusterCmd) applyClusterOptionFlags(upgradeClusterCmd.Flags(), &uc.clusterOptions) applyTimeoutFlags(upgradeClusterCmd.Flags(), &uc.timeoutOptions) + upgradeClusterCmd.Flags().StringVar(&uc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage") applyTinkerbellHardwareFlag(upgradeClusterCmd.Flags(), &uc.hardwareCSVPath) upgradeClusterCmd.Flags().StringVarP(&uc.wConfig, "w-config", "w", "", "Kubeconfig file to use when upgrading a workload cluster") upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster") hideForceCleanup(upgradeClusterCmd.Flags()) upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ","))) - aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name) tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC) } diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml index b586785563755..76b4b8e84ee41 100644 --- a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml +++ b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml @@ -186,6 +186,41 @@ spec: name: type: string type: object + machineHealthCheck: + description: MachineHealthCheck is a control-plane level override + for the timeouts and maxUnhealthy specified in the top-level + MHC configuration. If not configured, the defaults in the top-level + MHC configuration are used. + properties: + maxUnhealthy: + anyOf: + - type: integer + - type: string + description: MaxUnhealthy is used to configure the maximum + number of unhealthy machines in machine health checks. This + setting applies to both control plane and worker machines. + If the number of unhealthy machines exceeds the limit set + by maxUnhealthy, further remediation will not be performed. 
+                  If not configured, the default value is set to "100%".
+                x-kubernetes-int-or-string: true
+              nodeStartupTimeout:
+                description: NodeStartupTimeout is used to configure the node
+                  startup timeout in machine health checks. It determines
+                  how long a MachineHealthCheck should wait for a Node to
+                  join the cluster, before considering a Machine unhealthy.
+                  If not configured, the default value is set to "10m0s" (10
+                  minutes) for all providers. For Tinkerbell provider the
+                  default is "20m0s".
+                type: string
+              unhealthyMachineTimeout:
+                description: UnhealthyMachineTimeout is used to configure
+                  the unhealthy machine timeout in machine health checks.
+                  If any unhealthy conditions are met for the amount of time
+                  specified as the timeout, the machines are considered unhealthy.
+                  If not configured, the default value is set to "5m0s" (5
+                  minutes).
+                type: string
+            type: object
           skipLoadBalancerDeployment:
             description: SkipLoadBalancerDeployment skip deploying control
               plane load balancer. Make sure your infrastructure can handle
@@ -342,6 +377,17 @@ spec:
               to wait to remediate unhealthy machine or determine health of nodes'
               machines.
             properties:
+              maxUnhealthy:
+                anyOf:
+                - type: integer
+                - type: string
+                description: MaxUnhealthy is used to configure the maximum number
+                  of unhealthy machines in machine health checks. This setting
+                  applies to both control plane and worker machines. If the number
+                  of unhealthy machines exceeds the limit set by maxUnhealthy,
+                  further remediation will not be performed. If not configured,
+                  the default value is set to "100%".
+                x-kubernetes-int-or-string: true
               nodeStartupTimeout:
                 description: NodeStartupTimeout is used to configure the node
                   startup timeout in machine health checks. It determines how
@@ -534,6 +580,42 @@ spec:
                     name:
                       type: string
                   type: object
+                machineHealthCheck:
+                  description: MachineHealthCheck is a worker node group level
+                    override for the timeouts and maxUnhealthy specified in the
+                    top-level MHC configuration. If not configured, the defaults
+                    in the top-level MHC configuration are used.
+                  properties:
+                    maxUnhealthy:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      description: MaxUnhealthy is used to configure the maximum
+                        number of unhealthy machines in machine health checks.
+                        This setting applies to both control plane and worker
+                        machines. If the number of unhealthy machines exceeds
+                        the limit set by maxUnhealthy, further remediation will
+                        not be performed. If not configured, the default value
+                        is set to "100%".
+                      x-kubernetes-int-or-string: true
+                    nodeStartupTimeout:
+                      description: NodeStartupTimeout is used to configure the
+                        node startup timeout in machine health checks. It determines
+                        how long a MachineHealthCheck should wait for a Node to
+                        join the cluster, before considering a Machine unhealthy.
+                        If not configured, the default value is set to "10m0s"
+                        (10 minutes) for all providers. For Tinkerbell provider
+                        the default is "20m0s".
+                      type: string
+                    unhealthyMachineTimeout:
+                      description: UnhealthyMachineTimeout is used to configure
+                        the unhealthy machine timeout in machine health checks.
+                        If any unhealthy conditions are met for the amount of
+                        time specified as the timeout, the machines are considered
+                        unhealthy. If not configured, the default value is set
+                        to "5m0s" (5 minutes).
+ type: string + type: object name: description: Name refers to the name of the worker node group type: string diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml index 22edf5243746b..90ea218b0ab51 100644 --- a/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml +++ b/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml @@ -46,8 +46,6 @@ spec: nodes and ignored for worker nodes. type: boolean kubernetesVersion: - description: 'TODO(in-place): Determine if there''s a way to get these - dynamically instead of expecting it from the CRD.' type: string machine: description: Machine is a reference to the CAPI Machine that needs diff --git a/config/manifest/eksa-components.yaml b/config/manifest/eksa-components.yaml index 90e4fa38eddcc..5b86fbb080a93 100644 --- a/config/manifest/eksa-components.yaml +++ b/config/manifest/eksa-components.yaml @@ -3740,17 +3740,42 @@ spec: in the cluster properties: cilium: + description: CiliumConfig contains configuration specific + to the Cilium CNI. properties: egressMasqueradeInterfaces: description: EgressMasquaradeInterfaces determines which network interfaces are used for masquerading. Accepted values are a valid interface name or interface prefix. type: string + ipv4NativeRoutingCIDR: + description: IPv4NativeRoutingCIDR specifies the CIDR + to use when RoutingMode is set to direct. When specified, + Cilium assumes networking for this CIDR is preconfigured + and hands traffic destined for that range to the Linux + network stack without applying any SNAT. If this is + not set autoDirectNodeRoutes will be set to true + type: string + ipv6NativeRoutingCIDR: + description: IPv6NativeRoutingCIDR specifies the IPv6 + CIDR to use when RoutingMode is set to direct. When + specified, Cilium assumes networking for this CIDR is + preconfigured and hands traffic destined for that range + to the Linux network stack without applying any SNAT. + If this is not set autoDirectNodeRoutes will be set + to true + type: string policyEnforcementMode: description: PolicyEnforcementMode determines communication allowed between pods. Accepted values are default, always, never. type: string + routingMode: + description: RoutingMode indicates the routing tunnel + mode to use for Cilium. Accepted values are overlay + (geneve tunnel with overlay) or direct (tunneling disabled + with direct routing) Defaults to overlay. + type: string skipUpgrade: description: SkipUpgrade indicicates that Cilium maintenance should be skipped during upgrades. This can be used @@ -3758,6 +3783,8 @@ spec: type: boolean type: object kindnetd: + description: KindnetdConfig contains configuration specific + to the Kindnetd CNI. type: object type: object dns: @@ -3833,6 +3860,41 @@ spec: name: type: string type: object + machineHealthCheck: + description: MachineHealthCheck is a control-plane level override + for the timeouts and maxUnhealthy specified in the top-level + MHC configuration. If not configured, the defaults in the top-level + MHC configuration are used. + properties: + maxUnhealthy: + anyOf: + - type: integer + - type: string + description: MaxUnhealthy is used to configure the maximum + number of unhealthy machines in machine health checks. This + setting applies to both control plane and worker machines. + If the number of unhealthy machines exceeds the limit set + by maxUnhealthy, further remediation will not be performed. + If not configured, the default value is set to "100%". 
+                x-kubernetes-int-or-string: true
+              nodeStartupTimeout:
+                description: NodeStartupTimeout is used to configure the node
+                  startup timeout in machine health checks. It determines
+                  how long a MachineHealthCheck should wait for a Node to
+                  join the cluster, before considering a Machine unhealthy.
+                  If not configured, the default value is set to "10m0s" (10
+                  minutes) for all providers. For Tinkerbell provider the
+                  default is "20m0s".
+                type: string
+              unhealthyMachineTimeout:
+                description: UnhealthyMachineTimeout is used to configure
+                  the unhealthy machine timeout in machine health checks.
+                  If any unhealthy conditions are met for the amount of time
+                  specified as the timeout, the machines are considered unhealthy.
+                  If not configured, the default value is set to "5m0s" (5
+                  minutes).
+                type: string
+            type: object
           skipLoadBalancerDeployment:
             description: SkipLoadBalancerDeployment skip deploying control
               plane load balancer. Make sure your infrastructure can handle
@@ -3989,6 +4051,17 @@ spec:
               to wait to remediate unhealthy machine or determine health of nodes'
               machines.
             properties:
+              maxUnhealthy:
+                anyOf:
+                - type: integer
+                - type: string
+                description: MaxUnhealthy is used to configure the maximum number
+                  of unhealthy machines in machine health checks. This setting
+                  applies to both control plane and worker machines. If the number
+                  of unhealthy machines exceeds the limit set by maxUnhealthy,
+                  further remediation will not be performed. If not configured,
+                  the default value is set to "100%".
+                x-kubernetes-int-or-string: true
               nodeStartupTimeout:
                 description: NodeStartupTimeout is used to configure the node
                   startup timeout in machine health checks. It determines how
@@ -4181,6 +4254,42 @@ spec:
                     name:
                       type: string
                   type: object
+                machineHealthCheck:
+                  description: MachineHealthCheck is a worker node group level
+                    override for the timeouts and maxUnhealthy specified in the
+                    top-level MHC configuration. If not configured, the defaults
+                    in the top-level MHC configuration are used.
+                  properties:
+                    maxUnhealthy:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      description: MaxUnhealthy is used to configure the maximum
+                        number of unhealthy machines in machine health checks.
+                        This setting applies to both control plane and worker
+                        machines. If the number of unhealthy machines exceeds
+                        the limit set by maxUnhealthy, further remediation will
+                        not be performed. If not configured, the default value
+                        is set to "100%".
+                      x-kubernetes-int-or-string: true
+                    nodeStartupTimeout:
+                      description: NodeStartupTimeout is used to configure the
+                        node startup timeout in machine health checks. It determines
+                        how long a MachineHealthCheck should wait for a Node to
+                        join the cluster, before considering a Machine unhealthy.
+                        If not configured, the default value is set to "10m0s"
+                        (10 minutes) for all providers. For Tinkerbell provider
+                        the default is "20m0s".
+                      type: string
+                    unhealthyMachineTimeout:
+                      description: UnhealthyMachineTimeout is used to configure
+                        the unhealthy machine timeout in machine health checks.
+                        If any unhealthy conditions are met for the amount of
+                        time specified as the timeout, the machines are considered
+                        unhealthy. If not configured, the default value is set
+                        to "5m0s" (5 minutes).
+ type: string + type: object name: description: Name refers to the name of the worker node group type: string @@ -4440,10 +4549,6 @@ spec: upgraded: format: int64 type: integer - required: - - ready - - requireUpgrade - - upgraded type: object type: object served: true diff --git a/controllers/factory.go b/controllers/factory.go index 9905f2281521d..127c56b1eef12 100644 --- a/controllers/factory.go +++ b/controllers/factory.go @@ -5,6 +5,7 @@ import ( "github.com/go-logr/logr" "github.com/google/uuid" + "k8s.io/apimachinery/pkg/util/intstr" clusterctlv1 "sigs.k8s.io/cluster-api/cmd/clusterctl/api/v1alpha3" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -577,7 +578,7 @@ func (f *Factory) withMachineHealthCheckReconciler() *Factory { return nil } - machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) f.machineHealthCheckReconciler = mhcreconciler.New( f.manager.GetClient(), diff --git a/pkg/api/v1alpha1/cluster_types.go b/pkg/api/v1alpha1/cluster_types.go index fa0b107b42d88..5fa6266461346 100644 --- a/pkg/api/v1alpha1/cluster_types.go +++ b/pkg/api/v1alpha1/cluster_types.go @@ -8,6 +8,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/aws/eks-anywhere/pkg/logger" @@ -285,6 +286,8 @@ type ControlPlaneConfiguration struct { // CertSANs is a slice of domain names or IPs to be added as Subject Name Alternatives of the // Kube API Servers Certificate. CertSANs []string `json:"certSans,omitempty"` + // MachineHealthCheck is a control-plane level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used. + MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"` } // MachineHealthCheck allows to configure timeouts for machine health checks. Machine Health Checks are responsible for remediating unhealthy Machines. @@ -294,6 +297,8 @@ type MachineHealthCheck struct { NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"` // UnhealthyMachineTimeout is used to configure the unhealthy machine timeout in machine health checks. If any unhealthy conditions are met for the amount of time specified as the timeout, the machines are considered unhealthy. If not configured, the default value is set to "5m0s" (5 minutes). UnhealthyMachineTimeout *metav1.Duration `json:"unhealthyMachineTimeout,omitempty"` + // MaxUnhealthy is used to configure the maximum number of unhealthy machines in machine health checks. This setting applies to both control plane and worker machines. If the number of unhealthy machines exceeds the limit set by maxUnhealthy, further remediation will not be performed. If not configured, the default value is set to "100%". + MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"` } func TaintsSliceEqual(s1, s2 []corev1.Taint) bool { @@ -429,6 +434,8 @@ type WorkerNodeGroupConfiguration struct { UpgradeRolloutStrategy *WorkerNodesUpgradeRolloutStrategy `json:"upgradeRolloutStrategy,omitempty"` // KuberenetesVersion defines the version for worker nodes. 
If not set, the top level spec kubernetesVersion will be used.
 	KubernetesVersion *KubernetesVersion `json:"kubernetesVersion,omitempty"`
+	// MachineHealthCheck is a worker node group level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used.
+	MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"`
 }
 // Equal compares two WorkerNodeGroupConfigurations.
diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go
index fc30d2f05b93b..00e62967e609c 100644
--- a/pkg/api/v1alpha1/zz_generated.deepcopy.go
+++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go
@@ -24,6 +24,7 @@ import (
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"sigs.k8s.io/cluster-api/api/v1beta1"
 	apiv1beta1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
 )
@@ -872,6 +873,11 @@ func (in *ControlPlaneConfiguration) DeepCopyInto(out *ControlPlaneConfiguration
 		*out = make([]string, len(*in))
 		copy(*out, *in)
 	}
+	if in.MachineHealthCheck != nil {
+		in, out := &in.MachineHealthCheck, &out.MachineHealthCheck
+		*out = new(MachineHealthCheck)
+		(*in).DeepCopyInto(*out)
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControlPlaneConfiguration.
@@ -1707,6 +1713,11 @@ func (in *MachineHealthCheck) DeepCopyInto(out *MachineHealthCheck) {
 		*out = new(metav1.Duration)
 		**out = **in
 	}
+	if in.MaxUnhealthy != nil {
+		in, out := &in.MaxUnhealthy, &out.MaxUnhealthy
+		*out = new(intstr.IntOrString)
+		**out = **in
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineHealthCheck.
@@ -3460,6 +3471,11 @@ func (in *WorkerNodeGroupConfiguration) DeepCopyInto(out *WorkerNodeGroupConfigu
 		*out = new(KubernetesVersion)
 		**out = **in
 	}
+	if in.MachineHealthCheck != nil {
+		in, out := &in.MachineHealthCheck, &out.MachineHealthCheck
+		*out = new(MachineHealthCheck)
+		(*in).DeepCopyInto(*out)
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerNodeGroupConfiguration.
diff --git a/pkg/cli/createclusterdefaulter_test.go b/pkg/cli/createclusterdefaulter_test.go
index 8a2e5ee2de49c..6a6d8c7cc4980 100644
--- a/pkg/cli/createclusterdefaulter_test.go
+++ b/pkg/cli/createclusterdefaulter_test.go
@@ -6,6 +6,7 @@ import (
 	.
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cli" @@ -20,7 +21,7 @@ func TestNewCreateClusterDefaulter(t *testing.T) { skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(false) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) r := defaulting.NewRunner[*cluster.Spec]() r.Register( @@ -43,7 +44,7 @@ func TestRunWithoutSkipIPAnnotation(t *testing.T) { }, } skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(false) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) createClusterDefaulter := cli.NewCreateClusterDefaulter(skipIPCheck, mhcDefaulter) clusterSpec, err := createClusterDefaulter.Run(context.Background(), clusterSpec) @@ -66,7 +67,7 @@ func TestRunWithSkipIPAnnotation(t *testing.T) { } skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(true) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) createClusterDefaulter := cli.NewCreateClusterDefaulter(skipIPCheck, mhcDefaulter) clusterSpec, err := createClusterDefaulter.Run(context.Background(), clusterSpec) diff --git a/pkg/cli/upgradeclusterdefaulter_test.go b/pkg/cli/upgradeclusterdefaulter_test.go index 518726d82195f..ed000736a7a1a 100644 --- a/pkg/cli/upgradeclusterdefaulter_test.go +++ b/pkg/cli/upgradeclusterdefaulter_test.go @@ -5,6 +5,7 @@ import ( "testing" . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/util/intstr" "github.com/aws/eks-anywhere/pkg/cli" "github.com/aws/eks-anywhere/pkg/cluster" @@ -21,7 +22,7 @@ func TestRunUpgradeClusterDefaulter(t *testing.T) { Cluster: c, }, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) upgradeClusterDefaulter := cli.NewUpgradeClusterDefaulter(mhcDefaulter) clusterSpec, err := upgradeClusterDefaulter.Run(context.Background(), clusterSpec) diff --git a/pkg/cluster/defaults.go b/pkg/cluster/defaults.go index e0518c0355ded..d407a8ba25991 100644 --- a/pkg/cluster/defaults.go +++ b/pkg/cluster/defaults.go @@ -5,6 +5,7 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/constants" @@ -39,24 +40,27 @@ func (d ControlPlaneIPCheckAnnotationDefaulter) ControlPlaneIPCheckDefault(ctx c type MachineHealthCheckDefaulter struct { NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString } // NewMachineHealthCheckDefaulter allows to create a new MachineHealthCheckDefaulter. -func NewMachineHealthCheckDefaulter(nodeStartupTimeout, unhealthyMachineTimeout time.Duration) MachineHealthCheckDefaulter { +func NewMachineHealthCheckDefaulter(nodeStartupTimeout, unhealthyMachineTimeout time.Duration, maxUnhealthy intstr.IntOrString) MachineHealthCheckDefaulter { return MachineHealthCheckDefaulter{ NodeStartupTimeout: nodeStartupTimeout, UnhealthyMachineTimeout: unhealthyMachineTimeout, + MaxUnhealthy: maxUnhealthy, } } -// MachineHealthCheckDefault sets the defaults for machine health check timeouts. +// MachineHealthCheckDefault sets the defaults for machine health check timeouts and maxUnhealthy. func (d MachineHealthCheckDefaulter) MachineHealthCheckDefault(ctx context.Context, spec *Spec) (*Spec, error) { SetMachineHealthCheckTimeoutDefaults(spec.Cluster, d.NodeStartupTimeout, d.UnhealthyMachineTimeout) + SetMachineHealthCheckMaxUnhealthyDefaults(spec.Cluster, d.MaxUnhealthy) return spec, nil } -// SetMachineHealthCheckTimeoutDefaults sests defaults for mhcs in the EKSA cluster object based on the input. +// SetMachineHealthCheckTimeoutDefaults sets default timeouts for MHCs in the EKSA cluster object based on the input. func SetMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStartupTimeout, unhealthyMachineTimeout time.Duration) { if cluster.Spec.MachineHealthCheck != nil && cluster.Spec.MachineHealthCheck.NodeStartupTimeout != nil && cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout != nil { return @@ -75,6 +79,19 @@ func SetMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStart setMachineHealthCheckTimeoutDefaults(cluster, nodeStartupTimeout, unhealthyMachineTimeout) } +// SetMachineHealthCheckMaxUnhealthyDefaults sets defaults maxUnhealthy for MHCs in the EKSA cluster object based on the input. 
+func SetMachineHealthCheckMaxUnhealthyDefaults(cluster *anywherev1.Cluster, maxUnhealthy intstr.IntOrString) { + if cluster.Spec.MachineHealthCheck != nil && cluster.Spec.MachineHealthCheck.MaxUnhealthy != nil { + return + } + + if cluster.Spec.MachineHealthCheck == nil { + cluster.Spec.MachineHealthCheck = &anywherev1.MachineHealthCheck{} + } + + setMachineHealthCheckMaxUnhealthyDefaults(cluster, maxUnhealthy) +} + // setMachineHealthCheckTimeoutDefaults sets default timeout values for cluster's machine health checks. func setMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStartupTimeout, unhealthyMachineTimeout time.Duration) { if cluster.Spec.MachineHealthCheck.NodeStartupTimeout == nil { @@ -88,3 +105,10 @@ func setMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStart } } } + +// setMachineHealthCheckMaxUnhealthyDefaults sets default maxUnhealthy values for cluster's machine health checks. +func setMachineHealthCheckMaxUnhealthyDefaults(cluster *anywherev1.Cluster, maxUnhealthy intstr.IntOrString) { + if cluster.Spec.MachineHealthCheck.MaxUnhealthy == nil { + cluster.Spec.MachineHealthCheck.MaxUnhealthy = &maxUnhealthy + } +} diff --git a/pkg/cluster/defaults_test.go b/pkg/cluster/defaults_test.go index da30d4b5a7cbf..9691d4c187917 100644 --- a/pkg/cluster/defaults_test.go +++ b/pkg/cluster/defaults_test.go @@ -7,6 +7,7 @@ import ( . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" @@ -98,8 +99,9 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { g := NewWithT(t) timeout := 15 * time.Minute + maxUnhealthy := intstr.Parse("100%") - newMachineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(timeout, timeout) + newMachineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(timeout, timeout, intstr.Parse(constants.DefaultMaxUnhealthy)) c := baseCluster() machineHealthcheck := &anywherev1.MachineHealthCheck{ @@ -109,6 +111,7 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: 15 * time.Minute, }, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ @@ -126,10 +129,11 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { g := NewWithT(t) + maxUnhealthy := intstr.Parse("100%") unhealthyTimeout := metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) c := baseCluster() c.Spec.DatacenterRef.Kind = anywherev1.TinkerbellDatacenterKind @@ -138,6 +142,7 @@ func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { Duration: constants.DefaultTinkerbellNodeStartupTimeout, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ @@ -154,10 +159,11 @@ func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { g := NewWithT(t) + maxUnhealthy := intstr.Parse("100%") unhealthyTimeout := metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, } - mhcDefaulter := 
cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) c := baseCluster() c.Spec.MachineHealthCheck = &anywherev1.MachineHealthCheck{ @@ -165,6 +171,7 @@ func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { Duration: 5 * time.Minute, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ Config: &cluster.Config{ @@ -177,6 +184,7 @@ func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { Duration: 5 * time.Minute, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec, err := mhcDefaulter.MachineHealthCheckDefault(context.Background(), clusterSpec) diff --git a/pkg/clusterapi/machine_health_check.go b/pkg/clusterapi/machine_health_check.go index 3f7d210bf9cbd..711ec97d61c00 100644 --- a/pkg/clusterapi/machine_health_check.go +++ b/pkg/clusterapi/machine_health_check.go @@ -3,7 +3,6 @@ package clusterapi import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" @@ -12,9 +11,7 @@ import ( ) const ( - machineHealthCheckKind = "MachineHealthCheck" - maxUnhealthyControlPlane = "100%" - maxUnhealthyWorker = "40%" + machineHealthCheckKind = "MachineHealthCheck" ) func machineHealthCheck(clusterName string, unhealthyTimeout, nodeStartupTimeout *metav1.Duration) *clusterv1.MachineHealthCheck { @@ -51,11 +48,22 @@ func machineHealthCheck(clusterName string, unhealthyTimeout, nodeStartupTimeout // MachineHealthCheckForControlPlane creates MachineHealthCheck resources for the control plane. 
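+// Timeouts and maxUnhealthy set on the control plane configuration's machineHealthCheck
+// take precedence over the top-level machineHealthCheck values.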
func MachineHealthCheckForControlPlane(cluster *v1alpha1.Cluster) *clusterv1.MachineHealthCheck { - mhc := machineHealthCheck(ClusterName(cluster), cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout, cluster.Spec.MachineHealthCheck.NodeStartupTimeout) + unhealthyMachineTimeout := cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.UnhealthyMachineTimeout != nil { + unhealthyMachineTimeout = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.UnhealthyMachineTimeout + } + nodeStartupTimeout := cluster.Spec.MachineHealthCheck.NodeStartupTimeout + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.NodeStartupTimeout != nil { + nodeStartupTimeout = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.NodeStartupTimeout + } + mhc := machineHealthCheck(ClusterName(cluster), unhealthyMachineTimeout, nodeStartupTimeout) mhc.SetName(ControlPlaneMachineHealthCheckName(cluster)) mhc.Spec.Selector.MatchLabels[clusterv1.MachineControlPlaneLabel] = "" - maxUnhealthy := intstr.Parse(maxUnhealthyControlPlane) - mhc.Spec.MaxUnhealthy = &maxUnhealthy + maxUnhealthy := cluster.Spec.MachineHealthCheck.MaxUnhealthy + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy != nil { + maxUnhealthy = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy + } + mhc.Spec.MaxUnhealthy = maxUnhealthy return mhc } @@ -70,11 +78,22 @@ func MachineHealthCheckForWorkers(cluster *v1alpha1.Cluster) []*clusterv1.Machin } func machineHealthCheckForWorker(cluster *v1alpha1.Cluster, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration) *clusterv1.MachineHealthCheck { - mhc := machineHealthCheck(ClusterName(cluster), cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout, cluster.Spec.MachineHealthCheck.NodeStartupTimeout) + unhealthyMachineTimeout := cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.UnhealthyMachineTimeout != nil { + unhealthyMachineTimeout = workerNodeGroupConfig.MachineHealthCheck.UnhealthyMachineTimeout + } + nodeStartupTimeout := cluster.Spec.MachineHealthCheck.NodeStartupTimeout + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.NodeStartupTimeout != nil { + nodeStartupTimeout = workerNodeGroupConfig.MachineHealthCheck.NodeStartupTimeout + } + mhc := machineHealthCheck(ClusterName(cluster), unhealthyMachineTimeout, nodeStartupTimeout) mhc.SetName(WorkerMachineHealthCheckName(cluster, workerNodeGroupConfig)) mhc.Spec.Selector.MatchLabels[clusterv1.MachineDeploymentNameLabel] = MachineDeploymentName(cluster, workerNodeGroupConfig) - maxUnhealthy := intstr.Parse(maxUnhealthyWorker) - mhc.Spec.MaxUnhealthy = &maxUnhealthy + maxUnhealthy := cluster.Spec.MachineHealthCheck.MaxUnhealthy + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.MaxUnhealthy != nil { + maxUnhealthy = workerNodeGroupConfig.MachineHealthCheck.MaxUnhealthy + } + mhc.Spec.MaxUnhealthy = maxUnhealthy return mhc } diff --git a/pkg/clusterapi/machine_health_check_test.go b/pkg/clusterapi/machine_health_check_test.go index f99ee4bf342ef..a4917ea229260 100644 --- a/pkg/clusterapi/machine_health_check_test.go +++ 
b/pkg/clusterapi/machine_health_check_test.go @@ -18,9 +18,10 @@ import ( func TestMachineHealthCheckForControlPlane(t *testing.T) { timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second} + maxUnhealthy := intstr.Parse("80%") for _, timeout := range timeouts { tt := newApiBuilerTest(t) - want := expectedMachineHealthCheckForControlPlane(timeout) + want := expectedMachineHealthCheckForControlPlane(timeout, maxUnhealthy) tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ NodeStartupTimeout: &metav1.Duration{ Duration: timeout, @@ -28,14 +29,65 @@ func TestMachineHealthCheckForControlPlane(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: timeout, }, + MaxUnhealthy: &maxUnhealthy, } got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) tt.Expect(got).To(BeComparableTo(want)) } } -func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1.MachineHealthCheck { +func TestMachineHealthCheckForControlPlaneWithTimeoutOverride(t *testing.T) { + defaultTimeout := 30 * time.Minute + cpTimeout := 60 * time.Minute maxUnhealthy := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForControlPlane(cpTimeout, maxUnhealthy) + tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: cpTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: cpTimeout, + }, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + MaxUnhealthy: &maxUnhealthy, + } + got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) + tt.Expect(got).To(BeComparableTo(want)) +} + +func TestMachineHealthCheckForControlPlaneWithMaxUnhealthyOverride(t *testing.T) { + timeout := 30 * time.Minute + defaultMaxUnhealthy := intstr.Parse("40%") + cpMaxUnhealthyOverride := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForControlPlane(timeout, cpMaxUnhealthyOverride) + tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + MaxUnhealthy: &cpMaxUnhealthyOverride, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: timeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: timeout, + }, + MaxUnhealthy: &defaultMaxUnhealthy, + } + got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) + tt.Expect(got).To(BeComparableTo(want)) +} + +func expectedMachineHealthCheckForControlPlane(timeout time.Duration, maxUnhealthy intstr.IntOrString) *clusterv1.MachineHealthCheck { return &clusterv1.MachineHealthCheck{ TypeMeta: metav1.TypeMeta{ APIVersion: "cluster.x-k8s.io/v1beta1", @@ -77,11 +129,12 @@ func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1 } func TestMachineHealthCheckForWorkers(t *testing.T) { + maxUnhealthy := intstr.Parse("40%") timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second} for _, timeout := range timeouts { tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(timeout, maxUnhealthy) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} - want := 
expectedMachineHealthCheckForWorkers(timeout) tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ NodeStartupTimeout: &metav1.Duration{ Duration: timeout, @@ -89,14 +142,67 @@ func TestMachineHealthCheckForWorkers(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: timeout, }, + MaxUnhealthy: &maxUnhealthy, } got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) tt.Expect(got).To(Equal(want)) } } -func expectedMachineHealthCheckForWorkers(timeout time.Duration) []*clusterv1.MachineHealthCheck { +func TestMachineHealthCheckForWorkersWithTimeoutOverride(t *testing.T) { + defaultTimeout := 30 * time.Minute + workerTimeout := 60 * time.Minute maxUnhealthy := intstr.Parse("40%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(workerTimeout, maxUnhealthy) + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: workerTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: workerTimeout, + }, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + MaxUnhealthy: &maxUnhealthy, + } + got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) + tt.Expect(got).To(Equal(want)) +} + +func TestMachineHealthCheckForWorkersWithMaxUnhealthyOverride(t *testing.T) { + timeout := 30 * time.Minute + defaultMaxUnhealthy := intstr.Parse("40%") + workerMaxUnhealthyOverride := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(timeout, workerMaxUnhealthyOverride) + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + MaxUnhealthy: &workerMaxUnhealthyOverride, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: timeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: timeout, + }, + MaxUnhealthy: &defaultMaxUnhealthy, + } + got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) + tt.Expect(got).To(Equal(want)) +} + +func expectedMachineHealthCheckForWorkers(timeout time.Duration, maxUnhealthy intstr.IntOrString) []*clusterv1.MachineHealthCheck { return []*clusterv1.MachineHealthCheck{ { TypeMeta: metav1.TypeMeta{ diff --git a/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go b/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go index 9164c2c5dec0f..085fcdd5c9f63 100644 --- a/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go +++ b/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go @@ -8,6 +8,7 @@ import ( "github.com/go-logr/logr" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -52,7 +53,7 @@ func (tt *reconcilerTest) withFakeClient() { } func newReconciler(t testing.TB) *reconcilerTest { - mhcDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) bundle := test.Bundle() version := test.DevEksaVersion() diff --git a/pkg/clustermanager/cluster_manager_test.go b/pkg/clustermanager/cluster_manager_test.go index 9f35826273fb4..b135a2db741d9 100644 --- a/pkg/clustermanager/cluster_manager_test.go +++ b/pkg/clustermanager/cluster_manager_test.go @@ -13,6 +13,7 @@ import ( . "github.com/onsi/gomega" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" @@ -1874,7 +1875,7 @@ metadata: namespace: eksa-system spec: clusterName: fluxTestCluster - maxUnhealthy: 40%% + maxUnhealthy: 100%% nodeStartupTimeout: %[2]s selector: matchLabels: @@ -1926,6 +1927,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { ctx := context.Background() tt := newTest(t) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, @@ -1933,6 +1935,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: constants.DefaultNodeStartupTimeout, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(constants.DefaultUnhealthyMachineTimeout, constants.DefaultNodeStartupTimeout) tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC) @@ -1945,6 +1948,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) { ctx := context.Background() tt := newTest(t) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: (30 * time.Minute), @@ -1952,6 +1956,7 @@ func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: (30 * time.Minute), }, + MaxUnhealthy: &maxUnhealthy, } tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" wantMHC := expectedMachineHealthCheck(30*time.Minute, 30*time.Minute) @@ -1966,6 +1971,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { tt := newTest(t) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" maxTime := time.Duration(math.MaxInt64) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: maxTime, @@ -1973,6 +1979,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: maxTime, }, 
+ MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(maxTime, maxTime) @@ -1984,6 +1991,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { func TestInstallMachineHealthChecksApplyError(t *testing.T) { ctx := context.Background() tt := newTest(t, clustermanager.WithRetrier(retrier.NewWithMaxRetries(2, 0))) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ @@ -1992,6 +2000,7 @@ func TestInstallMachineHealthChecksApplyError(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: constants.DefaultNodeStartupTimeout, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout, clustermanager.DefaultNodeStartupTimeout) tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC).Return(errors.New("apply error")).MaxTimes(2) diff --git a/pkg/config/config.go b/pkg/config/config.go index 5efcb872bbcfc..a5eb7bb49e389 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -1,6 +1,10 @@ package config -import "time" +import ( + "time" + + "k8s.io/apimachinery/pkg/util/intstr" +) const ( EksaGitPassphraseTokenEnv = "EKSA_GIT_SSH_KEY_PASSPHRASE" @@ -26,10 +30,12 @@ type CreateClusterCLIConfig struct { SkipCPIPCheck bool NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString } -// UpgradeClusterCLIConfig is the config we use for create cluster specific configurations. +// UpgradeClusterCLIConfig is the config we use for upgrade cluster specific configurations. type UpgradeClusterCLIConfig struct { NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString } diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 605aeff0cfcfc..1d4ca506953e4 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -90,6 +90,8 @@ const ( DefaultNodeStartupTimeout = 10 * time.Minute // DefaultTinkerbellNodeStartupTimeout is the default node start up timeout for Tinkerbell. DefaultTinkerbellNodeStartupTimeout = 20 * time.Minute + // DefaultMaxUnhealthy is the default maxUnhealthy value for machine health checks. 
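+	// With "100%", remediation is never short-circuited by the number of machines that are currently unhealthy.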
+	DefaultMaxUnhealthy = "100%"
 )
 type Operation int
diff --git a/pkg/dependencies/factory.go b/pkg/dependencies/factory.go
index 1302fac4c7b9e..6a03a787a8764 100644
--- a/pkg/dependencies/factory.go
+++ b/pkg/dependencies/factory.go
@@ -1082,7 +1082,7 @@ func (f *Factory) WithCliConfig(cliConfig *cliconfig.CliConfig) *Factory {
 func (f *Factory) WithCreateClusterDefaulter(createCliConfig *cliconfig.CreateClusterCLIConfig) *Factory {
 	f.buildSteps = append(f.buildSteps, func(ctx context.Context) error {
 		controlPlaneIPCheckAnnotationDefaulter := cluster.NewControlPlaneIPCheckAnnotationDefaulter(createCliConfig.SkipCPIPCheck)
-		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(createCliConfig.NodeStartupTimeout, createCliConfig.UnhealthyMachineTimeout)
+		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(createCliConfig.NodeStartupTimeout, createCliConfig.UnhealthyMachineTimeout, createCliConfig.MaxUnhealthy)
 		createClusterDefaulter := cli.NewCreateClusterDefaulter(controlPlaneIPCheckAnnotationDefaulter, machineHealthCheckDefaulter)
@@ -1097,7 +1097,7 @@
-// WithUpgradeClusterDefaulter builds a create cluster defaulter that builds defaulter dependencies specific to the create cluster command. The defaulter is then run once the factory is built in the create cluster command.
+// WithUpgradeClusterDefaulter builds an upgrade cluster defaulter that builds defaulter dependencies specific to the upgrade cluster command. The defaulter is then run once the factory is built in the upgrade cluster command.
 func (f *Factory) WithUpgradeClusterDefaulter(upgradeCliConfig *cliconfig.UpgradeClusterCLIConfig) *Factory {
 	f.buildSteps = append(f.buildSteps, func(ctx context.Context) error {
-		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(upgradeCliConfig.NodeStartupTimeout, upgradeCliConfig.UnhealthyMachineTimeout)
+		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(upgradeCliConfig.NodeStartupTimeout, upgradeCliConfig.UnhealthyMachineTimeout, upgradeCliConfig.MaxUnhealthy)
 		upgradeClusterDefaulter := cli.NewUpgradeClusterDefaulter(machineHealthCheckDefaulter)
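---

Usage sketch (not part of the diff; names and values below are illustrative,
but the field paths follow the CRD changes above). maxUnhealthy accepts an
absolute count or a percentage, can be set once under spec.machineHealthCheck,
and can be overridden per control plane or per worker node group:

  apiVersion: anywhere.eks.amazonaws.com/v1alpha1
  kind: Cluster
  metadata:
    name: example-cluster
  spec:
    machineHealthCheck:
      maxUnhealthy: "100%"
      nodeStartupTimeout: 10m0s
      unhealthyMachineTimeout: 5m0s
    controlPlaneConfiguration:
      machineHealthCheck:
        maxUnhealthy: "100%"
    workerNodeGroupConfigurations:
    - name: md-0
      machineHealthCheck:
        maxUnhealthy: "40%"

The CLI default can likewise be overridden at create or upgrade time with the
new flag, e.g. eksctl anywhere create cluster -f cluster.yaml --max-unhealthy 40%.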