From e540c2ee6cadaa3e632326af18853948be00ab43 Mon Sep 17 00:00:00 2001 From: Abhay Krishna Arunachalam Date: Wed, 10 Jan 2024 00:52:28 -0800 Subject: [PATCH] Make maxUnhealthy count configurable for control plane and worker machines --- cmd/eksctl-anywhere/cmd/options.go | 5 + cmd/eksctl-anywhere/cmd/upgradecluster.go | 1 - .../anywhere.eks.amazonaws.com_clusters.yaml | 85 +++++++++++++ config/manifest/eksa-components.yaml | 85 +++++++++++++ controllers/factory.go | 3 +- pkg/api/v1alpha1/cluster_types.go | 7 ++ pkg/api/v1alpha1/zz_generated.deepcopy.go | 16 +++ pkg/cli/createclusterdefaulter_test.go | 7 +- pkg/cli/upgradeclusterdefaulter_test.go | 3 +- pkg/cluster/defaults.go | 56 ++++++++- pkg/cluster/defaults_test.go | 14 ++- pkg/clusterapi/machine_health_check.go | 39 ++++-- pkg/clusterapi/machine_health_check_test.go | 114 +++++++++++++++++- .../reconciler/reconciler_test.go | 3 +- pkg/clustermanager/cluster_manager_test.go | 11 +- pkg/config/config.go | 12 +- pkg/constants/constants.go | 4 + pkg/dependencies/factory.go | 4 +- 18 files changed, 437 insertions(+), 32 deletions(-) diff --git a/cmd/eksctl-anywhere/cmd/options.go b/cmd/eksctl-anywhere/cmd/options.go index bc33006d8ff1..a2d060324c87 100644 --- a/cmd/eksctl-anywhere/cmd/options.go +++ b/cmd/eksctl-anywhere/cmd/options.go @@ -9,6 +9,7 @@ import ( "time" "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/util/intstr" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" @@ -195,6 +196,8 @@ func buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC createCliConfig.NodeStartupTimeout = nodeStartupTimeout createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout + createCliConfig.MaxUnhealthy = intstr.Parse(constants.DefaultMaxUnhealthy) + createCliConfig.WorkerMaxUnhealthy = intstr.Parse(constants.DefaultWorkerMaxUnhealthy) return createCliConfig, nil } @@ -221,6 +224,8 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout + upgradeCliConfig.MaxUnhealthy = intstr.Parse(constants.DefaultMaxUnhealthy) + upgradeCliConfig.WorkerMaxUnhealthy = intstr.Parse(constants.DefaultWorkerMaxUnhealthy) return &upgradeCliConfig, nil } diff --git a/cmd/eksctl-anywhere/cmd/upgradecluster.go b/cmd/eksctl-anywhere/cmd/upgradecluster.go index 558ed28ce033..0f6005448563 100644 --- a/cmd/eksctl-anywhere/cmd/upgradecluster.go +++ b/cmd/eksctl-anywhere/cmd/upgradecluster.go @@ -71,7 +71,6 @@ func init() { upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster") hideForceCleanup(upgradeClusterCmd.Flags()) upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. 
Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ",")))
-	aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name)
 	tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC)
 }
diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
index 9de93015c421..ccec1b0fee9e 100644
--- a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
+++ b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
@@ -186,6 +186,42 @@ spec:
               name:
                 type: string
             type: object
+          machineHealthCheck:
+            description: MachineHealthCheck is a control-plane level override
+              for the timeouts and maxUnhealthy specified in the top-level
+              MHC configuration. If not configured, the defaults in the top-level
+              MHC configuration are used.
+            properties:
+              maxUnhealthy:
+                anyOf:
+                - type: integer
+                - type: string
+                description: MaxUnhealthy is used to configure the maximum
+                  number of unhealthy machines in machine health checks. This
+                  setting applies to both control plane and worker machines.
+                  If the number of unhealthy machines exceeds the limit set
+                  by maxUnhealthy, further remediation will not be performed.
+                  If not configured, the default value is set to "100%" for
+                  controlplane machines and "40%" for worker machines.
+                x-kubernetes-int-or-string: true
+              nodeStartupTimeout:
+                description: NodeStartupTimeout is used to configure the node
+                  startup timeout in machine health checks. It determines
+                  how long a MachineHealthCheck should wait for a Node to
+                  join the cluster, before considering a Machine unhealthy.
+                  If not configured, the default value is set to "10m0s" (10
+                  minutes) for all providers. For Tinkerbell provider the
+                  default is "20m0s".
+                type: string
+              unhealthyMachineTimeout:
+                description: UnhealthyMachineTimeout is used to configure
+                  the unhealthy machine timeout in machine health checks.
+                  If any unhealthy conditions are met for the amount of time
+                  specified as the timeout, the machines are considered unhealthy.
+                  If not configured, the default value is set to "5m0s" (5
+                  minutes).
+                type: string
+            type: object
           skipLoadBalancerDeployment:
             description: SkipLoadBalancerDeployment skip deploying control plane
               load balancer. Make sure your infrastructure can handle
@@ -344,6 +380,18 @@
           to wait to remediate unhealthy machine or determine health of
           nodes' machines.
         properties:
+          maxUnhealthy:
+            anyOf:
+            - type: integer
+            - type: string
+            description: MaxUnhealthy is used to configure the maximum number
+              of unhealthy machines in machine health checks. This setting
+              applies to both control plane and worker machines. If the number
+              of unhealthy machines exceeds the limit set by maxUnhealthy,
+              further remediation will not be performed. If not configured,
+              the default value is set to "100%" for controlplane machines
+              and "40%" for worker machines.
+            x-kubernetes-int-or-string: true
           nodeStartupTimeout:
             description: NodeStartupTimeout is used to configure the node
               startup timeout in machine health checks. It determines how
@@ -536,6 +584,43 @@
                   name:
                     type: string
                 type: object
+              machineHealthCheck:
+                description: MachineHealthCheck is a worker node level override
+                  for the timeouts and maxUnhealthy specified in the top-level
+                  MHC configuration. If not configured, the defaults in the
+                  top-level MHC configuration are used.
+ properties: + maxUnhealthy: + anyOf: + - type: integer + - type: string + description: MaxUnhealthy is used to configure the maximum + number of unhealthy machines in machine health checks. + This setting applies to both control plane and worker + machines. If the number of unhealthy machines exceeds + the limit set by maxUnhealthy, further remediation will + not be performed. If not configured, the default value + is set to "100%" for controlplane machines and "40%" for + worker machines. + x-kubernetes-int-or-string: true + nodeStartupTimeout: + description: NodeStartupTimeout is used to configure the + node startup timeout in machine health checks. It determines + how long a MachineHealthCheck should wait for a Node to + join the cluster, before considering a Machine unhealthy. + If not configured, the default value is set to "10m0s" + (10 minutes) for all providers. For Tinkerbell provider + the default is "20m0s". + type: string + unhealthyMachineTimeout: + description: UnhealthyMachineTimeout is used to configure + the unhealthy machine timeout in machine health checks. + If any unhealthy conditions are met for the amount of + time specified as the timeout, the machines are considered + unhealthy. If not configured, the default value is set + to "5m0s" (5 minutes). + type: string + type: object name: description: Name refers to the name of the worker node group type: string diff --git a/config/manifest/eksa-components.yaml b/config/manifest/eksa-components.yaml index 683adfa495e8..006957c55f35 100644 --- a/config/manifest/eksa-components.yaml +++ b/config/manifest/eksa-components.yaml @@ -3889,6 +3889,42 @@ spec: name: type: string type: object + machineHealthCheck: + description: MachineHealthCheck is a control-plane level override + for the timeouts and maxUnhealthy specified in the top-level + MHC configuration. If not configured, the defaults in the top-level + MHC configuration are used. + properties: + maxUnhealthy: + anyOf: + - type: integer + - type: string + description: MaxUnhealthy is used to configure the maximum + number of unhealthy machines in machine health checks. This + setting applies to both control plane and worker machines. + If the number of unhealthy machines exceeds the limit set + by maxUnhealthy, further remediation will not be performed. + If not configured, the default value is set to "100%" for + controlplane machines and "40%" for worker machines. + x-kubernetes-int-or-string: true + nodeStartupTimeout: + description: NodeStartupTimeout is used to configure the node + startup timeout in machine health checks. It determines + how long a MachineHealthCheck should wait for a Node to + join the cluster, before considering a Machine unhealthy. + If not configured, the default value is set to "10m0s" (10 + minutes) for all providers. For Tinkerbell provider the + default is "20m0s". + type: string + unhealthyMachineTimeout: + description: UnhealthyMachineTimeout is used to configure + the unhealthy machine timeout in machine health checks. + If any unhealthy conditions are met for the amount of time + specified as the timeout, the machines are considered unhealthy. + If not configured, the default value is set to "5m0s" (5 + minutes). + type: string + type: object skipLoadBalancerDeployment: description: SkipLoadBalancerDeployment skip deploying control plane load balancer. Make sure your infrastructure can handle @@ -4047,6 +4083,18 @@ spec: to wait to remediate unhealthy machine or determine health of nodes' machines. 
        properties:
+          maxUnhealthy:
+            anyOf:
+            - type: integer
+            - type: string
+            description: MaxUnhealthy is used to configure the maximum number
+              of unhealthy machines in machine health checks. This setting
+              applies to both control plane and worker machines. If the number
+              of unhealthy machines exceeds the limit set by maxUnhealthy,
+              further remediation will not be performed. If not configured,
+              the default value is set to "100%" for controlplane machines
+              and "40%" for worker machines.
+            x-kubernetes-int-or-string: true
           nodeStartupTimeout:
             description: NodeStartupTimeout is used to configure the node
               startup timeout in machine health checks. It determines how
@@ -4239,6 +4287,43 @@
                   name:
                     type: string
                 type: object
+              machineHealthCheck:
+                description: MachineHealthCheck is a worker node level override
+                  for the timeouts and maxUnhealthy specified in the top-level
+                  MHC configuration. If not configured, the defaults in the
+                  top-level MHC configuration are used.
+                properties:
+                  maxUnhealthy:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    description: MaxUnhealthy is used to configure the maximum
+                      number of unhealthy machines in machine health checks.
+                      This setting applies to both control plane and worker
+                      machines. If the number of unhealthy machines exceeds
+                      the limit set by maxUnhealthy, further remediation will
+                      not be performed. If not configured, the default value
+                      is set to "100%" for controlplane machines and "40%" for
+                      worker machines.
+                    x-kubernetes-int-or-string: true
+                  nodeStartupTimeout:
+                    description: NodeStartupTimeout is used to configure the
+                      node startup timeout in machine health checks. It determines
+                      how long a MachineHealthCheck should wait for a Node to
+                      join the cluster, before considering a Machine unhealthy.
+                      If not configured, the default value is set to "10m0s"
+                      (10 minutes) for all providers. For Tinkerbell provider
+                      the default is "20m0s".
+                    type: string
+                  unhealthyMachineTimeout:
+                    description: UnhealthyMachineTimeout is used to configure
+                      the unhealthy machine timeout in machine health checks.
+                      If any unhealthy conditions are met for the amount of
+                      time specified as the timeout, the machines are considered
+                      unhealthy. If not configured, the default value is set
+                      to "5m0s" (5 minutes).
+ type: string + type: object name: description: Name refers to the name of the worker node group type: string diff --git a/controllers/factory.go b/controllers/factory.go index 45fc68ecffa6..c1fc03ec045a 100644 --- a/controllers/factory.go +++ b/controllers/factory.go @@ -5,6 +5,7 @@ import ( "github.com/go-logr/logr" "github.com/google/uuid" + "k8s.io/apimachinery/pkg/util/intstr" clusterctlv1 "sigs.k8s.io/cluster-api/cmd/clusterctl/api/v1alpha3" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -579,7 +580,7 @@ func (f *Factory) withMachineHealthCheckReconciler() *Factory { return nil } - machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) f.machineHealthCheckReconciler = mhcreconciler.New( f.manager.GetClient(), diff --git a/pkg/api/v1alpha1/cluster_types.go b/pkg/api/v1alpha1/cluster_types.go index 4412e2eb633c..6781587ea342 100644 --- a/pkg/api/v1alpha1/cluster_types.go +++ b/pkg/api/v1alpha1/cluster_types.go @@ -8,6 +8,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/aws/eks-anywhere/pkg/logger" @@ -296,6 +297,8 @@ type ControlPlaneConfiguration struct { // CertSANs is a slice of domain names or IPs to be added as Subject Name Alternatives of the // Kube API Servers Certificate. CertSANs []string `json:"certSans,omitempty"` + // MachineHealthCheck is a control-plane level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used. + MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"` } // MachineHealthCheck allows to configure timeouts for machine health checks. Machine Health Checks are responsible for remediating unhealthy Machines. @@ -305,6 +308,8 @@ type MachineHealthCheck struct { NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"` // UnhealthyMachineTimeout is used to configure the unhealthy machine timeout in machine health checks. If any unhealthy conditions are met for the amount of time specified as the timeout, the machines are considered unhealthy. If not configured, the default value is set to "5m0s" (5 minutes). UnhealthyMachineTimeout *metav1.Duration `json:"unhealthyMachineTimeout,omitempty"` + // MaxUnhealthy is used to configure the maximum number of unhealthy machines in machine health checks. This setting applies to both control plane and worker machines. If the number of unhealthy machines exceeds the limit set by maxUnhealthy, further remediation will not be performed. If not configured, the default value is set to "100%" for controlplane machines and "40%" for worker machines. + MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"` } func TaintsSliceEqual(s1, s2 []corev1.Taint) bool { @@ -440,6 +445,8 @@ type WorkerNodeGroupConfiguration struct { UpgradeRolloutStrategy *WorkerNodesUpgradeRolloutStrategy `json:"upgradeRolloutStrategy,omitempty"` // KuberenetesVersion defines the version for worker nodes. If not set, the top level spec kubernetesVersion will be used. 
KubernetesVersion *KubernetesVersion `json:"kubernetesVersion,omitempty"` + // MachineHealthCheck is a worker node level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used. + MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"` } // Equal compares two WorkerNodeGroupConfigurations. diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index 8bd39b01613a..6104d7cacd9b 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -24,6 +24,7 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/cluster-api/api/v1beta1" apiv1beta1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" ) @@ -872,6 +873,11 @@ func (in *ControlPlaneConfiguration) DeepCopyInto(out *ControlPlaneConfiguration *out = make([]string, len(*in)) copy(*out, *in) } + if in.MachineHealthCheck != nil { + in, out := &in.MachineHealthCheck, &out.MachineHealthCheck + *out = new(MachineHealthCheck) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControlPlaneConfiguration. @@ -1699,6 +1705,11 @@ func (in *MachineHealthCheck) DeepCopyInto(out *MachineHealthCheck) { *out = new(metav1.Duration) **out = **in } + if in.MaxUnhealthy != nil { + in, out := &in.MaxUnhealthy, &out.MaxUnhealthy + *out = new(intstr.IntOrString) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineHealthCheck. @@ -3447,6 +3458,11 @@ func (in *WorkerNodeGroupConfiguration) DeepCopyInto(out *WorkerNodeGroupConfigu *out = new(KubernetesVersion) **out = **in } + if in.MachineHealthCheck != nil { + in, out := &in.MachineHealthCheck, &out.MachineHealthCheck + *out = new(MachineHealthCheck) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerNodeGroupConfiguration. diff --git a/pkg/cli/createclusterdefaulter_test.go b/pkg/cli/createclusterdefaulter_test.go index 8a2e5ee2de49..3d0012a682f6 100644 --- a/pkg/cli/createclusterdefaulter_test.go +++ b/pkg/cli/createclusterdefaulter_test.go @@ -6,6 +6,7 @@ import ( . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cli" @@ -20,7 +21,7 @@ func TestNewCreateClusterDefaulter(t *testing.T) { skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(false) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) r := defaulting.NewRunner[*cluster.Spec]() r.Register( @@ -43,7 +44,7 @@ func TestRunWithoutSkipIPAnnotation(t *testing.T) { }, } skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(false) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) createClusterDefaulter := cli.NewCreateClusterDefaulter(skipIPCheck, mhcDefaulter) clusterSpec, err := createClusterDefaulter.Run(context.Background(), clusterSpec) @@ -66,7 +67,7 @@ func TestRunWithSkipIPAnnotation(t *testing.T) { } skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(true) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) createClusterDefaulter := cli.NewCreateClusterDefaulter(skipIPCheck, mhcDefaulter) clusterSpec, err := createClusterDefaulter.Run(context.Background(), clusterSpec) diff --git a/pkg/cli/upgradeclusterdefaulter_test.go b/pkg/cli/upgradeclusterdefaulter_test.go index 518726d82195..0341d779fd9a 100644 --- a/pkg/cli/upgradeclusterdefaulter_test.go +++ b/pkg/cli/upgradeclusterdefaulter_test.go @@ -5,6 +5,7 @@ import ( "testing" . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/util/intstr" "github.com/aws/eks-anywhere/pkg/cli" "github.com/aws/eks-anywhere/pkg/cluster" @@ -21,7 +22,7 @@ func TestRunUpgradeClusterDefaulter(t *testing.T) { Cluster: c, }, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) upgradeClusterDefaulter := cli.NewUpgradeClusterDefaulter(mhcDefaulter) clusterSpec, err := upgradeClusterDefaulter.Run(context.Background(), clusterSpec) diff --git a/pkg/cluster/defaults.go b/pkg/cluster/defaults.go index 008807b5e261..614822006f6b 100644 --- a/pkg/cluster/defaults.go +++ b/pkg/cluster/defaults.go @@ -5,6 +5,7 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/constants" @@ -39,24 +40,29 @@ func (d ControlPlaneIPCheckAnnotationDefaulter) ControlPlaneIPCheckDefault(ctx c type MachineHealthCheckDefaulter struct { NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString + WorkerMaxUnhealthy intstr.IntOrString } // NewMachineHealthCheckDefaulter allows to create a new MachineHealthCheckDefaulter. -func NewMachineHealthCheckDefaulter(nodeStartupTimeout, unhealthyMachineTimeout time.Duration) MachineHealthCheckDefaulter { +func NewMachineHealthCheckDefaulter(nodeStartupTimeout, unhealthyMachineTimeout time.Duration, globalMaxUnhealthy, workerMaxUnhealthy intstr.IntOrString) MachineHealthCheckDefaulter { return MachineHealthCheckDefaulter{ NodeStartupTimeout: nodeStartupTimeout, UnhealthyMachineTimeout: unhealthyMachineTimeout, + MaxUnhealthy: globalMaxUnhealthy, + WorkerMaxUnhealthy: workerMaxUnhealthy, } } -// MachineHealthCheckDefault sets the defaults for machine health check timeouts. +// MachineHealthCheckDefault sets the defaults for machine health check timeouts and maxUnhealthy. func (d MachineHealthCheckDefaulter) MachineHealthCheckDefault(ctx context.Context, spec *Spec) (*Spec, error) { SetMachineHealthCheckTimeoutDefaults(spec.Cluster, d.NodeStartupTimeout, d.UnhealthyMachineTimeout) + SetMachineHealthCheckMaxUnhealthyDefaults(spec.Cluster, d.MaxUnhealthy, d.WorkerMaxUnhealthy) return spec, nil } -// SetMachineHealthCheckTimeoutDefaults sests defaults for mhcs in the EKSA cluster object based on the input. +// SetMachineHealthCheckTimeoutDefaults sets default timeouts for MHCs in the EKSA cluster object based on the input. func SetMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStartupTimeout, unhealthyMachineTimeout time.Duration) { if cluster.Spec.MachineHealthCheck != nil && cluster.Spec.MachineHealthCheck.NodeStartupTimeout != nil && cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout != nil { return @@ -75,6 +81,19 @@ func SetMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStart setMachineHealthCheckTimeoutDefaults(cluster, nodeStartupTimeout, unhealthyMachineTimeout) } +// SetMachineHealthCheckMaxUnhealthyDefaults sets defaults maxUnhealthy for MHCs in the EKSA cluster object based on the input. 
+func SetMachineHealthCheckMaxUnhealthyDefaults(cluster *anywherev1.Cluster, globalMaxUnhealthy, workerMaxUnhealthy intstr.IntOrString) { + if cluster.Spec.MachineHealthCheck != nil && cluster.Spec.MachineHealthCheck.MaxUnhealthy != nil { + return + } + + if cluster.Spec.MachineHealthCheck == nil { + cluster.Spec.MachineHealthCheck = &anywherev1.MachineHealthCheck{} + } + + setMachineHealthCheckMaxUnhealthyDefaults(cluster, globalMaxUnhealthy, workerMaxUnhealthy) +} + // setMachineHealthCheckTimeoutDefaults sets default timeout values for cluster's machine health checks. func setMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStartupTimeout, unhealthyMachineTimeout time.Duration) { if cluster.Spec.MachineHealthCheck.NodeStartupTimeout == nil { @@ -89,6 +108,37 @@ func setMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStart } } +// setMachineHealthCheckMaxUnhealthyDefaults sets default maxUnhealthy values for cluster's machine health checks. +func setMachineHealthCheckMaxUnhealthyDefaults(cluster *anywherev1.Cluster, globalMaxUnhealthy, workerMaxUnhealthy intstr.IntOrString) { + topLevelMaxUnhealthyUndefined := true + if cluster.Spec.MachineHealthCheck.MaxUnhealthy == nil { + cluster.Spec.MachineHealthCheck.MaxUnhealthy = &globalMaxUnhealthy + } else { + topLevelMaxUnhealthyUndefined = false + } + + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck == nil { + cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &anywherev1.MachineHealthCheck{} + } + + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy == nil { + cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy = &globalMaxUnhealthy + } + + for i := range cluster.Spec.WorkerNodeGroupConfigurations { + if cluster.Spec.WorkerNodeGroupConfigurations[i].MachineHealthCheck == nil { + cluster.Spec.WorkerNodeGroupConfigurations[i].MachineHealthCheck = &anywherev1.MachineHealthCheck{} + } + if cluster.Spec.WorkerNodeGroupConfigurations[i].MachineHealthCheck.MaxUnhealthy == nil { + if topLevelMaxUnhealthyUndefined { + cluster.Spec.WorkerNodeGroupConfigurations[i].MachineHealthCheck.MaxUnhealthy = &workerMaxUnhealthy + } else { + cluster.Spec.WorkerNodeGroupConfigurations[i].MachineHealthCheck.MaxUnhealthy = cluster.Spec.MachineHealthCheck.MaxUnhealthy + } + } + } +} + // NamespaceDefaulter is the defaulter created to configure the cluster's namespace. type NamespaceDefaulter struct { defaultClusterNamespace string diff --git a/pkg/cluster/defaults_test.go b/pkg/cluster/defaults_test.go index 81b30edad1ec..d0d64440add1 100644 --- a/pkg/cluster/defaults_test.go +++ b/pkg/cluster/defaults_test.go @@ -7,6 +7,7 @@ import ( . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" @@ -98,8 +99,9 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { g := NewWithT(t) timeout := 15 * time.Minute + maxUnhealthy := intstr.Parse("100%") - newMachineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(timeout, timeout) + newMachineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(timeout, timeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) c := baseCluster() machineHealthcheck := &anywherev1.MachineHealthCheck{ @@ -109,6 +111,7 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: 15 * time.Minute, }, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ @@ -126,10 +129,11 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { g := NewWithT(t) + maxUnhealthy := intstr.Parse("100%") unhealthyTimeout := metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) c := baseCluster() c.Spec.DatacenterRef.Kind = anywherev1.TinkerbellDatacenterKind @@ -138,6 +142,7 @@ func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { Duration: constants.DefaultTinkerbellNodeStartupTimeout, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ @@ -154,10 +159,11 @@ func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { g := NewWithT(t) + maxUnhealthy := intstr.Parse("100%") unhealthyTimeout := metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) c := baseCluster() c.Spec.MachineHealthCheck = &anywherev1.MachineHealthCheck{ @@ -165,6 +171,7 @@ func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { Duration: 5 * time.Minute, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ Config: &cluster.Config{ @@ -177,6 +184,7 @@ func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { Duration: 5 * time.Minute, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec, err := mhcDefaulter.MachineHealthCheckDefault(context.Background(), clusterSpec) diff --git a/pkg/clusterapi/machine_health_check.go b/pkg/clusterapi/machine_health_check.go index 3f7d210bf9cb..711ec97d61c0 100644 --- a/pkg/clusterapi/machine_health_check.go +++ b/pkg/clusterapi/machine_health_check.go @@ -3,7 +3,6 @@ package clusterapi import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - 
"k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" @@ -12,9 +11,7 @@ import ( ) const ( - machineHealthCheckKind = "MachineHealthCheck" - maxUnhealthyControlPlane = "100%" - maxUnhealthyWorker = "40%" + machineHealthCheckKind = "MachineHealthCheck" ) func machineHealthCheck(clusterName string, unhealthyTimeout, nodeStartupTimeout *metav1.Duration) *clusterv1.MachineHealthCheck { @@ -51,11 +48,22 @@ func machineHealthCheck(clusterName string, unhealthyTimeout, nodeStartupTimeout // MachineHealthCheckForControlPlane creates MachineHealthCheck resources for the control plane. func MachineHealthCheckForControlPlane(cluster *v1alpha1.Cluster) *clusterv1.MachineHealthCheck { - mhc := machineHealthCheck(ClusterName(cluster), cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout, cluster.Spec.MachineHealthCheck.NodeStartupTimeout) + unhealthyMachineTimeout := cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.UnhealthyMachineTimeout != nil { + unhealthyMachineTimeout = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.UnhealthyMachineTimeout + } + nodeStartupTimeout := cluster.Spec.MachineHealthCheck.NodeStartupTimeout + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.NodeStartupTimeout != nil { + nodeStartupTimeout = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.NodeStartupTimeout + } + mhc := machineHealthCheck(ClusterName(cluster), unhealthyMachineTimeout, nodeStartupTimeout) mhc.SetName(ControlPlaneMachineHealthCheckName(cluster)) mhc.Spec.Selector.MatchLabels[clusterv1.MachineControlPlaneLabel] = "" - maxUnhealthy := intstr.Parse(maxUnhealthyControlPlane) - mhc.Spec.MaxUnhealthy = &maxUnhealthy + maxUnhealthy := cluster.Spec.MachineHealthCheck.MaxUnhealthy + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy != nil { + maxUnhealthy = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy + } + mhc.Spec.MaxUnhealthy = maxUnhealthy return mhc } @@ -70,11 +78,22 @@ func MachineHealthCheckForWorkers(cluster *v1alpha1.Cluster) []*clusterv1.Machin } func machineHealthCheckForWorker(cluster *v1alpha1.Cluster, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration) *clusterv1.MachineHealthCheck { - mhc := machineHealthCheck(ClusterName(cluster), cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout, cluster.Spec.MachineHealthCheck.NodeStartupTimeout) + unhealthyMachineTimeout := cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.UnhealthyMachineTimeout != nil { + unhealthyMachineTimeout = workerNodeGroupConfig.MachineHealthCheck.UnhealthyMachineTimeout + } + nodeStartupTimeout := cluster.Spec.MachineHealthCheck.NodeStartupTimeout + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.NodeStartupTimeout != nil { + nodeStartupTimeout = workerNodeGroupConfig.MachineHealthCheck.NodeStartupTimeout + } + mhc := machineHealthCheck(ClusterName(cluster), unhealthyMachineTimeout, nodeStartupTimeout) mhc.SetName(WorkerMachineHealthCheckName(cluster, workerNodeGroupConfig)) mhc.Spec.Selector.MatchLabels[clusterv1.MachineDeploymentNameLabel] = 
MachineDeploymentName(cluster, workerNodeGroupConfig) - maxUnhealthy := intstr.Parse(maxUnhealthyWorker) - mhc.Spec.MaxUnhealthy = &maxUnhealthy + maxUnhealthy := cluster.Spec.MachineHealthCheck.MaxUnhealthy + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.MaxUnhealthy != nil { + maxUnhealthy = workerNodeGroupConfig.MachineHealthCheck.MaxUnhealthy + } + mhc.Spec.MaxUnhealthy = maxUnhealthy return mhc } diff --git a/pkg/clusterapi/machine_health_check_test.go b/pkg/clusterapi/machine_health_check_test.go index f99ee4bf342e..a4917ea22926 100644 --- a/pkg/clusterapi/machine_health_check_test.go +++ b/pkg/clusterapi/machine_health_check_test.go @@ -18,9 +18,10 @@ import ( func TestMachineHealthCheckForControlPlane(t *testing.T) { timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second} + maxUnhealthy := intstr.Parse("80%") for _, timeout := range timeouts { tt := newApiBuilerTest(t) - want := expectedMachineHealthCheckForControlPlane(timeout) + want := expectedMachineHealthCheckForControlPlane(timeout, maxUnhealthy) tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ NodeStartupTimeout: &metav1.Duration{ Duration: timeout, @@ -28,14 +29,65 @@ func TestMachineHealthCheckForControlPlane(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: timeout, }, + MaxUnhealthy: &maxUnhealthy, } got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) tt.Expect(got).To(BeComparableTo(want)) } } -func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1.MachineHealthCheck { +func TestMachineHealthCheckForControlPlaneWithTimeoutOverride(t *testing.T) { + defaultTimeout := 30 * time.Minute + cpTimeout := 60 * time.Minute maxUnhealthy := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForControlPlane(cpTimeout, maxUnhealthy) + tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: cpTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: cpTimeout, + }, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + MaxUnhealthy: &maxUnhealthy, + } + got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) + tt.Expect(got).To(BeComparableTo(want)) +} + +func TestMachineHealthCheckForControlPlaneWithMaxUnhealthyOverride(t *testing.T) { + timeout := 30 * time.Minute + defaultMaxUnhealthy := intstr.Parse("40%") + cpMaxUnhealthyOverride := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForControlPlane(timeout, cpMaxUnhealthyOverride) + tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + MaxUnhealthy: &cpMaxUnhealthyOverride, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: timeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: timeout, + }, + MaxUnhealthy: &defaultMaxUnhealthy, + } + got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) + tt.Expect(got).To(BeComparableTo(want)) +} + +func expectedMachineHealthCheckForControlPlane(timeout time.Duration, maxUnhealthy intstr.IntOrString) 
*clusterv1.MachineHealthCheck { return &clusterv1.MachineHealthCheck{ TypeMeta: metav1.TypeMeta{ APIVersion: "cluster.x-k8s.io/v1beta1", @@ -77,11 +129,12 @@ func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1 } func TestMachineHealthCheckForWorkers(t *testing.T) { + maxUnhealthy := intstr.Parse("40%") timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second} for _, timeout := range timeouts { tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(timeout, maxUnhealthy) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} - want := expectedMachineHealthCheckForWorkers(timeout) tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ NodeStartupTimeout: &metav1.Duration{ Duration: timeout, @@ -89,14 +142,67 @@ func TestMachineHealthCheckForWorkers(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: timeout, }, + MaxUnhealthy: &maxUnhealthy, } got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) tt.Expect(got).To(Equal(want)) } } -func expectedMachineHealthCheckForWorkers(timeout time.Duration) []*clusterv1.MachineHealthCheck { +func TestMachineHealthCheckForWorkersWithTimeoutOverride(t *testing.T) { + defaultTimeout := 30 * time.Minute + workerTimeout := 60 * time.Minute maxUnhealthy := intstr.Parse("40%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(workerTimeout, maxUnhealthy) + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: workerTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: workerTimeout, + }, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + MaxUnhealthy: &maxUnhealthy, + } + got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) + tt.Expect(got).To(Equal(want)) +} + +func TestMachineHealthCheckForWorkersWithMaxUnhealthyOverride(t *testing.T) { + timeout := 30 * time.Minute + defaultMaxUnhealthy := intstr.Parse("40%") + workerMaxUnhealthyOverride := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(timeout, workerMaxUnhealthyOverride) + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + MaxUnhealthy: &workerMaxUnhealthyOverride, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: timeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: timeout, + }, + MaxUnhealthy: &defaultMaxUnhealthy, + } + got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) + tt.Expect(got).To(Equal(want)) +} + +func expectedMachineHealthCheckForWorkers(timeout time.Duration, maxUnhealthy intstr.IntOrString) []*clusterv1.MachineHealthCheck { return []*clusterv1.MachineHealthCheck{ { TypeMeta: metav1.TypeMeta{ diff --git 
a/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go b/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go index 9164c2c5dec0..f89fdb240930 100644 --- a/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go +++ b/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go @@ -8,6 +8,7 @@ import ( "github.com/go-logr/logr" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -52,7 +53,7 @@ func (tt *reconcilerTest) withFakeClient() { } func newReconciler(t testing.TB) *reconcilerTest { - mhcDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy)) bundle := test.Bundle() version := test.DevEksaVersion() diff --git a/pkg/clustermanager/cluster_manager_test.go b/pkg/clustermanager/cluster_manager_test.go index adcaa18cea6a..fb5a1e399973 100644 --- a/pkg/clustermanager/cluster_manager_test.go +++ b/pkg/clustermanager/cluster_manager_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" @@ -1990,7 +1991,7 @@ metadata: namespace: eksa-system spec: clusterName: fluxTestCluster - maxUnhealthy: 40%% + maxUnhealthy: 100%% nodeStartupTimeout: %[2]s selector: matchLabels: @@ -2042,6 +2043,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { ctx := context.Background() tt := newTest(t) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, @@ -2049,6 +2051,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: constants.DefaultNodeStartupTimeout, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(constants.DefaultUnhealthyMachineTimeout, constants.DefaultNodeStartupTimeout) tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC) @@ -2061,6 +2064,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) { ctx := context.Background() tt := newTest(t) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: (30 * time.Minute), @@ -2068,6 +2072,7 @@ func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: (30 * time.Minute), }, + MaxUnhealthy: &maxUnhealthy, } tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" wantMHC := expectedMachineHealthCheck(30*time.Minute, 30*time.Minute) @@ -2082,6 +2087,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { tt := newTest(t) 
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" maxTime := time.Duration(math.MaxInt64) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: maxTime, @@ -2089,6 +2095,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: maxTime, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(maxTime, maxTime) @@ -2100,6 +2107,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { func TestInstallMachineHealthChecksApplyError(t *testing.T) { ctx := context.Background() tt := newTest(t, clustermanager.WithRetrier(retrier.NewWithMaxRetries(2, 0))) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ @@ -2108,6 +2116,7 @@ func TestInstallMachineHealthChecksApplyError(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: constants.DefaultNodeStartupTimeout, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout, clustermanager.DefaultNodeStartupTimeout) tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC).Return(errors.New("apply error")).MaxTimes(2) diff --git a/pkg/config/config.go b/pkg/config/config.go index e2f50ac85e83..62d63ea8cb6a 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -1,6 +1,10 @@ package config -import "time" +import ( + "time" + + "k8s.io/apimachinery/pkg/util/intstr" +) const ( EksaGitPassphraseTokenEnv = "EKSA_GIT_SSH_KEY_PASSPHRASE" @@ -26,12 +30,16 @@ type CreateClusterCLIConfig struct { SkipCPIPCheck bool NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString + WorkerMaxUnhealthy intstr.IntOrString } -// UpgradeClusterCLIConfig is the config we use for create cluster specific configurations. +// UpgradeClusterCLIConfig is the config we use for upgrade cluster specific configurations. type UpgradeClusterCLIConfig struct { NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString + WorkerMaxUnhealthy intstr.IntOrString } // DeleteClusterCLIConfig is the config we use for delete cluster specific configurations. diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 85526e1d5e68..3d541f200558 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -95,6 +95,10 @@ const ( DefaultNodeStartupTimeout = 10 * time.Minute // DefaultTinkerbellNodeStartupTimeout is the default node start up timeout for Tinkerbell. DefaultTinkerbellNodeStartupTimeout = 20 * time.Minute + // DefaultMaxUnhealthy is the default maxUnhealthy value for machine health checks. + DefaultMaxUnhealthy = "100%" + // DefaultWorkerMaxUnhealthy is the default maxUnhealthy value for worker node machine health checks. 
+ DefaultWorkerMaxUnhealthy = "40%" ) type Operation int diff --git a/pkg/dependencies/factory.go b/pkg/dependencies/factory.go index 8eb6e2126a54..c0a0b4342176 100644 --- a/pkg/dependencies/factory.go +++ b/pkg/dependencies/factory.go @@ -1161,7 +1161,7 @@ func (f *Factory) WithCliConfig(cliConfig *cliconfig.CliConfig) *Factory { func (f *Factory) WithCreateClusterDefaulter(createCliConfig *cliconfig.CreateClusterCLIConfig) *Factory { f.buildSteps = append(f.buildSteps, func(ctx context.Context) error { controlPlaneIPCheckAnnotationDefaulter := cluster.NewControlPlaneIPCheckAnnotationDefaulter(createCliConfig.SkipCPIPCheck) - machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(createCliConfig.NodeStartupTimeout, createCliConfig.UnhealthyMachineTimeout) + machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(createCliConfig.NodeStartupTimeout, createCliConfig.UnhealthyMachineTimeout, createCliConfig.MaxUnhealthy, createCliConfig.WorkerMaxUnhealthy) createClusterDefaulter := cli.NewCreateClusterDefaulter(controlPlaneIPCheckAnnotationDefaulter, machineHealthCheckDefaulter) @@ -1176,7 +1176,7 @@ func (f *Factory) WithCreateClusterDefaulter(createCliConfig *cliconfig.CreateCl // WithUpgradeClusterDefaulter builds a create cluster defaulter that builds defaulter dependencies specific to the create cluster command. The defaulter is then run once the factory is built in the create cluster command. func (f *Factory) WithUpgradeClusterDefaulter(upgradeCliConfig *cliconfig.UpgradeClusterCLIConfig) *Factory { f.buildSteps = append(f.buildSteps, func(ctx context.Context) error { - machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(upgradeCliConfig.NodeStartupTimeout, upgradeCliConfig.UnhealthyMachineTimeout) + machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(upgradeCliConfig.NodeStartupTimeout, upgradeCliConfig.UnhealthyMachineTimeout, upgradeCliConfig.MaxUnhealthy, upgradeCliConfig.WorkerMaxUnhealthy) upgradeClusterDefaulter := cli.NewUpgradeClusterDefaulter(machineHealthCheckDefaulter)