From 0e30882978d62f50d67730ab7da1dc93b9b9c387 Mon Sep 17 00:00:00 2001 From: Abhay Krishna Arunachalam Date: Wed, 10 Jan 2024 00:52:28 -0800 Subject: [PATCH] Make maxUnhealthy count configurable for control plane and worker machines --- cmd/eksctl-anywhere/cmd/constants.go | 1 + cmd/eksctl-anywhere/cmd/createcluster.go | 3 + cmd/eksctl-anywhere/cmd/options.go | 3 + cmd/eksctl-anywhere/cmd/upgradecluster.go | 4 +- .../anywhere.eks.amazonaws.com_clusters.yaml | 82 +++++++++++++ ...ywhere.eks.amazonaws.com_nodeupgrades.yaml | 2 - config/manifest/eksa-components.yaml | 113 ++++++++++++++++- controllers/factory.go | 3 +- pkg/api/v1alpha1/cluster_types.go | 7 ++ pkg/api/v1alpha1/zz_generated.deepcopy.go | 16 +++ pkg/cli/createclusterdefaulter_test.go | 7 +- pkg/cli/upgradeclusterdefaulter_test.go | 3 +- pkg/cluster/defaults.go | 30 ++++- pkg/cluster/defaults_test.go | 14 ++- pkg/clusterapi/machine_health_check.go | 39 ++++-- pkg/clusterapi/machine_health_check_test.go | 114 +++++++++++++++++- .../reconciler/reconciler_test.go | 3 +- pkg/clustermanager/cluster_manager_test.go | 11 +- pkg/config/config.go | 10 +- pkg/constants/constants.go | 2 + pkg/dependencies/factory.go | 4 +- 21 files changed, 433 insertions(+), 38 deletions(-) diff --git a/cmd/eksctl-anywhere/cmd/constants.go b/cmd/eksctl-anywhere/cmd/constants.go index 9f934eb3d9102..f3e9e12e29711 100644 --- a/cmd/eksctl-anywhere/cmd/constants.go +++ b/cmd/eksctl-anywhere/cmd/constants.go @@ -9,6 +9,7 @@ const ( unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout" nodeStartupTimeoutFlag = "node-startup-timeout" noTimeoutsFlag = "no-timeouts" + maxUnhealthyFlag = "max-unhealthy" ) type Operation int diff --git a/cmd/eksctl-anywhere/cmd/createcluster.go b/cmd/eksctl-anywhere/cmd/createcluster.go index 5f8735e8944c5..6815047ec9c52 100644 --- a/cmd/eksctl-anywhere/cmd/createcluster.go +++ b/cmd/eksctl-anywhere/cmd/createcluster.go @@ -12,6 +12,7 @@ import ( "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/awsiamauth" "github.com/aws/eks-anywhere/pkg/clustermanager" + "github.com/aws/eks-anywhere/pkg/constants" "github.com/aws/eks-anywhere/pkg/dependencies" "github.com/aws/eks-anywhere/pkg/executables" "github.com/aws/eks-anywhere/pkg/features" @@ -29,6 +30,7 @@ import ( type createClusterOptions struct { clusterOptions timeoutOptions + maxUnhealthy string forceClean bool skipIpCheck bool hardwareCSVPath string @@ -61,6 +63,7 @@ func init() { createCmd.AddCommand(createClusterCmd) applyClusterOptionFlags(createClusterCmd.Flags(), &cc.clusterOptions) applyTimeoutFlags(createClusterCmd.Flags(), &cc.timeoutOptions) + createClusterCmd.Flags().StringVar(&cc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage") applyTinkerbellHardwareFlag(createClusterCmd.Flags(), &cc.hardwareCSVPath) aflag.String(aflag.TinkerbellBootstrapIP, &cc.tinkerbellBootstrapIP, createClusterCmd.Flags()) createClusterCmd.Flags().BoolVar(&cc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster") diff --git a/cmd/eksctl-anywhere/cmd/options.go b/cmd/eksctl-anywhere/cmd/options.go index 5c2a794254a21..7089b0b06c137 100644 --- a/cmd/eksctl-anywhere/cmd/options.go +++ b/cmd/eksctl-anywhere/cmd/options.go @@ -9,6 +9,7 @@ import ( "time" "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/util/intstr" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" @@ -195,6 +196,7 @@ func 
buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC createCliConfig.NodeStartupTimeout = nodeStartupTimeout createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout + createCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy) return createCliConfig, nil } @@ -221,6 +223,7 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout + upgradeCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy) return &upgradeCliConfig, nil } diff --git a/cmd/eksctl-anywhere/cmd/upgradecluster.go b/cmd/eksctl-anywhere/cmd/upgradecluster.go index 34ce201beeaa4..553d58e3551bc 100644 --- a/cmd/eksctl-anywhere/cmd/upgradecluster.go +++ b/cmd/eksctl-anywhere/cmd/upgradecluster.go @@ -10,6 +10,7 @@ import ( "github.com/aws/eks-anywhere/cmd/eksctl-anywhere/cmd/aflag" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" + "github.com/aws/eks-anywhere/pkg/constants" "github.com/aws/eks-anywhere/pkg/dependencies" "github.com/aws/eks-anywhere/pkg/features" "github.com/aws/eks-anywhere/pkg/kubeconfig" @@ -26,6 +27,7 @@ import ( type upgradeClusterOptions struct { clusterOptions timeoutOptions + maxUnhealthy string wConfig string forceClean bool hardwareCSVPath string @@ -68,12 +70,12 @@ func init() { upgradeCmd.AddCommand(upgradeClusterCmd) applyClusterOptionFlags(upgradeClusterCmd.Flags(), &uc.clusterOptions) applyTimeoutFlags(upgradeClusterCmd.Flags(), &uc.timeoutOptions) + upgradeClusterCmd.Flags().StringVar(&uc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage") applyTinkerbellHardwareFlag(upgradeClusterCmd.Flags(), &uc.hardwareCSVPath) upgradeClusterCmd.Flags().StringVarP(&uc.wConfig, "w-config", "w", "", "Kubeconfig file to use when upgrading a workload cluster") upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster") hideForceCleanup(upgradeClusterCmd.Flags()) upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ","))) - aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name) tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC) } diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml index b586785563755..76b4b8e84ee41 100644 --- a/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml +++ b/config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml @@ -186,6 +186,41 @@ spec: name: type: string type: object + machineHealthCheck: + description: MachineHealthCheck is a control-plane level override + for the timeouts and maxUnhealthy specified in the top-level + MHC configuration. If not configured, the defaults in the top-level + MHC configuration are used. + properties: + maxUnhealthy: + anyOf: + - type: integer + - type: string + description: MaxUnhealthy is used to configure the maximum + number of unhealthy machines in machine health checks. This + setting applies to both control plane and worker machines. + If the number of unhealthy machines exceeds the limit set + by maxUnhealthy, further remediation will not be performed. 
+                  If not configured, the default value is set to "100%".
+                x-kubernetes-int-or-string: true
+              nodeStartupTimeout:
+                description: NodeStartupTimeout is used to configure the node
+                  startup timeout in machine health checks. It determines
+                  how long a MachineHealthCheck should wait for a Node to
+                  join the cluster, before considering a Machine unhealthy.
+                  If not configured, the default value is set to "10m0s" (10
+                  minutes) for all providers. For Tinkerbell provider the
+                  default is "20m0s".
+                type: string
+              unhealthyMachineTimeout:
+                description: UnhealthyMachineTimeout is used to configure
+                  the unhealthy machine timeout in machine health checks.
+                  If any unhealthy conditions are met for the amount of time
+                  specified as the timeout, the machines are considered unhealthy.
+                  If not configured, the default value is set to "5m0s" (5
+                  minutes).
+                type: string
+            type: object
           skipLoadBalancerDeployment:
             description: SkipLoadBalancerDeployment skip deploying control
               plane load balancer. Make sure your infrastructure can handle
@@ -342,6 +377,17 @@ spec:
               to wait to remediate unhealthy machine or determine health of nodes'
               machines.
             properties:
+              maxUnhealthy:
+                anyOf:
+                - type: integer
+                - type: string
+                description: MaxUnhealthy is used to configure the maximum number
+                  of unhealthy machines in machine health checks. This setting
+                  applies to both control plane and worker machines. If the number
+                  of unhealthy machines exceeds the limit set by maxUnhealthy,
+                  further remediation will not be performed. If not configured,
+                  the default value is set to "100%".
+                x-kubernetes-int-or-string: true
               nodeStartupTimeout:
                 description: NodeStartupTimeout is used to configure the node
                   startup timeout in machine health checks. It determines how
@@ -534,6 +580,42 @@ spec:
                     name:
                       type: string
                   type: object
+                machineHealthCheck:
+                  description: MachineHealthCheck is a worker node group level
+                    override for the timeouts and maxUnhealthy specified in the
+                    top-level MHC configuration. If not configured, the defaults
+                    in the top-level MHC configuration are used.
+                  properties:
+                    maxUnhealthy:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      description: MaxUnhealthy is used to configure the maximum
+                        number of unhealthy machines in machine health checks.
+                        This setting applies to both control plane and worker
+                        machines. If the number of unhealthy machines exceeds
+                        the limit set by maxUnhealthy, further remediation will
+                        not be performed. If not configured, the default value
+                        is set to "100%".
+                      x-kubernetes-int-or-string: true
+                    nodeStartupTimeout:
+                      description: NodeStartupTimeout is used to configure the
+                        node startup timeout in machine health checks. It determines
+                        how long a MachineHealthCheck should wait for a Node to
+                        join the cluster, before considering a Machine unhealthy.
+                        If not configured, the default value is set to "10m0s"
+                        (10 minutes) for all providers. For Tinkerbell provider
+                        the default is "20m0s".
+                      type: string
+                    unhealthyMachineTimeout:
+                      description: UnhealthyMachineTimeout is used to configure
+                        the unhealthy machine timeout in machine health checks.
+                        If any unhealthy conditions are met for the amount of
+                        time specified as the timeout, the machines are considered
+                        unhealthy. If not configured, the default value is set
+                        to "5m0s" (5 minutes).
+ type: string + type: object name: description: Name refers to the name of the worker node group type: string diff --git a/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml b/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml index 22edf5243746b..90ea218b0ab51 100644 --- a/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml +++ b/config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml @@ -46,8 +46,6 @@ spec: nodes and ignored for worker nodes. type: boolean kubernetesVersion: - description: 'TODO(in-place): Determine if there''s a way to get these - dynamically instead of expecting it from the CRD.' type: string machine: description: Machine is a reference to the CAPI Machine that needs diff --git a/config/manifest/eksa-components.yaml b/config/manifest/eksa-components.yaml index 90e4fa38eddcc..5b86fbb080a93 100644 --- a/config/manifest/eksa-components.yaml +++ b/config/manifest/eksa-components.yaml @@ -3740,17 +3740,42 @@ spec: in the cluster properties: cilium: + description: CiliumConfig contains configuration specific + to the Cilium CNI. properties: egressMasqueradeInterfaces: description: EgressMasquaradeInterfaces determines which network interfaces are used for masquerading. Accepted values are a valid interface name or interface prefix. type: string + ipv4NativeRoutingCIDR: + description: IPv4NativeRoutingCIDR specifies the CIDR + to use when RoutingMode is set to direct. When specified, + Cilium assumes networking for this CIDR is preconfigured + and hands traffic destined for that range to the Linux + network stack without applying any SNAT. If this is + not set autoDirectNodeRoutes will be set to true + type: string + ipv6NativeRoutingCIDR: + description: IPv6NativeRoutingCIDR specifies the IPv6 + CIDR to use when RoutingMode is set to direct. When + specified, Cilium assumes networking for this CIDR is + preconfigured and hands traffic destined for that range + to the Linux network stack without applying any SNAT. + If this is not set autoDirectNodeRoutes will be set + to true + type: string policyEnforcementMode: description: PolicyEnforcementMode determines communication allowed between pods. Accepted values are default, always, never. type: string + routingMode: + description: RoutingMode indicates the routing tunnel + mode to use for Cilium. Accepted values are overlay + (geneve tunnel with overlay) or direct (tunneling disabled + with direct routing) Defaults to overlay. + type: string skipUpgrade: description: SkipUpgrade indicicates that Cilium maintenance should be skipped during upgrades. This can be used @@ -3758,6 +3783,8 @@ spec: type: boolean type: object kindnetd: + description: KindnetdConfig contains configuration specific + to the Kindnetd CNI. type: object type: object dns: @@ -3833,6 +3860,41 @@ spec: name: type: string type: object + machineHealthCheck: + description: MachineHealthCheck is a control-plane level override + for the timeouts and maxUnhealthy specified in the top-level + MHC configuration. If not configured, the defaults in the top-level + MHC configuration are used. + properties: + maxUnhealthy: + anyOf: + - type: integer + - type: string + description: MaxUnhealthy is used to configure the maximum + number of unhealthy machines in machine health checks. This + setting applies to both control plane and worker machines. + If the number of unhealthy machines exceeds the limit set + by maxUnhealthy, further remediation will not be performed. + If not configured, the default value is set to "100%". 
+                x-kubernetes-int-or-string: true
+              nodeStartupTimeout:
+                description: NodeStartupTimeout is used to configure the node
+                  startup timeout in machine health checks. It determines
+                  how long a MachineHealthCheck should wait for a Node to
+                  join the cluster, before considering a Machine unhealthy.
+                  If not configured, the default value is set to "10m0s" (10
+                  minutes) for all providers. For Tinkerbell provider the
+                  default is "20m0s".
+                type: string
+              unhealthyMachineTimeout:
+                description: UnhealthyMachineTimeout is used to configure
+                  the unhealthy machine timeout in machine health checks.
+                  If any unhealthy conditions are met for the amount of time
+                  specified as the timeout, the machines are considered unhealthy.
+                  If not configured, the default value is set to "5m0s" (5
+                  minutes).
+                type: string
+            type: object
           skipLoadBalancerDeployment:
             description: SkipLoadBalancerDeployment skip deploying control
               plane load balancer. Make sure your infrastructure can handle
@@ -3989,6 +4051,17 @@ spec:
               to wait to remediate unhealthy machine or determine health of nodes'
               machines.
             properties:
+              maxUnhealthy:
+                anyOf:
+                - type: integer
+                - type: string
+                description: MaxUnhealthy is used to configure the maximum number
+                  of unhealthy machines in machine health checks. This setting
+                  applies to both control plane and worker machines. If the number
+                  of unhealthy machines exceeds the limit set by maxUnhealthy,
+                  further remediation will not be performed. If not configured,
+                  the default value is set to "100%".
+                x-kubernetes-int-or-string: true
               nodeStartupTimeout:
                 description: NodeStartupTimeout is used to configure the node
                   startup timeout in machine health checks. It determines how
@@ -4181,6 +4254,42 @@ spec:
                     name:
                       type: string
                   type: object
+                machineHealthCheck:
+                  description: MachineHealthCheck is a worker node group level
+                    override for the timeouts and maxUnhealthy specified in the
+                    top-level MHC configuration. If not configured, the defaults
+                    in the top-level MHC configuration are used.
+                  properties:
+                    maxUnhealthy:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      description: MaxUnhealthy is used to configure the maximum
+                        number of unhealthy machines in machine health checks.
+                        This setting applies to both control plane and worker
+                        machines. If the number of unhealthy machines exceeds
+                        the limit set by maxUnhealthy, further remediation will
+                        not be performed. If not configured, the default value
+                        is set to "100%".
+                      x-kubernetes-int-or-string: true
+                    nodeStartupTimeout:
+                      description: NodeStartupTimeout is used to configure the
+                        node startup timeout in machine health checks. It determines
+                        how long a MachineHealthCheck should wait for a Node to
+                        join the cluster, before considering a Machine unhealthy.
+                        If not configured, the default value is set to "10m0s"
+                        (10 minutes) for all providers. For Tinkerbell provider
+                        the default is "20m0s".
+                      type: string
+                    unhealthyMachineTimeout:
+                      description: UnhealthyMachineTimeout is used to configure
+                        the unhealthy machine timeout in machine health checks.
+                        If any unhealthy conditions are met for the amount of
+                        time specified as the timeout, the machines are considered
+                        unhealthy. If not configured, the default value is set
+                        to "5m0s" (5 minutes).
+ type: string + type: object name: description: Name refers to the name of the worker node group type: string @@ -4440,10 +4549,6 @@ spec: upgraded: format: int64 type: integer - required: - - ready - - requireUpgrade - - upgraded type: object type: object served: true diff --git a/controllers/factory.go b/controllers/factory.go index 9905f2281521d..127c56b1eef12 100644 --- a/controllers/factory.go +++ b/controllers/factory.go @@ -5,6 +5,7 @@ import ( "github.com/go-logr/logr" "github.com/google/uuid" + "k8s.io/apimachinery/pkg/util/intstr" clusterctlv1 "sigs.k8s.io/cluster-api/cmd/clusterctl/api/v1alpha3" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -577,7 +578,7 @@ func (f *Factory) withMachineHealthCheckReconciler() *Factory { return nil } - machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) f.machineHealthCheckReconciler = mhcreconciler.New( f.manager.GetClient(), diff --git a/pkg/api/v1alpha1/cluster_types.go b/pkg/api/v1alpha1/cluster_types.go index fa0b107b42d88..5fa6266461346 100644 --- a/pkg/api/v1alpha1/cluster_types.go +++ b/pkg/api/v1alpha1/cluster_types.go @@ -8,6 +8,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/aws/eks-anywhere/pkg/logger" @@ -285,6 +286,8 @@ type ControlPlaneConfiguration struct { // CertSANs is a slice of domain names or IPs to be added as Subject Name Alternatives of the // Kube API Servers Certificate. CertSANs []string `json:"certSans,omitempty"` + // MachineHealthCheck is a control-plane level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used. + MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"` } // MachineHealthCheck allows to configure timeouts for machine health checks. Machine Health Checks are responsible for remediating unhealthy Machines. @@ -294,6 +297,8 @@ type MachineHealthCheck struct { NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"` // UnhealthyMachineTimeout is used to configure the unhealthy machine timeout in machine health checks. If any unhealthy conditions are met for the amount of time specified as the timeout, the machines are considered unhealthy. If not configured, the default value is set to "5m0s" (5 minutes). UnhealthyMachineTimeout *metav1.Duration `json:"unhealthyMachineTimeout,omitempty"` + // MaxUnhealthy is used to configure the maximum number of unhealthy machines in machine health checks. This setting applies to both control plane and worker machines. If the number of unhealthy machines exceeds the limit set by maxUnhealthy, further remediation will not be performed. If not configured, the default value is set to "100%". + MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"` } func TaintsSliceEqual(s1, s2 []corev1.Taint) bool { @@ -429,6 +434,8 @@ type WorkerNodeGroupConfiguration struct { UpgradeRolloutStrategy *WorkerNodesUpgradeRolloutStrategy `json:"upgradeRolloutStrategy,omitempty"` // KuberenetesVersion defines the version for worker nodes. 
If not set, the top level spec kubernetesVersion will be used.
 	KubernetesVersion *KubernetesVersion `json:"kubernetesVersion,omitempty"`
+	// MachineHealthCheck is a worker node group level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used.
+	MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"`
 }
 // Equal compares two WorkerNodeGroupConfigurations.
diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go
index fc30d2f05b93b..00e62967e609c 100644
--- a/pkg/api/v1alpha1/zz_generated.deepcopy.go
+++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go
@@ -24,6 +24,7 @@ import (
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"sigs.k8s.io/cluster-api/api/v1beta1"
 	apiv1beta1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
 )
@@ -872,6 +873,11 @@ func (in *ControlPlaneConfiguration) DeepCopyInto(out *ControlPlaneConfiguration
 		*out = make([]string, len(*in))
 		copy(*out, *in)
 	}
+	if in.MachineHealthCheck != nil {
+		in, out := &in.MachineHealthCheck, &out.MachineHealthCheck
+		*out = new(MachineHealthCheck)
+		(*in).DeepCopyInto(*out)
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControlPlaneConfiguration.
@@ -1707,6 +1713,11 @@ func (in *MachineHealthCheck) DeepCopyInto(out *MachineHealthCheck) {
 		*out = new(metav1.Duration)
 		**out = **in
 	}
+	if in.MaxUnhealthy != nil {
+		in, out := &in.MaxUnhealthy, &out.MaxUnhealthy
+		*out = new(intstr.IntOrString)
+		**out = **in
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineHealthCheck.
@@ -3460,6 +3471,11 @@ func (in *WorkerNodeGroupConfiguration) DeepCopyInto(out *WorkerNodeGroupConfigu
 		*out = new(KubernetesVersion)
 		**out = **in
 	}
+	if in.MachineHealthCheck != nil {
+		in, out := &in.MachineHealthCheck, &out.MachineHealthCheck
+		*out = new(MachineHealthCheck)
+		(*in).DeepCopyInto(*out)
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerNodeGroupConfiguration.
diff --git a/pkg/cli/createclusterdefaulter_test.go b/pkg/cli/createclusterdefaulter_test.go
index 8a2e5ee2de49c..6a6d8c7cc4980 100644
--- a/pkg/cli/createclusterdefaulter_test.go
+++ b/pkg/cli/createclusterdefaulter_test.go
@@ -6,6 +6,7 @@ import (
 	.
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cli" @@ -20,7 +21,7 @@ func TestNewCreateClusterDefaulter(t *testing.T) { skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(false) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) r := defaulting.NewRunner[*cluster.Spec]() r.Register( @@ -43,7 +44,7 @@ func TestRunWithoutSkipIPAnnotation(t *testing.T) { }, } skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(false) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) createClusterDefaulter := cli.NewCreateClusterDefaulter(skipIPCheck, mhcDefaulter) clusterSpec, err := createClusterDefaulter.Run(context.Background(), clusterSpec) @@ -66,7 +67,7 @@ func TestRunWithSkipIPAnnotation(t *testing.T) { } skipIPCheck := cluster.NewControlPlaneIPCheckAnnotationDefaulter(true) - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) createClusterDefaulter := cli.NewCreateClusterDefaulter(skipIPCheck, mhcDefaulter) clusterSpec, err := createClusterDefaulter.Run(context.Background(), clusterSpec) diff --git a/pkg/cli/upgradeclusterdefaulter_test.go b/pkg/cli/upgradeclusterdefaulter_test.go index 518726d82195f..ed000736a7a1a 100644 --- a/pkg/cli/upgradeclusterdefaulter_test.go +++ b/pkg/cli/upgradeclusterdefaulter_test.go @@ -5,6 +5,7 @@ import ( "testing" . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/util/intstr" "github.com/aws/eks-anywhere/pkg/cli" "github.com/aws/eks-anywhere/pkg/cluster" @@ -21,7 +22,7 @@ func TestRunUpgradeClusterDefaulter(t *testing.T) { Cluster: c, }, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) upgradeClusterDefaulter := cli.NewUpgradeClusterDefaulter(mhcDefaulter) clusterSpec, err := upgradeClusterDefaulter.Run(context.Background(), clusterSpec) diff --git a/pkg/cluster/defaults.go b/pkg/cluster/defaults.go index e0518c0355ded..d407a8ba25991 100644 --- a/pkg/cluster/defaults.go +++ b/pkg/cluster/defaults.go @@ -5,6 +5,7 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/constants" @@ -39,24 +40,27 @@ func (d ControlPlaneIPCheckAnnotationDefaulter) ControlPlaneIPCheckDefault(ctx c type MachineHealthCheckDefaulter struct { NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString } // NewMachineHealthCheckDefaulter allows to create a new MachineHealthCheckDefaulter. -func NewMachineHealthCheckDefaulter(nodeStartupTimeout, unhealthyMachineTimeout time.Duration) MachineHealthCheckDefaulter { +func NewMachineHealthCheckDefaulter(nodeStartupTimeout, unhealthyMachineTimeout time.Duration, maxUnhealthy intstr.IntOrString) MachineHealthCheckDefaulter { return MachineHealthCheckDefaulter{ NodeStartupTimeout: nodeStartupTimeout, UnhealthyMachineTimeout: unhealthyMachineTimeout, + MaxUnhealthy: maxUnhealthy, } } -// MachineHealthCheckDefault sets the defaults for machine health check timeouts. +// MachineHealthCheckDefault sets the defaults for machine health check timeouts and maxUnhealthy. func (d MachineHealthCheckDefaulter) MachineHealthCheckDefault(ctx context.Context, spec *Spec) (*Spec, error) { SetMachineHealthCheckTimeoutDefaults(spec.Cluster, d.NodeStartupTimeout, d.UnhealthyMachineTimeout) + SetMachineHealthCheckMaxUnhealthyDefaults(spec.Cluster, d.MaxUnhealthy) return spec, nil } -// SetMachineHealthCheckTimeoutDefaults sests defaults for mhcs in the EKSA cluster object based on the input. +// SetMachineHealthCheckTimeoutDefaults sets default timeouts for MHCs in the EKSA cluster object based on the input. func SetMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStartupTimeout, unhealthyMachineTimeout time.Duration) { if cluster.Spec.MachineHealthCheck != nil && cluster.Spec.MachineHealthCheck.NodeStartupTimeout != nil && cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout != nil { return @@ -75,6 +79,19 @@ func SetMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStart setMachineHealthCheckTimeoutDefaults(cluster, nodeStartupTimeout, unhealthyMachineTimeout) } +// SetMachineHealthCheckMaxUnhealthyDefaults sets defaults maxUnhealthy for MHCs in the EKSA cluster object based on the input. 
+func SetMachineHealthCheckMaxUnhealthyDefaults(cluster *anywherev1.Cluster, maxUnhealthy intstr.IntOrString) { + if cluster.Spec.MachineHealthCheck != nil && cluster.Spec.MachineHealthCheck.MaxUnhealthy != nil { + return + } + + if cluster.Spec.MachineHealthCheck == nil { + cluster.Spec.MachineHealthCheck = &anywherev1.MachineHealthCheck{} + } + + setMachineHealthCheckMaxUnhealthyDefaults(cluster, maxUnhealthy) +} + // setMachineHealthCheckTimeoutDefaults sets default timeout values for cluster's machine health checks. func setMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStartupTimeout, unhealthyMachineTimeout time.Duration) { if cluster.Spec.MachineHealthCheck.NodeStartupTimeout == nil { @@ -88,3 +105,10 @@ func setMachineHealthCheckTimeoutDefaults(cluster *anywherev1.Cluster, nodeStart } } } + +// setMachineHealthCheckMaxUnhealthyDefaults sets default maxUnhealthy values for cluster's machine health checks. +func setMachineHealthCheckMaxUnhealthyDefaults(cluster *anywherev1.Cluster, maxUnhealthy intstr.IntOrString) { + if cluster.Spec.MachineHealthCheck.MaxUnhealthy == nil { + cluster.Spec.MachineHealthCheck.MaxUnhealthy = &maxUnhealthy + } +} diff --git a/pkg/cluster/defaults_test.go b/pkg/cluster/defaults_test.go index da30d4b5a7cbf..9691d4c187917 100644 --- a/pkg/cluster/defaults_test.go +++ b/pkg/cluster/defaults_test.go @@ -7,6 +7,7 @@ import ( . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" @@ -98,8 +99,9 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { g := NewWithT(t) timeout := 15 * time.Minute + maxUnhealthy := intstr.Parse("100%") - newMachineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(timeout, timeout) + newMachineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(timeout, timeout, intstr.Parse(constants.DefaultMaxUnhealthy)) c := baseCluster() machineHealthcheck := &anywherev1.MachineHealthCheck{ @@ -109,6 +111,7 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: 15 * time.Minute, }, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ @@ -126,10 +129,11 @@ func TestNewMachineHealthCheckDefaulter(t *testing.T) { func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { g := NewWithT(t) + maxUnhealthy := intstr.Parse("100%") unhealthyTimeout := metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, } - mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) c := baseCluster() c.Spec.DatacenterRef.Kind = anywherev1.TinkerbellDatacenterKind @@ -138,6 +142,7 @@ func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { Duration: constants.DefaultTinkerbellNodeStartupTimeout, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ @@ -154,10 +159,11 @@ func TestNewMachineHealthCheckDefaulterTinkerbell(t *testing.T) { func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { g := NewWithT(t) + maxUnhealthy := intstr.Parse("100%") unhealthyTimeout := metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, } - mhcDefaulter := 
cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := cluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) c := baseCluster() c.Spec.MachineHealthCheck = &anywherev1.MachineHealthCheck{ @@ -165,6 +171,7 @@ func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { Duration: 5 * time.Minute, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec := &cluster.Spec{ Config: &cluster.Config{ @@ -177,6 +184,7 @@ func TestNewMachineHealthCheckDefaulterNoChange(t *testing.T) { Duration: 5 * time.Minute, }, UnhealthyMachineTimeout: &unhealthyTimeout, + MaxUnhealthy: &maxUnhealthy, } clusterSpec, err := mhcDefaulter.MachineHealthCheckDefault(context.Background(), clusterSpec) diff --git a/pkg/clusterapi/machine_health_check.go b/pkg/clusterapi/machine_health_check.go index 3f7d210bf9cbd..711ec97d61c00 100644 --- a/pkg/clusterapi/machine_health_check.go +++ b/pkg/clusterapi/machine_health_check.go @@ -3,7 +3,6 @@ package clusterapi import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" @@ -12,9 +11,7 @@ import ( ) const ( - machineHealthCheckKind = "MachineHealthCheck" - maxUnhealthyControlPlane = "100%" - maxUnhealthyWorker = "40%" + machineHealthCheckKind = "MachineHealthCheck" ) func machineHealthCheck(clusterName string, unhealthyTimeout, nodeStartupTimeout *metav1.Duration) *clusterv1.MachineHealthCheck { @@ -51,11 +48,22 @@ func machineHealthCheck(clusterName string, unhealthyTimeout, nodeStartupTimeout // MachineHealthCheckForControlPlane creates MachineHealthCheck resources for the control plane. 
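+// Timeouts and maxUnhealthy set on the control plane configuration's machineHealthCheck
+// take precedence over the top-level machineHealthCheck values.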
func MachineHealthCheckForControlPlane(cluster *v1alpha1.Cluster) *clusterv1.MachineHealthCheck { - mhc := machineHealthCheck(ClusterName(cluster), cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout, cluster.Spec.MachineHealthCheck.NodeStartupTimeout) + unhealthyMachineTimeout := cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.UnhealthyMachineTimeout != nil { + unhealthyMachineTimeout = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.UnhealthyMachineTimeout + } + nodeStartupTimeout := cluster.Spec.MachineHealthCheck.NodeStartupTimeout + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.NodeStartupTimeout != nil { + nodeStartupTimeout = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.NodeStartupTimeout + } + mhc := machineHealthCheck(ClusterName(cluster), unhealthyMachineTimeout, nodeStartupTimeout) mhc.SetName(ControlPlaneMachineHealthCheckName(cluster)) mhc.Spec.Selector.MatchLabels[clusterv1.MachineControlPlaneLabel] = "" - maxUnhealthy := intstr.Parse(maxUnhealthyControlPlane) - mhc.Spec.MaxUnhealthy = &maxUnhealthy + maxUnhealthy := cluster.Spec.MachineHealthCheck.MaxUnhealthy + if cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck != nil && cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy != nil { + maxUnhealthy = cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck.MaxUnhealthy + } + mhc.Spec.MaxUnhealthy = maxUnhealthy return mhc } @@ -70,11 +78,22 @@ func MachineHealthCheckForWorkers(cluster *v1alpha1.Cluster) []*clusterv1.Machin } func machineHealthCheckForWorker(cluster *v1alpha1.Cluster, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration) *clusterv1.MachineHealthCheck { - mhc := machineHealthCheck(ClusterName(cluster), cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout, cluster.Spec.MachineHealthCheck.NodeStartupTimeout) + unhealthyMachineTimeout := cluster.Spec.MachineHealthCheck.UnhealthyMachineTimeout + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.UnhealthyMachineTimeout != nil { + unhealthyMachineTimeout = workerNodeGroupConfig.MachineHealthCheck.UnhealthyMachineTimeout + } + nodeStartupTimeout := cluster.Spec.MachineHealthCheck.NodeStartupTimeout + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.NodeStartupTimeout != nil { + nodeStartupTimeout = workerNodeGroupConfig.MachineHealthCheck.NodeStartupTimeout + } + mhc := machineHealthCheck(ClusterName(cluster), unhealthyMachineTimeout, nodeStartupTimeout) mhc.SetName(WorkerMachineHealthCheckName(cluster, workerNodeGroupConfig)) mhc.Spec.Selector.MatchLabels[clusterv1.MachineDeploymentNameLabel] = MachineDeploymentName(cluster, workerNodeGroupConfig) - maxUnhealthy := intstr.Parse(maxUnhealthyWorker) - mhc.Spec.MaxUnhealthy = &maxUnhealthy + maxUnhealthy := cluster.Spec.MachineHealthCheck.MaxUnhealthy + if workerNodeGroupConfig.MachineHealthCheck != nil && workerNodeGroupConfig.MachineHealthCheck.MaxUnhealthy != nil { + maxUnhealthy = workerNodeGroupConfig.MachineHealthCheck.MaxUnhealthy + } + mhc.Spec.MaxUnhealthy = maxUnhealthy return mhc } diff --git a/pkg/clusterapi/machine_health_check_test.go b/pkg/clusterapi/machine_health_check_test.go index f99ee4bf342ef..a4917ea229260 100644 --- a/pkg/clusterapi/machine_health_check_test.go +++ 
b/pkg/clusterapi/machine_health_check_test.go @@ -18,9 +18,10 @@ import ( func TestMachineHealthCheckForControlPlane(t *testing.T) { timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second} + maxUnhealthy := intstr.Parse("80%") for _, timeout := range timeouts { tt := newApiBuilerTest(t) - want := expectedMachineHealthCheckForControlPlane(timeout) + want := expectedMachineHealthCheckForControlPlane(timeout, maxUnhealthy) tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ NodeStartupTimeout: &metav1.Duration{ Duration: timeout, @@ -28,14 +29,65 @@ func TestMachineHealthCheckForControlPlane(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: timeout, }, + MaxUnhealthy: &maxUnhealthy, } got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) tt.Expect(got).To(BeComparableTo(want)) } } -func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1.MachineHealthCheck { +func TestMachineHealthCheckForControlPlaneWithTimeoutOverride(t *testing.T) { + defaultTimeout := 30 * time.Minute + cpTimeout := 60 * time.Minute maxUnhealthy := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForControlPlane(cpTimeout, maxUnhealthy) + tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: cpTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: cpTimeout, + }, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + MaxUnhealthy: &maxUnhealthy, + } + got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) + tt.Expect(got).To(BeComparableTo(want)) +} + +func TestMachineHealthCheckForControlPlaneWithMaxUnhealthyOverride(t *testing.T) { + timeout := 30 * time.Minute + defaultMaxUnhealthy := intstr.Parse("40%") + cpMaxUnhealthyOverride := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForControlPlane(timeout, cpMaxUnhealthyOverride) + tt.clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + MaxUnhealthy: &cpMaxUnhealthyOverride, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: timeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: timeout, + }, + MaxUnhealthy: &defaultMaxUnhealthy, + } + got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec.Cluster) + tt.Expect(got).To(BeComparableTo(want)) +} + +func expectedMachineHealthCheckForControlPlane(timeout time.Duration, maxUnhealthy intstr.IntOrString) *clusterv1.MachineHealthCheck { return &clusterv1.MachineHealthCheck{ TypeMeta: metav1.TypeMeta{ APIVersion: "cluster.x-k8s.io/v1beta1", @@ -77,11 +129,12 @@ func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1 } func TestMachineHealthCheckForWorkers(t *testing.T) { + maxUnhealthy := intstr.Parse("40%") timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second} for _, timeout := range timeouts { tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(timeout, maxUnhealthy) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} - want := 
expectedMachineHealthCheckForWorkers(timeout) tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ NodeStartupTimeout: &metav1.Duration{ Duration: timeout, @@ -89,14 +142,67 @@ func TestMachineHealthCheckForWorkers(t *testing.T) { UnhealthyMachineTimeout: &metav1.Duration{ Duration: timeout, }, + MaxUnhealthy: &maxUnhealthy, } got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) tt.Expect(got).To(Equal(want)) } } -func expectedMachineHealthCheckForWorkers(timeout time.Duration) []*clusterv1.MachineHealthCheck { +func TestMachineHealthCheckForWorkersWithTimeoutOverride(t *testing.T) { + defaultTimeout := 30 * time.Minute + workerTimeout := 60 * time.Minute maxUnhealthy := intstr.Parse("40%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(workerTimeout, maxUnhealthy) + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: workerTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: workerTimeout, + }, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: defaultTimeout, + }, + MaxUnhealthy: &maxUnhealthy, + } + got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) + tt.Expect(got).To(Equal(want)) +} + +func TestMachineHealthCheckForWorkersWithMaxUnhealthyOverride(t *testing.T) { + timeout := 30 * time.Minute + defaultMaxUnhealthy := intstr.Parse("40%") + workerMaxUnhealthyOverride := intstr.Parse("100%") + + tt := newApiBuilerTest(t) + want := expectedMachineHealthCheckForWorkers(timeout, workerMaxUnhealthyOverride) + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig} + tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + MaxUnhealthy: &workerMaxUnhealthyOverride, + } + tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ + NodeStartupTimeout: &metav1.Duration{ + Duration: timeout, + }, + UnhealthyMachineTimeout: &metav1.Duration{ + Duration: timeout, + }, + MaxUnhealthy: &defaultMaxUnhealthy, + } + got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec.Cluster) + tt.Expect(got).To(Equal(want)) +} + +func expectedMachineHealthCheckForWorkers(timeout time.Duration, maxUnhealthy intstr.IntOrString) []*clusterv1.MachineHealthCheck { return []*clusterv1.MachineHealthCheck{ { TypeMeta: metav1.TypeMeta{ diff --git a/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go b/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go index 9164c2c5dec0f..085fcdd5c9f63 100644 --- a/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go +++ b/pkg/clusterapi/machinehealthcheck/reconciler/reconciler_test.go @@ -8,6 +8,7 @@ import ( "github.com/go-logr/logr" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -52,7 +53,7 @@ func (tt *reconcilerTest) withFakeClient() { } func newReconciler(t testing.TB) *reconcilerTest { - mhcDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout) + mhcDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy)) bundle := test.Bundle() version := test.DevEksaVersion() diff --git a/pkg/clustermanager/cluster_manager_test.go b/pkg/clustermanager/cluster_manager_test.go index 9f35826273fb4..b135a2db741d9 100644 --- a/pkg/clustermanager/cluster_manager_test.go +++ b/pkg/clustermanager/cluster_manager_test.go @@ -13,6 +13,7 @@ import ( . "github.com/onsi/gomega" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" @@ -1874,7 +1875,7 @@ metadata: namespace: eksa-system spec: clusterName: fluxTestCluster - maxUnhealthy: 40%% + maxUnhealthy: 100%% nodeStartupTimeout: %[2]s selector: matchLabels: @@ -1926,6 +1927,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { ctx := context.Background() tt := newTest(t) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: constants.DefaultUnhealthyMachineTimeout, @@ -1933,6 +1935,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: constants.DefaultNodeStartupTimeout, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(constants.DefaultUnhealthyMachineTimeout, constants.DefaultNodeStartupTimeout) tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC) @@ -1945,6 +1948,7 @@ func TestInstallMachineHealthChecks(t *testing.T) { func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) { ctx := context.Background() tt := newTest(t) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: (30 * time.Minute), @@ -1952,6 +1956,7 @@ func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: (30 * time.Minute), }, + MaxUnhealthy: &maxUnhealthy, } tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" wantMHC := expectedMachineHealthCheck(30*time.Minute, 30*time.Minute) @@ -1966,6 +1971,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { tt := newTest(t) tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" maxTime := time.Duration(math.MaxInt64) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ Duration: maxTime, @@ -1973,6 +1979,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: maxTime, }, 
+ MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(maxTime, maxTime) @@ -1984,6 +1991,7 @@ func TestInstallMachineHealthChecksWithNoTimeout(t *testing.T) { func TestInstallMachineHealthChecksApplyError(t *testing.T) { ctx := context.Background() tt := newTest(t, clustermanager.WithRetrier(retrier.NewWithMaxRetries(2, 0))) + maxUnhealthy := intstr.Parse("100%") tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1" tt.clusterSpec.Cluster.Spec.MachineHealthCheck = &v1alpha1.MachineHealthCheck{ UnhealthyMachineTimeout: &metav1.Duration{ @@ -1992,6 +2000,7 @@ func TestInstallMachineHealthChecksApplyError(t *testing.T) { NodeStartupTimeout: &metav1.Duration{ Duration: constants.DefaultNodeStartupTimeout, }, + MaxUnhealthy: &maxUnhealthy, } wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout, clustermanager.DefaultNodeStartupTimeout) tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC).Return(errors.New("apply error")).MaxTimes(2) diff --git a/pkg/config/config.go b/pkg/config/config.go index 5efcb872bbcfc..a5eb7bb49e389 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -1,6 +1,10 @@ package config -import "time" +import ( + "time" + + "k8s.io/apimachinery/pkg/util/intstr" +) const ( EksaGitPassphraseTokenEnv = "EKSA_GIT_SSH_KEY_PASSPHRASE" @@ -26,10 +30,12 @@ type CreateClusterCLIConfig struct { SkipCPIPCheck bool NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString } -// UpgradeClusterCLIConfig is the config we use for create cluster specific configurations. +// UpgradeClusterCLIConfig is the config we use for upgrade cluster specific configurations. type UpgradeClusterCLIConfig struct { NodeStartupTimeout time.Duration UnhealthyMachineTimeout time.Duration + MaxUnhealthy intstr.IntOrString } diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 605aeff0cfcfc..1d4ca506953e4 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -90,6 +90,8 @@ const ( DefaultNodeStartupTimeout = 10 * time.Minute // DefaultTinkerbellNodeStartupTimeout is the default node start up timeout for Tinkerbell. DefaultTinkerbellNodeStartupTimeout = 20 * time.Minute + // DefaultMaxUnhealthy is the default maxUnhealthy value for machine health checks. 
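+	// With "100%", remediation is never short-circuited by the number of machines that are currently unhealthy.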
+	DefaultMaxUnhealthy = "100%"
 )
 type Operation int
diff --git a/pkg/dependencies/factory.go b/pkg/dependencies/factory.go
index 1302fac4c7b9e..6a03a787a8764 100644
--- a/pkg/dependencies/factory.go
+++ b/pkg/dependencies/factory.go
@@ -1082,7 +1082,7 @@ func (f *Factory) WithCliConfig(cliConfig *cliconfig.CliConfig) *Factory {
 func (f *Factory) WithCreateClusterDefaulter(createCliConfig *cliconfig.CreateClusterCLIConfig) *Factory {
 	f.buildSteps = append(f.buildSteps, func(ctx context.Context) error {
 		controlPlaneIPCheckAnnotationDefaulter := cluster.NewControlPlaneIPCheckAnnotationDefaulter(createCliConfig.SkipCPIPCheck)
-		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(createCliConfig.NodeStartupTimeout, createCliConfig.UnhealthyMachineTimeout)
+		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(createCliConfig.NodeStartupTimeout, createCliConfig.UnhealthyMachineTimeout, createCliConfig.MaxUnhealthy)
 		createClusterDefaulter := cli.NewCreateClusterDefaulter(controlPlaneIPCheckAnnotationDefaulter, machineHealthCheckDefaulter)
@@ -1097,7 +1097,7 @@
-// WithUpgradeClusterDefaulter builds a create cluster defaulter that builds defaulter dependencies specific to the create cluster command. The defaulter is then run once the factory is built in the create cluster command.
+// WithUpgradeClusterDefaulter builds an upgrade cluster defaulter that builds defaulter dependencies specific to the upgrade cluster command. The defaulter is then run once the factory is built in the upgrade cluster command.
 func (f *Factory) WithUpgradeClusterDefaulter(upgradeCliConfig *cliconfig.UpgradeClusterCLIConfig) *Factory {
 	f.buildSteps = append(f.buildSteps, func(ctx context.Context) error {
-		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(upgradeCliConfig.NodeStartupTimeout, upgradeCliConfig.UnhealthyMachineTimeout)
+		machineHealthCheckDefaulter := cluster.NewMachineHealthCheckDefaulter(upgradeCliConfig.NodeStartupTimeout, upgradeCliConfig.UnhealthyMachineTimeout, upgradeCliConfig.MaxUnhealthy)
 		upgradeClusterDefaulter := cli.NewUpgradeClusterDefaulter(machineHealthCheckDefaulter)
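---

Usage sketch (not part of the diff; names and values below are illustrative,
but the field paths follow the CRD changes above). maxUnhealthy accepts an
absolute count or a percentage, can be set once under spec.machineHealthCheck,
and can be overridden per control plane or per worker node group:

  apiVersion: anywhere.eks.amazonaws.com/v1alpha1
  kind: Cluster
  metadata:
    name: example-cluster
  spec:
    machineHealthCheck:
      maxUnhealthy: "100%"
      nodeStartupTimeout: 10m0s
      unhealthyMachineTimeout: 5m0s
    controlPlaneConfiguration:
      machineHealthCheck:
        maxUnhealthy: "100%"
    workerNodeGroupConfigurations:
    - name: md-0
      machineHealthCheck:
        maxUnhealthy: "40%"

The CLI default can likewise be overridden at create or upgrade time with the
new flag, e.g. eksctl anywhere create cluster -f cluster.yaml --max-unhealthy 40%.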