Skip to content

Commit

Permalink
Make maxUnhealthy count configurable for control plane and worker mac…
Browse files Browse the repository at this point in the history
…hines
  • Loading branch information
abhay-krishna committed Feb 15, 2024
1 parent a915a47 commit e540c2e
Show file tree
Hide file tree
Showing 18 changed files with 437 additions and 32 deletions.
5 changes: 5 additions & 0 deletions cmd/eksctl-anywhere/cmd/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/spf13/pflag"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
"github.com/aws/eks-anywhere/pkg/cluster"
Expand Down Expand Up @@ -195,6 +196,8 @@ func buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC

createCliConfig.NodeStartupTimeout = nodeStartupTimeout
createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
createCliConfig.MaxUnhealthy = intstr.Parse(constants.DefaultMaxUnhealthy)
createCliConfig.WorkerMaxUnhealthy = intstr.Parse(constants.DefaultWorkerMaxUnhealthy)

return createCliConfig, nil
}
Expand All @@ -221,6 +224,8 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra

upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout
upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
upgradeCliConfig.MaxUnhealthy = intstr.Parse(constants.DefaultMaxUnhealthy)
upgradeCliConfig.WorkerMaxUnhealthy = intstr.Parse(constants.DefaultWorkerMaxUnhealthy)

return &upgradeCliConfig, nil
}
Expand Down
1 change: 0 additions & 1 deletion cmd/eksctl-anywhere/cmd/upgradecluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ func init() {
upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
hideForceCleanup(upgradeClusterCmd.Flags())
upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ",")))

aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name)
tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC)
}
Expand Down
85 changes: 85 additions & 0 deletions config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,42 @@ spec:
name:
type: string
type: object
machineHealthCheck:
description: MachineHealthCheck is a control-plane level override
for the timeouts and maxUnhealthy specified in the top-level
MHC configuration. If not configured, the defaults in the top-level
MHC configuration are used.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum
number of unhealthy machines in machine health checks. This
setting applies to both control plane and worker machines.
If the number of unhealthy machines exceeds the limit set
by maxUnhealthy, further remediation will not be performed.
If not configured, the default value is set to "100%" for
controlplane machines and "40%" for worker machines.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines
how long a MachineHealthCheck should wait for a Node to
join the cluster, before considering a Machine unhealthy.
If not configured, the default value is set to "10m0s" (10
minutes) for all providers. For Tinkerbell provider the
default is "20m0s".
type: string
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure
the unhealthy machine timeout in machine health checks.
If any unhealthy conditions are met for the amount of time
specified as the timeout, the machines are considered unhealthy.
If not configured, the default value is set to "5m0s" (5
minutes).
type: string
type: object
skipLoadBalancerDeployment:
description: SkipLoadBalancerDeployment skip deploying control
plane load balancer. Make sure your infrastructure can handle
Expand Down Expand Up @@ -344,6 +380,18 @@ spec:
to wait to remediate unhealthy machine or determine health of nodes'
machines.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum number
of unhealthy machines in machine health checks. This setting
applies to both control plane and worker machines. If the number
of unhealthy machines exceeds the limit set by maxUnhealthy,
further remediation will not be performed. If not configured,
the default value is set to "100%" for controlplane machines
and "40%" for worker machines.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines how
Expand Down Expand Up @@ -536,6 +584,43 @@ spec:
name:
type: string
type: object
machineHealthCheck:
description: MachineHealthCheck is a worker node level override
for the timeouts and maxUnhealthy specified in the top-level
MHC configuration. If not configured, the defaults in the
top-level MHC configuration are used.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum
number of unhealthy machines in machine health checks.
This setting applies to both control plane and worker
machines. If the number of unhealthy machines exceeds
the limit set by maxUnhealthy, further remediation will
not be performed. If not configured, the default value
is set to "100%" for controlplane machines and "40%" for
worker machines.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the
node startup timeout in machine health checks. It determines
how long a MachineHealthCheck should wait for a Node to
join the cluster, before considering a Machine unhealthy.
If not configured, the default value is set to "10m0s"
(10 minutes) for all providers. For Tinkerbell provider
the default is "20m0s".
type: string
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure
the unhealthy machine timeout in machine health checks.
If any unhealthy conditions are met for the amount of
time specified as the timeout, the machines are considered
unhealthy. If not configured, the default value is set
to "5m0s" (5 minutes).
type: string
type: object
name:
description: Name refers to the name of the worker node group
type: string
Expand Down
85 changes: 85 additions & 0 deletions config/manifest/eksa-components.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3889,6 +3889,42 @@ spec:
name:
type: string
type: object
machineHealthCheck:
description: MachineHealthCheck is a control-plane level override
for the timeouts and maxUnhealthy specified in the top-level
MHC configuration. If not configured, the defaults in the top-level
MHC configuration are used.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum
number of unhealthy machines in machine health checks. This
setting applies to both control plane and worker machines.
If the number of unhealthy machines exceeds the limit set
by maxUnhealthy, further remediation will not be performed.
If not configured, the default value is set to "100%" for
controlplane machines and "40%" for worker machines.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines
how long a MachineHealthCheck should wait for a Node to
join the cluster, before considering a Machine unhealthy.
If not configured, the default value is set to "10m0s" (10
minutes) for all providers. For Tinkerbell provider the
default is "20m0s".
type: string
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure
the unhealthy machine timeout in machine health checks.
If any unhealthy conditions are met for the amount of time
specified as the timeout, the machines are considered unhealthy.
If not configured, the default value is set to "5m0s" (5
minutes).
type: string
type: object
skipLoadBalancerDeployment:
description: SkipLoadBalancerDeployment skip deploying control
plane load balancer. Make sure your infrastructure can handle
Expand Down Expand Up @@ -4047,6 +4083,18 @@ spec:
to wait to remediate unhealthy machine or determine health of nodes'
machines.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum number
of unhealthy machines in machine health checks. This setting
applies to both control plane and worker machines. If the number
of unhealthy machines exceeds the limit set by maxUnhealthy,
further remediation will not be performed. If not configured,
the default value is set to "100%" for controlplane machines
and "40%" for worker machines.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines how
Expand Down Expand Up @@ -4239,6 +4287,43 @@ spec:
name:
type: string
type: object
machineHealthCheck:
description: MachineHealthCheck is a worker node level override
for the timeouts and maxUnhealthy specified in the top-level
MHC configuration. If not configured, the defaults in the
top-level MHC configuration are used.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum
number of unhealthy machines in machine health checks.
This setting applies to both control plane and worker
machines. If the number of unhealthy machines exceeds
the limit set by maxUnhealthy, further remediation will
not be performed. If not configured, the default value
is set to "100%" for controlplane machines and "40%" for
worker machines.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the
node startup timeout in machine health checks. It determines
how long a MachineHealthCheck should wait for a Node to
join the cluster, before considering a Machine unhealthy.
If not configured, the default value is set to "10m0s"
(10 minutes) for all providers. For Tinkerbell provider
the default is "20m0s".
type: string
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure
the unhealthy machine timeout in machine health checks.
If any unhealthy conditions are met for the amount of
time specified as the timeout, the machines are considered
unhealthy. If not configured, the default value is set
to "5m0s" (5 minutes).
type: string
type: object
name:
description: Name refers to the name of the worker node group
type: string
Expand Down
3 changes: 2 additions & 1 deletion controllers/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/go-logr/logr"
"github.com/google/uuid"
"k8s.io/apimachinery/pkg/util/intstr"
clusterctlv1 "sigs.k8s.io/cluster-api/cmd/clusterctl/api/v1alpha3"
"sigs.k8s.io/cluster-api/controllers/remote"
"sigs.k8s.io/controller-runtime/pkg/manager"
Expand Down Expand Up @@ -579,7 +580,7 @@ func (f *Factory) withMachineHealthCheckReconciler() *Factory {
return nil
}

machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout)
machineHealthCheckDefaulter := anywhereCluster.NewMachineHealthCheckDefaulter(constants.DefaultNodeStartupTimeout, constants.DefaultUnhealthyMachineTimeout, intstr.Parse(constants.DefaultMaxUnhealthy), intstr.Parse(constants.DefaultWorkerMaxUnhealthy))

f.machineHealthCheckReconciler = mhcreconciler.New(
f.manager.GetClient(),
Expand Down
7 changes: 7 additions & 0 deletions pkg/api/v1alpha1/cluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

"github.com/aws/eks-anywhere/pkg/logger"
Expand Down Expand Up @@ -296,6 +297,8 @@ type ControlPlaneConfiguration struct {
// CertSANs is a slice of domain names or IPs to be added as Subject Name Alternatives of the
// Kube API Servers Certificate.
CertSANs []string `json:"certSans,omitempty"`
// MachineHealthCheck is a control-plane level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used.
MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"`
}

// MachineHealthCheck allows configuring timeouts and maxUnhealthy for machine health checks. Machine Health Checks are responsible for remediating unhealthy Machines.
Expand All @@ -305,6 +308,8 @@ type MachineHealthCheck struct {
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
// UnhealthyMachineTimeout is used to configure the unhealthy machine timeout in machine health checks. If any unhealthy conditions are met for the amount of time specified as the timeout, the machines are considered unhealthy. If not configured, the default value is set to "5m0s" (5 minutes).
UnhealthyMachineTimeout *metav1.Duration `json:"unhealthyMachineTimeout,omitempty"`
// MaxUnhealthy is used to configure the maximum number of unhealthy machines in machine health checks. This setting applies to both control plane and worker machines. If the number of unhealthy machines exceeds the limit set by maxUnhealthy, further remediation will not be performed. If not configured, the default value is set to "100%" for controlplane machines and "40%" for worker machines.
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
}

func TaintsSliceEqual(s1, s2 []corev1.Taint) bool {
Expand Down Expand Up @@ -440,6 +445,8 @@ type WorkerNodeGroupConfiguration struct {
UpgradeRolloutStrategy *WorkerNodesUpgradeRolloutStrategy `json:"upgradeRolloutStrategy,omitempty"`
// KubernetesVersion defines the version for worker nodes. If not set, the top level spec kubernetesVersion will be used.
KubernetesVersion *KubernetesVersion `json:"kubernetesVersion,omitempty"`
// MachineHealthCheck is a worker node level override for the timeouts and maxUnhealthy specified in the top-level MHC configuration. If not configured, the defaults in the top-level MHC configuration are used.
MachineHealthCheck *MachineHealthCheck `json:"machineHealthCheck,omitempty"`
}

// Equal compares two WorkerNodeGroupConfigurations.
Expand Down
16 changes: 16 additions & 0 deletions pkg/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e540c2e

Please sign in to comment.