Skip to content

Commit

Permalink
Make maxUnhealthy count configurable for control plane and worker machines
Browse files Browse the repository at this point in the history
  • Loading branch information
abhay-krishna committed Jan 16, 2024
1 parent 9bb82e1 commit 0e30882
Show file tree
Hide file tree
Showing 21 changed files with 433 additions and 38 deletions.
1 change: 1 addition & 0 deletions cmd/eksctl-anywhere/cmd/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ const (
unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout"
nodeStartupTimeoutFlag = "node-startup-timeout"
noTimeoutsFlag = "no-timeouts"
maxUnhealthyFlag = "max-unhealthy"
)

type Operation int
Expand Down
3 changes: 3 additions & 0 deletions cmd/eksctl-anywhere/cmd/createcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
"github.com/aws/eks-anywhere/pkg/awsiamauth"
"github.com/aws/eks-anywhere/pkg/clustermanager"
"github.com/aws/eks-anywhere/pkg/constants"
"github.com/aws/eks-anywhere/pkg/dependencies"
"github.com/aws/eks-anywhere/pkg/executables"
"github.com/aws/eks-anywhere/pkg/features"
Expand All @@ -29,6 +30,7 @@ import (
type createClusterOptions struct {
clusterOptions
timeoutOptions
maxUnhealthy string
forceClean bool
skipIpCheck bool
hardwareCSVPath string
Expand Down Expand Up @@ -61,6 +63,7 @@ func init() {
createCmd.AddCommand(createClusterCmd)
applyClusterOptionFlags(createClusterCmd.Flags(), &cc.clusterOptions)
applyTimeoutFlags(createClusterCmd.Flags(), &cc.timeoutOptions)
createClusterCmd.Flags().StringVar(&cc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage")
applyTinkerbellHardwareFlag(createClusterCmd.Flags(), &cc.hardwareCSVPath)
aflag.String(aflag.TinkerbellBootstrapIP, &cc.tinkerbellBootstrapIP, createClusterCmd.Flags())
createClusterCmd.Flags().BoolVar(&cc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
Expand Down
3 changes: 3 additions & 0 deletions cmd/eksctl-anywhere/cmd/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/spf13/pflag"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
"github.com/aws/eks-anywhere/pkg/cluster"
Expand Down Expand Up @@ -195,6 +196,7 @@ func buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC

createCliConfig.NodeStartupTimeout = nodeStartupTimeout
createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
createCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy)

Check warning on line 199 in cmd/eksctl-anywhere/cmd/options.go

View check run for this annotation

Codecov / codecov/patch

cmd/eksctl-anywhere/cmd/options.go#L199

Added line #L199 was not covered by tests

return createCliConfig, nil
}
Expand All @@ -221,6 +223,7 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra

upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout
upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout
upgradeCliConfig.MaxUnhealthy = intstr.Parse(clusterOptions.maxUnhealthy)

Check warning on line 226 in cmd/eksctl-anywhere/cmd/options.go

View check run for this annotation

Codecov / codecov/patch

cmd/eksctl-anywhere/cmd/options.go#L226

Added line #L226 was not covered by tests

return &upgradeCliConfig, nil
}
Expand Down
4 changes: 3 additions & 1 deletion cmd/eksctl-anywhere/cmd/upgradecluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/aws/eks-anywhere/cmd/eksctl-anywhere/cmd/aflag"
"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
"github.com/aws/eks-anywhere/pkg/constants"
"github.com/aws/eks-anywhere/pkg/dependencies"
"github.com/aws/eks-anywhere/pkg/features"
"github.com/aws/eks-anywhere/pkg/kubeconfig"
Expand All @@ -26,6 +27,7 @@ import (
type upgradeClusterOptions struct {
clusterOptions
timeoutOptions
maxUnhealthy string
wConfig string
forceClean bool
hardwareCSVPath string
Expand Down Expand Up @@ -68,12 +70,12 @@ func init() {
upgradeCmd.AddCommand(upgradeClusterCmd)
applyClusterOptionFlags(upgradeClusterCmd.Flags(), &uc.clusterOptions)
applyTimeoutFlags(upgradeClusterCmd.Flags(), &uc.timeoutOptions)
upgradeClusterCmd.Flags().StringVar(&uc.maxUnhealthy, maxUnhealthyFlag, constants.DefaultMaxUnhealthy, "Override the default maxUnhealthy count or percentage")
applyTinkerbellHardwareFlag(upgradeClusterCmd.Flags(), &uc.hardwareCSVPath)
upgradeClusterCmd.Flags().StringVarP(&uc.wConfig, "w-config", "w", "", "Kubeconfig file to use when upgrading a workload cluster")
upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
hideForceCleanup(upgradeClusterCmd.Flags())
upgradeClusterCmd.Flags().StringArrayVar(&uc.skipValidations, "skip-validations", []string{}, fmt.Sprintf("Bypass upgrade validations by name. Valid arguments you can pass are --skip-validations=%s", strings.Join(upgradevalidations.SkippableValidations[:], ",")))

aflag.MarkRequired(createClusterCmd.Flags(), aflag.ClusterConfig.Name)
tinkerbellFlags(upgradeClusterCmd.Flags(), uc.providerOptions.Tinkerbell.BMCOptions.RPC)
}
Expand Down
82 changes: 82 additions & 0 deletions config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,41 @@ spec:
name:
type: string
type: object
machineHealthCheck:
description: MachineHealthCheck is a control-plane level override
for the timeouts and maxUnhealthy specified in the top-level
MHC configuration. If not configured, the defaults in the top-level
MHC configuration are used.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum
number of unhealthy machines in machine health checks. This
setting applies to both control plane and worker machines.
If the number of unhealthy machines exceeds the limit set
by maxUnhealthy, further remediation will not be performed.
If not configured, the default value is set to "100%".
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines
how long a MachineHealthCheck should wait for a Node to
join the cluster, before considering a Machine unhealthy.
If not configured, the default value is set to "10m0s" (10
minutes) for all providers. For Tinkerbell provider the
default is "20m0s".
type: string
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure
the unhealthy machine timeout in machine health checks.
If any unhealthy conditions are met for the amount of time
specified as the timeout, the machines are considered unhealthy.
If not configured, the default value is set to "5m0s" (5
minutes).
type: string
type: object
skipLoadBalancerDeployment:
description: SkipLoadBalancerDeployment skip deploying control
plane load balancer. Make sure your infrastructure can handle
Expand Down Expand Up @@ -342,6 +377,17 @@ spec:
to wait to remediate unhealthy machine or determine health of nodes'
machines.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum number
of unhealthy machines in machine health checks. This setting
applies to both control plane and worker machines. If the number
of unhealthy machines exceeds the limit set by maxUnhealthy,
further remediation will not be performed. If not configured,
the default value is set to "100%".
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines how
Expand Down Expand Up @@ -534,6 +580,42 @@ spec:
name:
type: string
type: object
machineHealthCheck:
description: MachineHealthCheck is a worker node group level override
for the timeouts and maxUnhealthy specified in the top-level
MHC configuration. If not configured, the defaults in the
top-level MHC configuration are used.
properties:
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the maximum
number of unhealthy machines in machine health checks.
This setting applies to both control plane and worker
machines. If the number of unhealthy machines exceeds
the limit set by maxUnhealthy, further remediation will
not be performed. If not configured, the default value
is set to "100%".
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the
node startup timeout in machine health checks. It determines
how long a MachineHealthCheck should wait for a Node to
join the cluster, before considering a Machine unhealthy.
If not configured, the default value is set to "10m0s"
(10 minutes) for all providers. For Tinkerbell provider
the default is "20m0s".
type: string
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure
the unhealthy machine timeout in machine health checks.
If any unhealthy conditions are met for the amount of
time specified as the timeout, the machines are considered
unhealthy. If not configured, the default value is set
to "5m0s" (5 minutes).
type: string
type: object
name:
description: Name refers to the name of the worker node group
type: string
Expand Down
2 changes: 0 additions & 2 deletions config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ spec:
nodes and ignored for worker nodes.
type: boolean
kubernetesVersion:
description: 'TODO(in-place): Determine if there''s a way to get these
dynamically instead of expecting it from the CRD.'
type: string
machine:
description: Machine is a reference to the CAPI Machine that needs
Expand Down
Loading

0 comments on commit 0e30882

Please sign in to comment.