Rollout new nodes for OSImageURL change on spec without changing K8s version #8656

Merged 3 commits on Sep 4, 2024
23 changes: 17 additions & 6 deletions pkg/providers/tinkerbell/assert.go
@@ -231,7 +231,7 @@ func MinimumHardwareAvailableAssertionForCreate(catalogue *hardware.Catalogue) C

// Build a set of required hardware counts per machine group. minimumHardwareRequirements
// will account for the same selector being specified on different groups.
requirements := minimumHardwareRequirements{}
requirements := MinimumHardwareRequirements{}

err := requirements.Add(
spec.ControlPlaneMachineConfig().Spec.HardwareSelector,
@@ -380,7 +380,7 @@ func AssertionsForScaleUpDown(catalogue *hardware.Catalogue, current Validatable
}
// Build a set of required hardware counts per machine group. minimumHardwareRequirements
// will account for the same selector being specified on different groups.
requirements := minimumHardwareRequirements{}
requirements := MinimumHardwareRequirements{}

if current.ControlPlaneReplicaCount() != spec.Cluster.Spec.ControlPlaneConfiguration.Count {
if rollingUpgrade {
@@ -452,10 +452,10 @@ func ExtraHardwareAvailableAssertionForRollingUpgrade(catalogue *hardware.Catalo

// Build a set of required hardware counts per machine group. minimumHardwareRequirements
// will account for the same selector being specified on different groups.
requirements := minimumHardwareRequirements{}
requirements := MinimumHardwareRequirements{}

if spec.Cluster.Spec.KubernetesVersion != current.ClusterK8sVersion() || eksaVersionUpgrade {
if err := ensureCPHardwareAvailability(spec, current, requirements); err != nil {
if err := ensureCPHardwareAvailability(spec, requirements); err != nil {
return err
}
}
@@ -475,7 +475,7 @@ func ExtraHardwareAvailableAssertionForRollingUpgrade(catalogue *hardware.Catalo
}
}

func ensureCPHardwareAvailability(spec *ClusterSpec, current ValidatableCluster, hwReq minimumHardwareRequirements) error {
func ensureCPHardwareAvailability(spec *ClusterSpec, hwReq MinimumHardwareRequirements) error {
maxSurge := 1

rolloutStrategy := spec.Cluster.Spec.ControlPlaneConfiguration.UpgradeRolloutStrategy
@@ -492,7 +492,7 @@ func ensureCPHardwareAvailability(spec *ClusterSpec, current ValidatableCluster,
return nil
}

func ensureWorkerHardwareAvailability(spec *ClusterSpec, current ValidatableCluster, hwReq minimumHardwareRequirements, eksaVersionUpgrade bool) error {
func ensureWorkerHardwareAvailability(spec *ClusterSpec, current ValidatableCluster, hwReq MinimumHardwareRequirements, eksaVersionUpgrade bool) error {
currentWngK8sversion := current.WorkerNodeGroupK8sVersion()
desiredWngK8sVersion := WorkerNodeGroupWithK8sVersion(spec.Spec)
for _, nodeGroup := range spec.WorkerNodeGroupConfigurations() {
@@ -543,6 +543,17 @@ func ensureHardwareSelectorsSpecified(spec *ClusterSpec) error {
return nil
}

// ExtraHardwareAvailableAssertionForNodeRollOut asserts that the catalogue has sufficient hardware to meet the minimum requirements
// and is component agnostic between control plane and worker nodes.
func ExtraHardwareAvailableAssertionForNodeRollOut(catalogue *hardware.Catalogue, hwReq MinimumHardwareRequirements) ClusterSpecAssertion {
return func(_ *ClusterSpec) error {
if err := validateMinimumHardwareRequirements(hwReq, catalogue); err != nil {
return fmt.Errorf("for node rollout, %v", err)
}
return nil
}
}

type missingHardwareSelectorErr struct {
Name string
}
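The newly exported MinimumHardwareRequirements type and the node-rollout assertion above can now be composed by callers outside the package, as the reconciler changes later in this PR do. Below is a minimal caller-side sketch of that usage, assuming the module path github.com/aws/eks-anywhere; the helper name assertRolloutCapacity, the "type": "cp" selector, and the surge count of 1 are illustrative and not part of the change.

package example

import (
	"github.com/aws/eks-anywhere/pkg/providers/tinkerbell"
	"github.com/aws/eks-anywhere/pkg/providers/tinkerbell/hardware"
)

// assertRolloutCapacity is a hypothetical caller-side helper: it builds the exported
// MinimumHardwareRequirements for a single surge node matching the given selector and
// checks the catalogue with ExtraHardwareAvailableAssertionForNodeRollOut.
func assertRolloutCapacity(catalogue *hardware.Catalogue, spec *tinkerbell.ClusterSpec) error {
	requirements := tinkerbell.MinimumHardwareRequirements{}
	if err := requirements.Add(map[string]string{"type": "cp"}, 1); err != nil {
		return err
	}

	assertion := tinkerbell.ExtraHardwareAvailableAssertionForNodeRollOut(catalogue, requirements)
	return assertion(spec)
}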
42 changes: 42 additions & 0 deletions pkg/providers/tinkerbell/assert_test.go
@@ -1127,6 +1127,48 @@ func TestAssertAutoScalerDisabledForInPlace(t *testing.T) {
g.Expect(tinkerbell.AssertAutoScalerDisabledForInPlace(clusterSpec)).To(gomega.MatchError(gomega.ContainSubstring("austoscaler configuration not supported with InPlace")))
}

func TestAssertExtraHardwareAvailableAssertionForNodeRollOutSuccess(t *testing.T) {
g := gomega.NewWithT(t)
clusterSpec := NewDefaultValidClusterSpecBuilder().Build()
req := tinkerbell.MinimumHardwareRequirements{}
err := req.Add(map[string]string{"type": "cp"}, 1)
if err != nil {
t.Fatal(err)
}

catalogue := hardware.NewCatalogue()
// Add something for the control plane.
g.Expect(catalogue.InsertHardware(&v1alpha1.Hardware{
ObjectMeta: v1.ObjectMeta{
Labels: clusterSpec.ControlPlaneMachineConfig().Spec.HardwareSelector,
},
})).To(gomega.Succeed())

assertion := tinkerbell.ExtraHardwareAvailableAssertionForNodeRollOut(catalogue, req)
g.Expect(assertion(clusterSpec)).To(gomega.Succeed())
}

func TestAssertExtraHardwareAvailableAssertionForNodeRollOutError(t *testing.T) {
g := gomega.NewWithT(t)
clusterSpec := NewDefaultValidClusterSpecBuilder().Build()
req := tinkerbell.MinimumHardwareRequirements{}
err := req.Add(map[string]string{"type": "cp"}, 2)
if err != nil {
t.Fatal(err)
}

catalogue := hardware.NewCatalogue()
// Add something for the control plane.
g.Expect(catalogue.InsertHardware(&v1alpha1.Hardware{
ObjectMeta: v1.ObjectMeta{
Labels: clusterSpec.ControlPlaneMachineConfig().Spec.HardwareSelector,
},
})).To(gomega.Succeed())

assertion := tinkerbell.ExtraHardwareAvailableAssertionForNodeRollOut(catalogue, req)
g.Expect(assertion(clusterSpec)).To(gomega.MatchError(gomega.ContainSubstring("minimum hardware count not met for selector '{\"type\":\"cp\"}'")))
}

// mergeHardwareSelectors merges m1 with m2. Values already in m1 will be overwritten by m2.
func mergeHardwareSelectors(m1, m2 map[string]string) map[string]string {
for name, value := range m2 {
131 changes: 81 additions & 50 deletions pkg/providers/tinkerbell/reconciler/reconciler.go
@@ -3,7 +3,7 @@
import (
"context"
"fmt"
"reflect"
"strings"

"github.com/go-logr/logr"
"github.com/pkg/errors"
@@ -155,11 +155,6 @@
}
tinkerbellScope.Workers = w

err = r.omitTinkerbellMachineTemplates(ctx, tinkerbellScope)
if err != nil {
return controller.Result{}, err
}

return controller.Result{}, nil
}

@@ -196,50 +191,6 @@
return NoChange, nil
}

func (r *Reconciler) omitTinkerbellMachineTemplates(ctx context.Context, tinkerbellScope *Scope) error { //nolint:gocyclo
currentKCP, err := controller.GetKubeadmControlPlane(ctx, r.client, tinkerbellScope.ClusterSpec.Cluster)
if err != nil {
return errors.Wrap(err, "failed to get kubeadmcontrolplane")
}

if currentKCP == nil || currentKCP.Spec.Version != tinkerbellScope.ControlPlane.KubeadmControlPlane.Spec.Version {
return nil
}

cpMachineTemplate, err := tinkerbell.GetMachineTemplate(ctx, clientutil.NewKubeClient(r.client), currentKCP.Spec.MachineTemplate.InfrastructureRef.Name, currentKCP.GetNamespace())
if err != nil && !apierrors.IsNotFound(err) {
return errors.Wrap(err, "failed to get controlplane machinetemplate")
}

if cpMachineTemplate != nil {
tinkerbellScope.ControlPlane.ControlPlaneMachineTemplate = nil
tinkerbellScope.ControlPlane.KubeadmControlPlane.Spec.MachineTemplate.InfrastructureRef.Name = cpMachineTemplate.GetName()
}

for i, wg := range tinkerbellScope.Workers.Groups {
machineDeployment, err := controller.GetMachineDeployment(ctx, r.client, wg.MachineDeployment.GetName())
if err != nil {
return errors.Wrap(err, "failed to get workernode group machinedeployment")
}
if machineDeployment == nil ||
!reflect.DeepEqual(machineDeployment.Spec.Template.Spec.Version, tinkerbellScope.Workers.Groups[i].MachineDeployment.Spec.Template.Spec.Version) {
continue
}

workerMachineTemplate, err := tinkerbell.GetMachineTemplate(ctx, clientutil.NewKubeClient(r.client), machineDeployment.Spec.Template.Spec.InfrastructureRef.Name, machineDeployment.GetNamespace())
if err != nil && !apierrors.IsNotFound(err) {
return errors.Wrap(err, "failed to get workernode group machinetemplate")
}

if workerMachineTemplate != nil {
tinkerbellScope.Workers.Groups[i].ProviderMachineTemplate = nil
tinkerbellScope.Workers.Groups[i].MachineDeployment.Spec.Template.Spec.InfrastructureRef.Name = workerMachineTemplate.GetName()
}
}

return nil
}

// ReconcileControlPlane applies the control plane CAPI objects to the cluster.
func (r *Reconciler) ReconcileControlPlane(ctx context.Context, log logr.Logger, tinkerbellScope *Scope) (controller.Result, error) {
log = log.WithValues("phase", "reconcileControlPlane")
@@ -378,6 +329,24 @@
return controller.Result{}, err
}
v.Register(tinkerbell.AssertionsForScaleUpDown(kubeReader.GetCatalogue(), validatableCAPI, false))

hardwareReq, err := r.validateHardwareReqForKCP(validatableCAPI, tinkerbellScope)
if err != nil {
return controller.Result{}, err
}

workerHardwareReq, err := r.validateHardwareReqForMachineDeployments(ctx, tinkerbellScope)
if err != nil {
return controller.Result{}, err
}

// Hardware selectors for control plane and worker nodes are mutually exclusive, so it's safe to copy
// as no keys are going to be overwritten.
for k, v := range workerHardwareReq {
hardwareReq[k] = v
}
v.Register(tinkerbell.ExtraHardwareAvailableAssertionForNodeRollOut(kubeReader.GetCatalogue(), hardwareReq))

}

tinkClusterSpec := tinkerbell.NewClusterSpec(
@@ -423,6 +392,68 @@
return validatableCAPI, nil
}

// validateHardwareReqForKCP returns the minimum hardware requirements for the KCP to roll out new control plane nodes.
// CAPI rolls out a new control plane node whenever the associated MachineTemplate changes in the KCP object;
// there will be no rollout if the template stays the same.
func (r *Reconciler) validateHardwareReqForKCP(validatableCAPI *tinkerbell.ValidatableTinkerbellCAPI, tinkerbellScope *Scope) (tinkerbell.MinimumHardwareRequirements, error) {
currentKCP := validatableCAPI.KubeadmControlPlane
newKCP := tinkerbellScope.ControlPlane.KubeadmControlPlane
tinkerbellClusterSpec := tinkerbell.NewClusterSpec(tinkerbellScope.ClusterSpec, tinkerbellScope.ClusterSpec.TinkerbellMachineConfigs, tinkerbellScope.ClusterSpec.TinkerbellDatacenter)
maxSurge := 1
requirements := tinkerbell.MinimumHardwareRequirements{}
if currentKCP.Spec.MachineTemplate.InfrastructureRef.Name != newKCP.Spec.MachineTemplate.InfrastructureRef.Name {
upgradeStrategy := tinkerbellScope.ClusterSpec.Cluster.Spec.ControlPlaneConfiguration.UpgradeRolloutStrategy
if upgradeStrategy != nil && upgradeStrategy.Type == anywherev1.RollingUpdateStrategyType {
maxSurge = upgradeStrategy.RollingUpdate.MaxSurge
}
if err := requirements.Add(tinkerbellClusterSpec.ControlPlaneMachineConfig().Spec.HardwareSelector, maxSurge); err != nil {
return nil, err
}
}
return requirements, nil
}

// validateHardwareReqForMachineDeployments returns the minimum hardware requirements for the MachineDeployments to roll out new worker nodes.
// CAPI rolls out a new worker node only when the associated MachineTemplate changes in the MachineDeployment object.
// A single cluster can have multiple MachineDeployment objects, and in the case of modular upgrades
// only a few of those worker groups might need a rollout.
func (r *Reconciler) validateHardwareReqForMachineDeployments(ctx context.Context, tinkerbellScope *Scope) (requirements tinkerbell.MinimumHardwareRequirements, err error) {
newWorkers := tinkerbellScope.Workers

tinkerbellClusterSpec := tinkerbell.NewClusterSpec(tinkerbellScope.ClusterSpec, tinkerbellScope.ClusterSpec.TinkerbellMachineConfigs, tinkerbellScope.ClusterSpec.TinkerbellDatacenter)
requirements = tinkerbell.MinimumHardwareRequirements{}
for _, wg := range newWorkers.Groups {
maxSurge := 1
currentMachineDeployment, err := controller.GetMachineDeployment(ctx, r.client, wg.MachineDeployment.GetName())
if err != nil {
return nil, errors.Wrap(err, "failed to get workernode group machinedeployment")
}
clusterName := tinkerbellClusterSpec.Cluster.Name

// EKS-A names a MachineDeployment with the cluster name as the prefix, followed by the WorkerNodeGroup name, joined by '-'.
// We just need the WorkerNodeGroup name to fetch the corresponding WorkerNodeGroup config from the spec.
workerNodeGroupName := strings.ReplaceAll(wg.MachineDeployment.GetName(), clusterName, "")[1:]
var workerNodeGroup anywherev1.WorkerNodeGroupConfiguration
for _, wng := range tinkerbellClusterSpec.WorkerNodeGroupConfigurations() {
if wng.Name == workerNodeGroupName {
workerNodeGroup = wng
break
}
}
if currentMachineDeployment != nil && currentMachineDeployment.Spec.Template.Spec.InfrastructureRef.Name != wg.MachineDeployment.Spec.Template.Spec.InfrastructureRef.Name {
upgradeStrategy := wg.MachineDeployment.Spec.Strategy
if upgradeStrategy != nil && upgradeStrategy.Type == clusterv1.RollingUpdateMachineDeploymentStrategyType {
maxSurge = int(upgradeStrategy.RollingUpdate.MaxSurge.IntVal)
}
if err := requirements.Add(tinkerbellClusterSpec.WorkerNodeGroupMachineConfig(workerNodeGroup).Spec.HardwareSelector, maxSurge); err != nil {
return nil, err
}
}
}

return requirements, nil
}
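For reference, the worker-node-group lookup above relies on the MachineDeployment naming convention described in the comment: the cluster name, a '-', then the worker node group name. A minimal sketch of that derivation is below; the function name and the example names in the comments are hypothetical, not taken from the PR.

package example

import "strings"

// workerNodeGroupNameFromMachineDeployment strips the cluster-name prefix (and the joining '-')
// from a MachineDeployment name, mirroring the lookup in validateHardwareReqForMachineDeployments.
// For instance, with hypothetical names machineDeploymentName = "mgmt-cluster-md-0" and
// clusterName = "mgmt-cluster", strings.ReplaceAll yields "-md-0" and [1:] drops the leading '-'
// to give "md-0".
func workerNodeGroupNameFromMachineDeployment(machineDeploymentName, clusterName string) string {
	return strings.ReplaceAll(machineDeploymentName, clusterName, "")[1:]
}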

// ValidateRufioMachines checks to ensure all the Rufio machines condition contactable is True.
func (r *Reconciler) ValidateRufioMachines(ctx context.Context, log logr.Logger, tinkerbellScope *Scope) (controller.Result, error) {
clusterSpec := tinkerbellScope.ClusterSpec