Skip to content

Commit

Permalink
🌱 Prioritize Machine with remediate-machine anotation when selecting …
Browse files Browse the repository at this point in the history
…the next machine to be remediated (#11495)

* Prioritise machine with remidiation anotation for remidiation

* Changes to prioritise remediation for machine set controller

* Update RemediateMachineAnnotation description

* Address review comments
  • Loading branch information
Karthik-K-N authored Jan 23, 2025
1 parent d7bb304 commit a090145
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 4 deletions.
2 changes: 1 addition & 1 deletion api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ const (
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
// RemediateMachineAnnotation request the MachineHealthCheck reconciler to mark a Machine as unhealthy. CAPI builtin remediation will prioritize Machines with the annotation to be remediated.
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"

// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
Expand Down
9 changes: 9 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// Gets the machine to be remediated, which is the "most broken" among the unhealthy machines, determined as the machine
// having the highest priority issue that other machines have not.
// The following issues are considered (from highest to lowest priority):
// - machine with RemediateMachineAnnotation annotation
// - machine without .status.nodeRef
// - machine with etcd issue or etcd status unknown (etcd member, etcd pod)
// - machine with control plane component issue or status unknown (API server, controller manager, scheduler)
Expand All @@ -373,6 +374,14 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana

// pickMachineToBeRemediated returns true if machine i should be remediated before machine j.
func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool {
// If one machine has the RemediateMachineAnnotation annotation, remediate first.
if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) {
return true
}
if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) {
return false
}

// if one machine does not have a node ref, we assume that provisioning failed and there is no CP components at all,
// so remediate first; also without a node, it is not possible to get further info about status.
if i.Status.NodeRef == nil && j.Status.NodeRef != nil {
Expand Down
19 changes: 19 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,25 @@ import (
)

func TestGetMachineToBeRemediated(t *testing.T) {
t.Run("returns the machine with RemediateMachineAnnotation first", func(t *testing.T) {
g := NewWithT(t)

ns, err := env.CreateNamespace(ctx, "ns1")
g.Expect(err).ToNot(HaveOccurred())
defer func() {
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
}()

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef(), withUnhealthyEtcdMember(), withUnhealthyAPIServerPod())

m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})

unhealthyMachines := collections.FromMachines(m2, m1)

g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m1-unhealthy-"))
})

t.Run("returns provisioning machines first", func(t *testing.T) {
g := NewWithT(t)

Expand Down
25 changes: 22 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ import (
topologynames "sigs.k8s.io/cluster-api/internal/topology/names"
"sigs.k8s.io/cluster-api/internal/util/ssa"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
Expand Down Expand Up @@ -1438,9 +1439,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
// Sort the machines from newest to oldest.
// We are trying to remediate machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
sort.SliceStable(machinesToRemediate, func(i, j int) bool {
return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time)
})
sortMachinesToRemediate(machinesToRemediate)

// Check if we should limit the in flight operations.
if len(machinesToRemediate) > maxInFlight {
Expand Down Expand Up @@ -1617,3 +1616,23 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu

return false, patchHelper.Patch(ctx, obj)
}

// Returns the machines to be remediated in the following order
// - Machines with RemediateMachineAnnotation annotation if any,
// - Machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
func sortMachinesToRemediate(machines []*clusterv1.Machine) {
sort.SliceStable(machines, func(i, j int) bool {
if annotations.HasRemediateMachine(machines[i]) && !annotations.HasRemediateMachine(machines[j]) {
return true
}
if !annotations.HasRemediateMachine(machines[i]) && annotations.HasRemediateMachine(machines[j]) {
return false
}
// Use newest (and Name) as a tie-breaker criteria.
if machines[i].CreationTimestamp.Equal(&machines[j].CreationTimestamp) {
return machines[i].Name < machines[j].Name
}
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
})
}
112 changes: 112 additions & 0 deletions internal/controllers/machineset/machineset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package machineset
import (
"context"
"fmt"
"sort"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -3138,3 +3139,114 @@ func TestNewMachineUpToDateCondition(t *testing.T) {
})
}
}

func TestSortMachinesToRemediate(t *testing.T) {
unhealthyMachinesWithAnnotations := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachinesWithAnnotations = append(unhealthyMachinesWithAnnotations, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-annotated-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
Annotations: map[string]string{
clusterv1.RemediateMachineAnnotation: "",
},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

unhealthyMachines := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachines = append(unhealthyMachines, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

t.Run("remediation machines should be sorted with newest first", func(t *testing.T) {
g := NewWithT(t)
machines := make([]*clusterv1.Machine, len(unhealthyMachines))
copy(machines, unhealthyMachines)
sortMachinesToRemediate(machines)
sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
})
g.Expect(unhealthyMachines).To(Equal(machines))
})

t.Run("remediation machines with annotation should be prioritised over other machines", func(t *testing.T) {
g := NewWithT(t)

machines := make([]*clusterv1.Machine, len(unhealthyMachines))
copy(machines, unhealthyMachines)
machines = append(machines, unhealthyMachinesWithAnnotations...)
sortMachinesToRemediate(machines)

sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
})
sort.SliceStable(unhealthyMachinesWithAnnotations, func(i, j int) bool {
return unhealthyMachinesWithAnnotations[i].CreationTimestamp.After(unhealthyMachinesWithAnnotations[j].CreationTimestamp.Time)
})
g.Expect(machines).To(Equal(append(unhealthyMachinesWithAnnotations, unhealthyMachines...)))
})
}

0 comments on commit a090145

Please sign in to comment.