Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Prioritize Machine with remediate-machine anotation when selecting the next machine to be remediated #11495

Merged
merged 4 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ const (
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
// RemediateMachineAnnotation request the MachineHealthCheck reconciler to mark a Machine as unhealthy. CAPI builtin remediation will prioritize Machines with the annotation to be remediated.
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"

// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
Expand Down
9 changes: 9 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// Gets the machine to be remediated, which is the "most broken" among the unhealthy machines, determined as the machine
// having the highest priority issue that other machines have not.
// The following issues are considered (from highest to lowest priority):
// - machine with RemediateMachineAnnotation annotation
// - machine without .status.nodeRef
// - machine with etcd issue or etcd status unknown (etcd member, etcd pod)
// - machine with control plane component issue or status unknown (API server, controller manager, scheduler)
Expand All @@ -373,6 +374,14 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana

// pickMachineToBeRemediated returns true if machine i should be remediated before machine j.
func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool {
// If one machine has the RemediateMachineAnnotation annotation, remediate first.
if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) {
return true
}
if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) {
return false
}

// if one machine does not have a node ref, we assume that provisioning failed and there is no CP components at all,
// so remediate first; also without a node, it is not possible to get further info about status.
if i.Status.NodeRef == nil && j.Status.NodeRef != nil {
Expand Down
19 changes: 19 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,25 @@ import (
)

func TestGetMachineToBeRemediated(t *testing.T) {
t.Run("returns the machine with RemediateMachineAnnotation first", func(t *testing.T) {
g := NewWithT(t)

ns, err := env.CreateNamespace(ctx, "ns1")
g.Expect(err).ToNot(HaveOccurred())
defer func() {
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
}()

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef(), withUnhealthyEtcdMember(), withUnhealthyAPIServerPod())

m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})

unhealthyMachines := collections.FromMachines(m2, m1)

g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m1-unhealthy-"))
})

t.Run("returns provisioning machines first", func(t *testing.T) {
g := NewWithT(t)

Expand Down
25 changes: 22 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ import (
"sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil"
"sigs.k8s.io/cluster-api/internal/util/ssa"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
Expand Down Expand Up @@ -1425,9 +1426,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
// Sort the machines from newest to oldest.
// We are trying to remediate machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
sort.SliceStable(machinesToRemediate, func(i, j int) bool {
return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time)
})
sortMachinesToRemediate(machinesToRemediate)

// Check if we should limit the in flight operations.
if len(machinesToRemediate) > maxInFlight {
Expand Down Expand Up @@ -1604,3 +1603,23 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu

return false, patchHelper.Patch(ctx, obj)
}

// Returns the machines to be remediated in the following order
// - Machines with RemediateMachineAnnotation annotation if any,
// - Machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
func sortMachinesToRemediate(machines []*clusterv1.Machine) {
sort.SliceStable(machines, func(i, j int) bool {
if annotations.HasRemediateMachine(machines[i]) && !annotations.HasRemediateMachine(machines[j]) {
return true
}
if !annotations.HasRemediateMachine(machines[i]) && annotations.HasRemediateMachine(machines[j]) {
return false
}
// Use newest (and Name) as a tie-breaker criteria.
if machines[i].CreationTimestamp.Equal(&machines[j].CreationTimestamp) {
return machines[i].Name < machines[j].Name
}
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
sbueringer marked this conversation as resolved.
Show resolved Hide resolved
})
}
112 changes: 112 additions & 0 deletions internal/controllers/machineset/machineset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package machineset
import (
"context"
"fmt"
"sort"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -2961,3 +2962,114 @@ func TestNewMachineUpToDateCondition(t *testing.T) {
})
}
}

func TestSortMachinesToRemediate(t *testing.T) {
unhealthyMachinesWithAnnotations := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachinesWithAnnotations = append(unhealthyMachinesWithAnnotations, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-annotated-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
Annotations: map[string]string{
clusterv1.RemediateMachineAnnotation: "",
},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

unhealthyMachines := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachines = append(unhealthyMachines, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

t.Run("remediation machines should be sorted with newest first", func(t *testing.T) {
g := NewWithT(t)
machines := make([]*clusterv1.Machine, len(unhealthyMachines))
copy(machines, unhealthyMachines)
sortMachinesToRemediate(machines)
sbueringer marked this conversation as resolved.
Show resolved Hide resolved
sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
})
g.Expect(unhealthyMachines).To(Equal(machines))
})

t.Run("remediation machines with annotation should be prioritised over other machines", func(t *testing.T) {
g := NewWithT(t)

machines := make([]*clusterv1.Machine, len(unhealthyMachines))
copy(machines, unhealthyMachines)
machines = append(machines, unhealthyMachinesWithAnnotations...)
sortMachinesToRemediate(machines)

sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
})
sort.SliceStable(unhealthyMachinesWithAnnotations, func(i, j int) bool {
return unhealthyMachinesWithAnnotations[i].CreationTimestamp.After(unhealthyMachinesWithAnnotations[j].CreationTimestamp.Time)
})
g.Expect(machines).To(Equal(append(unhealthyMachinesWithAnnotations, unhealthyMachines...)))
})
}
Loading