diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index e32a165e2495..2eab63749228 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -361,11 +361,6 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana return nil } - annotatedMachine := unhealthyMachines.Filter(collections.HasAnnotationKey(clusterv1.RemediateMachineAnnotation)).Oldest() - if annotatedMachine != nil { - return annotatedMachine - } - machinesToBeRemediated := unhealthyMachines.UnsortedList() if len(machinesToBeRemediated) == 1 { return machinesToBeRemediated[0] @@ -379,6 +374,14 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana // pickMachineToBeRemediated returns true if machine i should be remediated before machine j. func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool { + // If one machine has the RemediateMachineAnnotation annotation, remediate first. + if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) { + return true + } + if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) { + return false + } + // if one machine does not have a node ref, we assume that provisioning failed and there is no CP components at all, // so remediate first; also without a node, it is not possible to get further info about status. if i.Status.NodeRef == nil && j.Status.NodeRef != nil { diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index b25a47e2e6be..970e37c4b44d 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -42,7 +42,7 @@ import ( ) func TestGetMachineToBeRemediated(t *testing.T) { - t.Run("returns the oldest machine with RemediateMachineAnnotation even if there are provisioning machines", func(t *testing.T) { + t.Run("returns the machine with RemediateMachineAnnotation first", func(t *testing.T) { g := NewWithT(t) ns, err := env.CreateNamespace(ctx, "ns1") @@ -52,37 +52,13 @@ func TestGetMachineToBeRemediated(t *testing.T) { }() m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed()) - m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef()) - m4 := createMachine(ctx, g, ns.Name, "m4-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef()) - - m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - - unhealthyMachines := collections.FromMachines(m2, m1, m3, m4) - - g.Expect(getMachineToBeRemediated(unhealthyMachines, false).Name).To(HavePrefix("m1-unhealthy-")) - }) - - t.Run("returns the oldest machine with RemediateMachineAnnotation if there are no provisioning machines", func(t *testing.T) { - g := NewWithT(t) - - ns, err := env.CreateNamespace(ctx, "ns1") - g.Expect(err).ToNot(HaveOccurred()) - defer func() { - g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) - }() - - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed()) - m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef(), withUnhealthyEtcdMember(), withUnhealthyAPIServerPod()) m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - unhealthyMachines := collections.FromMachines(m2, m1, m3) + unhealthyMachines := collections.FromMachines(m2, m1) - g.Expect(getMachineToBeRemediated(unhealthyMachines, false).Name).To(HavePrefix("m1-unhealthy-")) + g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m1-unhealthy-")) }) t.Run("returns provisioning machines first", func(t *testing.T) { diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 8ea58d752566..314e3966062c 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -54,6 +54,7 @@ import ( "sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" @@ -1425,9 +1426,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( // Sort the machines from newest to oldest. // We are trying to remediate machines failing to come up first because // there is a chance that they are not hosting any workloads (minimize disruption). - sort.SliceStable(machinesToRemediate, func(i, j int) bool { - return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time) - }) + sortMachinesToRemediate(machinesToRemediate) // Check if we should limit the in flight operations. if len(machinesToRemediate) > maxInFlight { @@ -1604,3 +1603,23 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu return false, patchHelper.Patch(ctx, obj) } + +// Returns the machines to be remediated in the following order +// - Machines with RemediateMachineAnnotation annotation if any, +// - Machines failing to come up first because +// there is a chance that they are not hosting any workloads (minimize disruption). +func sortMachinesToRemediate(machines []*clusterv1.Machine) { + sort.SliceStable(machines, func(i, j int) bool { + if annotations.HasRemediateMachine(machines[i]) && !annotations.HasRemediateMachine(machines[j]) { + return true + } + if !annotations.HasRemediateMachine(machines[i]) && annotations.HasRemediateMachine(machines[j]) { + return false + } + // Use newest (and Name) as a tie-breaker criteria. + if machines[i].CreationTimestamp.Equal(&machines[j].CreationTimestamp) { + return machines[i].Name < machines[j].Name + } + return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time) + }) +} diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index 1b06b3458c1a..a587c82bb005 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -3047,17 +3047,21 @@ func TestSortMachinesToRemediate(t *testing.T) { t.Run("remediation machines should be sorted with newest first", func(t *testing.T) { g := NewWithT(t) - machines := unhealthyMachines + machines := make([]*clusterv1.Machine, len(unhealthyMachines)) + copy(machines, unhealthyMachines) sortMachinesToRemediate(machines) sort.SliceStable(unhealthyMachines, func(i, j int) bool { - return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time) + return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time) }) g.Expect(unhealthyMachines).To(Equal(machines)) }) t.Run("remediation machines with annotation should be prioritised over other machines", func(t *testing.T) { g := NewWithT(t) - machines := append(unhealthyMachines, unhealthyMachinesWithAnnotations...) + + machines := make([]*clusterv1.Machine, len(unhealthyMachines)) + copy(machines, unhealthyMachines) + machines = append(machines, unhealthyMachinesWithAnnotations...) sortMachinesToRemediate(machines) sort.SliceStable(unhealthyMachines, func(i, j int) bool {