From b8fd7be144f35ea2a895f20d83df73f49e3f3a53 Mon Sep 17 00:00:00 2001 From: Karthik Bhat Date: Wed, 18 Dec 2024 21:43:34 +0530 Subject: [PATCH] Address review comments --- .../internal/controllers/remediation.go | 13 +++++--- .../internal/controllers/remediation_test.go | 32 +++---------------- .../machineset/machineset_controller.go | 21 ++++++++++-- 3 files changed, 30 insertions(+), 36 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index e32a165e2495..2eab63749228 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -361,11 +361,6 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana return nil } - annotatedMachine := unhealthyMachines.Filter(collections.HasAnnotationKey(clusterv1.RemediateMachineAnnotation)).Oldest() - if annotatedMachine != nil { - return annotatedMachine - } - machinesToBeRemediated := unhealthyMachines.UnsortedList() if len(machinesToBeRemediated) == 1 { return machinesToBeRemediated[0] @@ -379,6 +374,14 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana // pickMachineToBeRemediated returns true if machine i should be remediated before machine j. func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool { + // If one machine has the RemediateMachineAnnotation annotation, remediate first. + if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) { + return true + } + if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) { + return false + } + // if one machine does not have a node ref, we assume that provisioning failed and there is no CP components at all, // so remediate first; also without a node, it is not possible to get further info about status. if i.Status.NodeRef == nil && j.Status.NodeRef != nil { diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index b25a47e2e6be..970e37c4b44d 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -42,7 +42,7 @@ import ( ) func TestGetMachineToBeRemediated(t *testing.T) { - t.Run("returns the oldest machine with RemediateMachineAnnotation even if there are provisioning machines", func(t *testing.T) { + t.Run("returns the machine with RemediateMachineAnnotation first", func(t *testing.T) { g := NewWithT(t) ns, err := env.CreateNamespace(ctx, "ns1") @@ -52,37 +52,13 @@ func TestGetMachineToBeRemediated(t *testing.T) { }() m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed()) - m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef()) - m4 := createMachine(ctx, g, ns.Name, "m4-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef()) - - m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - - unhealthyMachines := collections.FromMachines(m2, m1, m3, m4) - - g.Expect(getMachineToBeRemediated(unhealthyMachines, false).Name).To(HavePrefix("m1-unhealthy-")) - }) - - t.Run("returns the oldest machine with RemediateMachineAnnotation if there are no provisioning machines", func(t *testing.T) { - g := NewWithT(t) - - ns, err := env.CreateNamespace(ctx, "ns1") - g.Expect(err).ToNot(HaveOccurred()) - defer func() { - g.Expect(env.Cleanup(ctx, ns)).To(Succeed()) - }() - - m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed()) - m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed()) - m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed()) + m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef(), withUnhealthyEtcdMember(), withUnhealthyAPIServerPod()) m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""}) - unhealthyMachines := collections.FromMachines(m2, m1, m3) + unhealthyMachines := collections.FromMachines(m2, m1) - g.Expect(getMachineToBeRemediated(unhealthyMachines, false).Name).To(HavePrefix("m1-unhealthy-")) + g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m1-unhealthy-")) }) t.Run("returns provisioning machines first", func(t *testing.T) { diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 8ea58d752566..95cbfd5db74a 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -54,6 +54,7 @@ import ( "sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" @@ -1425,9 +1426,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( // Sort the machines from newest to oldest. // We are trying to remediate machines failing to come up first because // there is a chance that they are not hosting any workloads (minimize disruption). - sort.SliceStable(machinesToRemediate, func(i, j int) bool { - return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time) - }) + sortMachinesToRemediate(machinesToRemediate) // Check if we should limit the in flight operations. if len(machinesToRemediate) > maxInFlight { @@ -1604,3 +1603,19 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu return false, patchHelper.Patch(ctx, obj) } + +// Returns the machines to be remediated in the following order +// - Machines with RemediateMachineAnnotation annotation if any, +// - Machines failing to come up first because +// there is a chance that they are not hosting any workloads (minimize disruption). +func sortMachinesToRemediate(machines []*clusterv1.Machine) { + sort.SliceStable(machines, func(i, j int) bool { + if annotations.HasRemediateMachine(machines[i]) && !annotations.HasRemediateMachine(machines[j]) { + return true + } + if !annotations.HasRemediateMachine(machines[i]) && annotations.HasRemediateMachine(machines[j]) { + return false + } + return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time) + }) +}