From b0e8f1c6c3253c500c15d7666e928ef24f48dbf5 Mon Sep 17 00:00:00 2001
From: Vishwanath Taykhande <54012750+vishu2498@users.noreply.github.com>
Date: Tue, 24 Dec 2024 16:57:10 +0530
Subject: [PATCH] log set of issues when node group in degraded state (#926)

Co-authored-by: stefanSpectro
---
 pkg/cloud/services/eks/nodegroup.go      | 31 +++++++++++++
 pkg/cloud/services/eks/nodegroup_test.go | 59 ++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 pkg/cloud/services/eks/nodegroup_test.go

diff --git a/pkg/cloud/services/eks/nodegroup.go b/pkg/cloud/services/eks/nodegroup.go
index 00fc0f8371..10c2844e8c 100644
--- a/pkg/cloud/services/eks/nodegroup.go
+++ b/pkg/cloud/services/eks/nodegroup.go
@@ -20,15 +20,19 @@ import (
 	"context"
 	"fmt"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"slices"
+	"strings"
 
 	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/aws/awserr"
 	"github.com/aws/aws-sdk-go/service/autoscaling"
 	"github.com/aws/aws-sdk-go/service/eks"
+	"github.com/aws/aws-sdk-go/service/iam"
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/pkg/errors"
 	"k8s.io/apimachinery/pkg/util/version"
+	capierrors "sigs.k8s.io/cluster-api/errors"
 
 	infrav1 "sigs.k8s.io/cluster-api-provider-aws/api/v1beta1"
 	ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/controlplane/eks/api/v1beta1"
@@ -611,6 +615,33 @@ func (s *NodegroupService) setStatus(ng *eks.Nodegroup) error {
 		managedPool.Status.Ready = false
 	case eks.NodegroupStatusUpdating:
 		managedPool.Status.Ready = true
+	case eks.NodegroupStatusDegraded:
+		// Collect each distinct health issue once, in the order they appear in
+		// ng.Health.Issues, so repeated issues do not bloat the failure message.
+		var issueErrMsgSet []string
+		if ng.Health != nil { // guard: Health is a pointer and may be unset
+			for _, iss := range ng.Health.Issues {
+				errMsg := iss.GoString()
+				if slices.Contains(issueErrMsgSet, errMsg) {
+					continue
+				}
+				issueErrMsgSet = append(issueErrMsgSet, errMsg)
+			}
+		}
+		errMsgStr := strings.Join(issueErrMsgSet, " ")
+
+		// reason keeps its zero value unless a known cause is recognised below.
+		var reason capierrors.MachineStatusError
+		// TODO: implement checks for other MachineStatusErrors and set reason accordingly
+		if strings.Contains(errMsgStr, "VcpuLimitExceeded") {
+			reason = capierrors.InsufficientResourcesMachineError
+		}
+
+		managedPool.Status.Ready = false
+		managedPool.Status.FailureReason = &reason
+		managedPool.Status.FailureMessage = &errMsgStr
+		return errors.Errorf("NodeGroup status is %s due to %v caused by error %s. This error may persist and recreating the Node Group may be required to return to %s status",
+			eks.NodegroupStatusDegraded, reason, errMsgStr, eks.NodegroupStatusActive)
 	default:
 		return errors.Errorf("unexpected EKS nodegroup status %s", *ng.Status)
 	}
diff --git a/pkg/cloud/services/eks/nodegroup_test.go b/pkg/cloud/services/eks/nodegroup_test.go
new file mode 100644
index 0000000000..2b8b3a1bb4
--- /dev/null
+++ b/pkg/cloud/services/eks/nodegroup_test.go
@@ -0,0 +1,59 @@
+package eks
+
+import (
+	"testing"
+
+	"github.com/aws/aws-sdk-go/service/eks"
+	. "github.com/onsi/gomega"
+	"sigs.k8s.io/cluster-api-provider-aws/exp/api/v1beta1"
+	"sigs.k8s.io/cluster-api-provider-aws/pkg/cloud/scope"
+	capierrors "sigs.k8s.io/cluster-api/errors"
+)
+
+// TestSetStatus checks that setStatus, given a degraded nodegroup whose health
+// issue mentions VcpuLimitExceeded, returns an error, marks the managed machine
+// pool as not ready and records the failure reason and message on its status.
+func TestSetStatus(t *testing.T) {
+	g := NewWithT(t)
+	degraded := eks.NodegroupStatusDegraded
+	code := eks.NodegroupIssueCodeAsgInstanceLaunchFailures
+	message := "VcpuLimitExceeded"
+	resourceID := "my-worker-nodes"
+
+	s := &NodegroupService{
+		scope: &scope.ManagedMachinePoolScope{
+			ManagedMachinePool: &v1beta1.AWSManagedMachinePool{
+				Status: v1beta1.AWSManagedMachinePoolStatus{
+					// Start as ready so the Ready assertion below proves
+					// that setStatus actually cleared the flag.
+					Ready: true,
+				},
+			},
+		},
+	}
+
+	issue := &eks.Issue{
+		Code:        &code,
+		Message:     &message,
+		ResourceIds: []*string{&resourceID},
+	}
+	ng := &eks.Nodegroup{
+		Status: &degraded,
+		Health: &eks.NodegroupHealth{
+			Issues: []*eks.Issue{issue},
+		},
+	}
+
+	err := s.setStatus(ng)
+	g.Expect(err).To(HaveOccurred())
+
+	// ensure machine pool status values are set as expected
+	status := s.scope.ManagedMachinePool.Status
+	g.Expect(status.Ready).To(BeFalse())
+	// Check the pointers before dereferencing them so that a regression
+	// fails the assertion instead of panicking.
+	g.Expect(status.FailureMessage).ToNot(BeNil())
+	g.Expect(*status.FailureMessage).To(ContainSubstring(issue.GoString()))
+	g.Expect(status.FailureReason).ToNot(BeNil())
+	g.Expect(*status.FailureReason).To(Equal(capierrors.InsufficientResourcesMachineError))
+}