From df11df2616c32d091adec707a20421f3a7c0aab0 Mon Sep 17 00:00:00 2001
From: stefanSpectro
Date: Thu, 22 Feb 2024 16:03:06 -0700
Subject: [PATCH] return set of issues when ng is in degraded state

---
 pkg/cloud/services/eks/nodegroup.go      | 26 +++++++++++++
 pkg/cloud/services/eks/nodegroup_test.go | 48 ++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 pkg/cloud/services/eks/nodegroup_test.go

diff --git a/pkg/cloud/services/eks/nodegroup.go b/pkg/cloud/services/eks/nodegroup.go
index ec24f16fc3..acffe3cdd3 100644
--- a/pkg/cloud/services/eks/nodegroup.go
+++ b/pkg/cloud/services/eks/nodegroup.go
@@ -19,6 +19,8 @@ package eks
 import (
 	"context"
 	"fmt"
+	"slices"
+	"strings"
 
 	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/aws/awserr"
@@ -37,6 +39,7 @@ import (
 	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/wait"
 	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	capierrors "sigs.k8s.io/cluster-api/errors"
 	"sigs.k8s.io/cluster-api/util/annotations"
 )
 
@@ -588,6 +591,29 @@ func (s *NodegroupService) setStatus(ng *eks.Nodegroup) error {
 		managedPool.Status.Ready = false
 	case eks.NodegroupStatusUpdating:
 		managedPool.Status.Ready = true
+	case eks.NodegroupStatusDegraded:
+		issueErrMsgSet := make([]string, 0)
+		var errMsgStr string
+
+		for _, iss := range ng.Health.Issues {
+			errMsg := iss.GoString()
+			if slices.Contains(issueErrMsgSet, errMsg) {
+				continue
+			}
+			issueErrMsgSet = append(issueErrMsgSet, errMsg)
+			errMsgStr = fmt.Sprintf("%s %s", errMsgStr, errMsg)
+		}
+		reason := capierrors.InvalidConfigurationMachineError
+		// TODO: implement checks for other MachineStatusErrors and set reason accordingly
+		if strings.Contains(errMsgStr, "VcpuLimitExceeded") {
+			reason = capierrors.InsufficientResourcesMachineError
+		}
+
+		managedPool.Status.Ready = false
+		managedPool.Status.FailureReason = &reason
+		managedPool.Status.FailureMessage = &errMsgStr
+		return errors.Errorf("NodeGroup status is %s due to %v caused by error %s. This error may persist and recreating the Node Group may be required to return to %s status",
+			eks.NodegroupStatusDegraded, *s.scope.ManagedMachinePool.Status.FailureReason, *s.scope.ManagedMachinePool.Status.FailureMessage, eks.NodegroupStatusActive)
 	default:
 		return errors.Errorf("unexpected EKS nodegroup status %s", *ng.Status)
 	}
diff --git a/pkg/cloud/services/eks/nodegroup_test.go b/pkg/cloud/services/eks/nodegroup_test.go
new file mode 100644
index 0000000000..89794fb99e
--- /dev/null
+++ b/pkg/cloud/services/eks/nodegroup_test.go
@@ -0,0 +1,48 @@
+package eks
+
+import (
+	"testing"
+
+	"github.com/aws/aws-sdk-go/service/eks"
+	. "github.com/onsi/gomega"
+	"sigs.k8s.io/cluster-api-provider-aws/v2/exp/api/v1beta2"
+	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope"
+	capierrors "sigs.k8s.io/cluster-api/errors"
+)
+
+func TestSetStatus(t *testing.T) {
+	g := NewWithT(t)
+	degraded := eks.NodegroupStatusDegraded
+	code := eks.NodegroupIssueCodeAsgInstanceLaunchFailures
+	message := "VcpuLimitExceeded"
+	resourceId := "my-worker-nodes"
+
+	s := &NodegroupService{
+		scope: &scope.ManagedMachinePoolScope{
+			ManagedMachinePool: &v1beta2.AWSManagedMachinePool{
+				Status: v1beta2.AWSManagedMachinePoolStatus{
+					Ready: false,
+				},
+			},
+		},
+	}
+
+	issue := &eks.Issue{
+		Code:        &code,
+		Message:     &message,
+		ResourceIds: []*string{&resourceId},
+	}
+	ng := &eks.Nodegroup{
+		Status: &degraded,
+		Health: &eks.NodegroupHealth{
+			Issues: []*eks.Issue{issue},
+		},
+	}
+
+	err := s.setStatus(ng)
+	g.Expect(err).ToNot(BeNil())
+	// ensure machine pool status values are set as expected
+	g.Expect(*s.scope.ManagedMachinePool.Status.FailureMessage).To(ContainSubstring(issue.GoString()))
+	g.Expect(s.scope.ManagedMachinePool.Status.Ready).To(Equal(false))
+	g.Expect(*s.scope.ManagedMachinePool.Status.FailureReason).To(Equal(capierrors.InsufficientResourcesMachineError))
+}