Skip to content

Commit

Permalink
Log the set of issues when a node group is in a degraded state (#926)
Browse files Browse the repository at this point in the history
Co-authored-by: stefanSpectro <[email protected]>
  • Loading branch information
vishu2498 and stefanSpectro authored Dec 24, 2024
1 parent 8a233fc commit b0e8f1c
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
28 changes: 28 additions & 0 deletions pkg/cloud/services/eks/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,19 @@ import (
"context"
"fmt"
"sigs.k8s.io/controller-runtime/pkg/client"
"slices"
"strings"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/eks"

"github.com/aws/aws-sdk-go/service/iam"
"github.com/google/go-cmp/cmp"
"github.com/pkg/errors"
"k8s.io/apimachinery/pkg/util/version"
capierrors "sigs.k8s.io/cluster-api/errors"

infrav1 "sigs.k8s.io/cluster-api-provider-aws/api/v1beta1"
ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/controlplane/eks/api/v1beta1"
Expand Down Expand Up @@ -611,6 +615,30 @@ func (s *NodegroupService) setStatus(ng *eks.Nodegroup) error {
managedPool.Status.Ready = false
case eks.NodegroupStatusUpdating:
managedPool.Status.Ready = true
case eks.NodegroupStatusDegraded:
issueErrMsgSet := make([]string, 0)
var errMsgStr string

for _, iss := range ng.Health.Issues {
errMsg := iss.GoString()
if slices.Contains(issueErrMsgSet, errMsg) {
continue
}
issueErrMsgSet = append(issueErrMsgSet, errMsg)
errMsgStr = fmt.Sprintf("%s %s", errMsgStr, errMsg)
}

var reason capierrors.MachineStatusError
// TODO: implement checks for other MachineStatusErrors and set reason accordingly
if strings.Contains(errMsgStr, "VcpuLimitExceeded") {
reason = capierrors.InsufficientResourcesMachineError
}

managedPool.Status.Ready = false
managedPool.Status.FailureReason = &reason
managedPool.Status.FailureMessage = &errMsgStr
return errors.Errorf("NodeGroup status is %s due to %v caused by error %s. This error may persist and recreating the Node Group may be required to return to %s status",
eks.NodegroupStatusDegraded, *s.scope.ManagedMachinePool.Status.FailureReason, *s.scope.ManagedMachinePool.Status.FailureMessage, eks.NodegroupStatusActive)
default:
return errors.Errorf("unexpected EKS nodegroup status %s", *ng.Status)
}
Expand Down
47 changes: 47 additions & 0 deletions pkg/cloud/services/eks/nodegroup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package eks

import (
"github.com/aws/aws-sdk-go/service/eks"
. "github.com/onsi/gomega"
"sigs.k8s.io/cluster-api-provider-aws/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-aws/pkg/cloud/scope"
capierrors "sigs.k8s.io/cluster-api/errors"
"testing"
)

// TestSetStatus verifies that setStatus surfaces a degraded EKS nodegroup on
// the managed machine pool status: it must return a non-nil error, keep the
// pool not-ready, record the nodegroup's health issues in FailureMessage, and
// map a VcpuLimitExceeded issue to InsufficientResourcesMachineError.
func TestSetStatus(t *testing.T) {
	g := NewWithT(t)

	degraded := eks.NodegroupStatusDegraded
	code := eks.NodegroupIssueCodeAsgInstanceLaunchFailures
	message := "VcpuLimitExceeded"
	resourceID := "my-worker-nodes"

	s := &NodegroupService{
		scope: &scope.ManagedMachinePoolScope{
			ManagedMachinePool: &v1beta1.AWSManagedMachinePool{
				Status: v1beta1.AWSManagedMachinePoolStatus{
					Ready: false,
				},
			},
		},
	}

	issue := &eks.Issue{
		Code:        &code,
		Message:     &message,
		ResourceIds: []*string{&resourceID},
	}
	ng := &eks.Nodegroup{
		Status: &degraded,
		Health: &eks.NodegroupHealth{
			Issues: []*eks.Issue{issue},
		},
	}

	err := s.setStatus(ng)
	g.Expect(err).To(HaveOccurred())

	// A degraded nodegroup must propagate its issue details and failure
	// reason onto the machine pool status so controllers/users can act on it.
	g.Expect(*s.scope.ManagedMachinePool.Status.FailureMessage).To(ContainSubstring(issue.GoString()))
	g.Expect(s.scope.ManagedMachinePool.Status.Ready).To(BeFalse())
	g.Expect(*s.scope.ManagedMachinePool.Status.FailureReason).To(Equal(capierrors.InsufficientResourcesMachineError))
}

0 comments on commit b0e8f1c

Please sign in to comment.