Skip to content

Commit

Permalink
mimir.rules.kubernetes: Don't retry unrecoverable errors
Browse files Browse the repository at this point in the history
Change event processing to immediately stop applying changes to Mimir
in response to events that cause HTTP 4XX errors. These errors indicate
the request is malformed and will never succeed so there's no point
retrying it.

Fixes #610

Signed-off-by: Nick Pillitteri <[email protected]>
  • Loading branch information
56quarters committed Apr 19, 2024
1 parent e607ff6 commit 1d5b73a
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 4 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ Main (unreleased)
- Fix an issue on Windows where uninstalling Alloy did not remove it from the
Add/Remove programs list. (@rfratto)


- Fixed issue where text labels displayed outside of component node's boundary. (@hainenber)

- In `mimir.rules.kubernetes`, fix an issue where unrecoverable errors from the Mimir API were retried. (@56quarters)

v1.0.0 (2024-04-09)
-------------------

Expand Down
6 changes: 4 additions & 2 deletions internal/component/mimir/rules/kubernetes/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ package rules

import (
"context"
"errors"
"fmt"
"regexp"
"time"

"github.com/grafana/alloy/internal/alloy/logging/level"
"github.com/grafana/alloy/internal/component/common/kubernetes"
"github.com/grafana/alloy/internal/mimir/client"
"github.com/hashicorp/go-multierror"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/prometheus/prometheus/model/rulefmt"
Expand All @@ -30,7 +32,7 @@ func (c *Component) eventLoop(ctx context.Context) {

if err != nil {
retries := c.queue.NumRequeues(evt)
if retries < 5 {
if retries < 5 && !errors.Is(err, client.ErrUnrecoverable) {
c.metrics.eventsRetried.WithLabelValues(string(evt.Typ)).Inc()
c.queue.AddRateLimited(evt)
level.Error(c.log).Log(
Expand All @@ -42,7 +44,7 @@ func (c *Component) eventLoop(ctx context.Context) {
} else {
c.metrics.eventsFailed.WithLabelValues(string(evt.Typ)).Inc()
level.Error(c.log).Log(
"msg", "failed to process event, max retries exceeded",
"msg", "failed to process event, unrecoverable error or max retries exceeded",
"retries", fmt.Sprintf("%d/5", retries),
"err", err,
)
Expand Down
5 changes: 4 additions & 1 deletion internal/mimir/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
"net/url"
"strings"

log "github.com/go-kit/log"
"github.com/go-kit/log"
"github.com/grafana/alloy/internal/mimir/client/internal"
"github.com/grafana/alloy/internal/useragent"
"github.com/grafana/dskit/instrument"
Expand All @@ -24,6 +24,7 @@ import (
var (
ErrNoConfig = errors.New("no config exists for this user")
ErrResourceNotFound = errors.New("requested resource not found")
ErrUnrecoverable = errors.New("unrecoverable error response")
)

// Config is used to configure a MimirClient.
Expand Down Expand Up @@ -125,6 +126,8 @@ func checkResponse(r *http.Response) error {

if r.StatusCode == http.StatusNotFound {
return ErrResourceNotFound
} else if r.StatusCode/100 == 4 && r.StatusCode != http.StatusTooManyRequests {
return fmt.Errorf("%w: %s", ErrUnrecoverable, errMsg)
}

return errors.New(errMsg)
Expand Down
83 changes: 83 additions & 0 deletions internal/mimir/client/client_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package client

import (
"errors"
"io"
"net/http"
"net/url"
"strings"
"testing"

"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -99,3 +102,83 @@ func TestBuildURL(t *testing.T) {
})
}
}

func TestCheckResponseSuccess(t *testing.T) {
tc := []struct {
name string
body string
statusCode int
}{
{
name: "returns nil error for 200 response",
body: "200 message!",
statusCode: http.StatusOK,
},
{
name: "returns nil error for 204 response",
body: "",
statusCode: http.StatusNoContent,
},
}

for _, tt := range tc {
t.Run(tt.name, func(t *testing.T) {
response := &http.Response{
Status: http.StatusText(tt.statusCode),
StatusCode: tt.statusCode,
Body: io.NopCloser(strings.NewReader(tt.body)),
}

err := checkResponse(response)
require.NoError(t, err)
})
}
}

func TestCheckResponseErrors(t *testing.T) {
tc := []struct {
name string
body string
statusCode int
fatal bool
}{
{
name: "returns correct error for 400 response",
body: "400 message!",
statusCode: http.StatusBadRequest,
fatal: true,
},
{
name: "returns correct error for 404 response",
body: "404 message!",
statusCode: 404,
fatal: false,
},
{
name: "returns correct error for 429 response",
body: "429 message!",
statusCode: http.StatusTooManyRequests,
fatal: false,
},
{
name: "returns correct error for 500 response",
body: "500 message!",
statusCode: http.StatusInternalServerError,
fatal: false,
},
}

for _, tt := range tc {
t.Run(tt.name, func(t *testing.T) {
response := &http.Response{
Status: http.StatusText(tt.statusCode),
StatusCode: tt.statusCode,
Body: io.NopCloser(strings.NewReader(tt.body)),
}

err := checkResponse(response)
require.Error(t, err)
require.Equal(t, tt.fatal, errors.Is(err, ErrUnrecoverable), "%+v is not expected recoverable/unrecoverable", err)
})
}
}

0 comments on commit 1d5b73a

Please sign in to comment.