diff --git a/CHANGELOG.md b/CHANGELOG.md index e9338ed98d..0e6dbb71ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,9 +32,10 @@ Main (unreleased) - Fix an issue on Windows where uninstalling Alloy did not remove it from the Add/Remove programs list. (@rfratto) - - Fixed issue where text labels displayed outside of component node's boundary. (@hainenber) +- In `mimir.rules.kubernetes`, fix an issue where unrecoverable errors from the Mimir API were retried. (@56quarters) + v1.0.0 (2024-04-09) ------------------- diff --git a/internal/component/mimir/rules/kubernetes/events.go b/internal/component/mimir/rules/kubernetes/events.go index 471c12b28c..aa7f91de88 100644 --- a/internal/component/mimir/rules/kubernetes/events.go +++ b/internal/component/mimir/rules/kubernetes/events.go @@ -2,12 +2,14 @@ package rules import ( "context" + "errors" "fmt" "regexp" "time" "github.com/grafana/alloy/internal/alloy/logging/level" "github.com/grafana/alloy/internal/component/common/kubernetes" + "github.com/grafana/alloy/internal/mimir/client" "github.com/hashicorp/go-multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" @@ -30,7 +32,7 @@ func (c *Component) eventLoop(ctx context.Context) { if err != nil { retries := c.queue.NumRequeues(evt) - if retries < 5 { + if retries < 5 && !errors.Is(err, client.ErrUnrecoverable) { c.metrics.eventsRetried.WithLabelValues(string(evt.Typ)).Inc() c.queue.AddRateLimited(evt) level.Error(c.log).Log( @@ -42,7 +44,7 @@ func (c *Component) eventLoop(ctx context.Context) { } else { c.metrics.eventsFailed.WithLabelValues(string(evt.Typ)).Inc() level.Error(c.log).Log( - "msg", "failed to process event, max retries exceeded", + "msg", "failed to process event, unrecoverable error or max retries exceeded", "retries", fmt.Sprintf("%d/5", retries), "err", err, ) diff --git a/internal/mimir/client/client.go b/internal/mimir/client/client.go index b3f86bf46b..f72029ea89 100644 --- a/internal/mimir/client/client.go +++ b/internal/mimir/client/client.go @@ -11,7 +11,7 @@ import ( "net/url" "strings" - log "github.com/go-kit/log" + "github.com/go-kit/log" "github.com/grafana/alloy/internal/mimir/client/internal" "github.com/grafana/alloy/internal/useragent" "github.com/grafana/dskit/instrument" @@ -24,6 +24,7 @@ import ( var ( ErrNoConfig = errors.New("no config exists for this user") ErrResourceNotFound = errors.New("requested resource not found") + ErrUnrecoverable = errors.New("unrecoverable error response") ) // Config is used to configure a MimirClient. @@ -125,6 +126,8 @@ func checkResponse(r *http.Response) error { if r.StatusCode == http.StatusNotFound { return ErrResourceNotFound + } else if r.StatusCode/100 == 4 && r.StatusCode != http.StatusTooManyRequests { + return fmt.Errorf("%w: %s", ErrUnrecoverable, errMsg) } return errors.New(errMsg) diff --git a/internal/mimir/client/client_test.go b/internal/mimir/client/client_test.go index 262e9918a9..d67a3256d5 100644 --- a/internal/mimir/client/client_test.go +++ b/internal/mimir/client/client_test.go @@ -1,8 +1,11 @@ package client import ( + "errors" + "io" "net/http" "net/url" + "strings" "testing" "github.com/stretchr/testify/require" @@ -99,3 +102,83 @@ func TestBuildURL(t *testing.T) { }) } } + +func TestCheckResponseSuccess(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + }{ + { + name: "returns nil error for 200 response", + body: "200 message!", + statusCode: http.StatusOK, + }, + { + name: "returns nil error for 204 response", + body: "", + statusCode: http.StatusNoContent, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.NoError(t, err) + }) + } +} + +func TestCheckResponseErrors(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + fatal bool + }{ + { + name: "returns correct error for 400 response", + body: "400 message!", + statusCode: http.StatusBadRequest, + fatal: true, + }, + { + name: "returns correct error for 404 response", + body: "404 message!", + statusCode: 404, + fatal: false, + }, + { + name: "returns correct error for 429 response", + body: "429 message!", + statusCode: http.StatusTooManyRequests, + fatal: false, + }, + { + name: "returns correct error for 500 response", + body: "500 message!", + statusCode: http.StatusInternalServerError, + fatal: false, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.Error(t, err) + require.Equal(t, tt.fatal, errors.Is(err, ErrUnrecoverable), "%+v is not expected recoverable/unrecoverable", err) + }) + } +}