From 1d5b73a95c144d115800cf04a19c0e0dc3a8a4b7 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Fri, 19 Apr 2024 14:46:50 -0400 Subject: [PATCH] mimir.rules.kubernetes: Don't retry unrecoverable errors Change event processing to immediately stop applying changes to Mimir in response to events that cause HTTP 4XX errors. These errors indicate the request is malformed and will never succeed so there's no point retrying it. Fixes #610 Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 3 +- .../mimir/rules/kubernetes/events.go | 6 +- internal/mimir/client/client.go | 5 +- internal/mimir/client/client_test.go | 83 +++++++++++++++++++ 4 files changed, 93 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9338ed98d..0e6dbb71ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,9 +32,10 @@ Main (unreleased) - Fix an issue on Windows where uninstalling Alloy did not remove it from the Add/Remove programs list. (@rfratto) - - Fixed issue where text labels displayed outside of component node's boundary. (@hainenber) +- In `mimir.rules.kubernetes`, fix an issue where unrecoverable errors from the Mimir API were retried. (@56quarters) + v1.0.0 (2024-04-09) ------------------- diff --git a/internal/component/mimir/rules/kubernetes/events.go b/internal/component/mimir/rules/kubernetes/events.go index 471c12b28c..aa7f91de88 100644 --- a/internal/component/mimir/rules/kubernetes/events.go +++ b/internal/component/mimir/rules/kubernetes/events.go @@ -2,12 +2,14 @@ package rules import ( "context" + "errors" "fmt" "regexp" "time" "github.com/grafana/alloy/internal/alloy/logging/level" "github.com/grafana/alloy/internal/component/common/kubernetes" + "github.com/grafana/alloy/internal/mimir/client" "github.com/hashicorp/go-multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" @@ -30,7 +32,7 @@ func (c *Component) eventLoop(ctx context.Context) { if err != nil { retries := c.queue.NumRequeues(evt) - if retries < 5 { + if retries < 5 && !errors.Is(err, client.ErrUnrecoverable) { c.metrics.eventsRetried.WithLabelValues(string(evt.Typ)).Inc() c.queue.AddRateLimited(evt) level.Error(c.log).Log( @@ -42,7 +44,7 @@ func (c *Component) eventLoop(ctx context.Context) { } else { c.metrics.eventsFailed.WithLabelValues(string(evt.Typ)).Inc() level.Error(c.log).Log( - "msg", "failed to process event, max retries exceeded", + "msg", "failed to process event, unrecoverable error or max retries exceeded", "retries", fmt.Sprintf("%d/5", retries), "err", err, ) diff --git a/internal/mimir/client/client.go b/internal/mimir/client/client.go index b3f86bf46b..f72029ea89 100644 --- a/internal/mimir/client/client.go +++ b/internal/mimir/client/client.go @@ -11,7 +11,7 @@ import ( "net/url" "strings" - log "github.com/go-kit/log" + "github.com/go-kit/log" "github.com/grafana/alloy/internal/mimir/client/internal" "github.com/grafana/alloy/internal/useragent" "github.com/grafana/dskit/instrument" @@ -24,6 +24,7 @@ import ( var ( ErrNoConfig = errors.New("no config exists for this user") ErrResourceNotFound = errors.New("requested resource not found") + ErrUnrecoverable = errors.New("unrecoverable error response") ) // Config is used to configure a MimirClient. @@ -125,6 +126,8 @@ func checkResponse(r *http.Response) error { if r.StatusCode == http.StatusNotFound { return ErrResourceNotFound + } else if r.StatusCode/100 == 4 && r.StatusCode != http.StatusTooManyRequests { + return fmt.Errorf("%w: %s", ErrUnrecoverable, errMsg) } return errors.New(errMsg) diff --git a/internal/mimir/client/client_test.go b/internal/mimir/client/client_test.go index 262e9918a9..d67a3256d5 100644 --- a/internal/mimir/client/client_test.go +++ b/internal/mimir/client/client_test.go @@ -1,8 +1,11 @@ package client import ( + "errors" + "io" "net/http" "net/url" + "strings" "testing" "github.com/stretchr/testify/require" @@ -99,3 +102,83 @@ func TestBuildURL(t *testing.T) { }) } } + +func TestCheckResponseSuccess(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + }{ + { + name: "returns nil error for 200 response", + body: "200 message!", + statusCode: http.StatusOK, + }, + { + name: "returns nil error for 204 response", + body: "", + statusCode: http.StatusNoContent, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.NoError(t, err) + }) + } +} + +func TestCheckResponseErrors(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + fatal bool + }{ + { + name: "returns correct error for 400 response", + body: "400 message!", + statusCode: http.StatusBadRequest, + fatal: true, + }, + { + name: "returns correct error for 404 response", + body: "404 message!", + statusCode: 404, + fatal: false, + }, + { + name: "returns correct error for 429 response", + body: "429 message!", + statusCode: http.StatusTooManyRequests, + fatal: false, + }, + { + name: "returns correct error for 500 response", + body: "500 message!", + statusCode: http.StatusInternalServerError, + fatal: false, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.Error(t, err) + require.Equal(t, tt.fatal, errors.Is(err, ErrUnrecoverable), "%+v is not expected recoverable/unrecoverable", err) + }) + } +}