From 65ac3e07acc10d7fde7992437aff17ee4df3beae Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Tue, 23 Apr 2024 10:09:07 -0400 Subject: [PATCH] mimir.rules.kubernetes: Don't retry unrecoverable errors (#616) Change event processing to immediately stop applying changes to Mimir in response to events that cause HTTP 4XX errors. These errors indicate the request is malformed and will never succeed so there's no point retrying it. Fixes #610 Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 2 + .../mimir/rules/kubernetes/events.go | 8 +- internal/mimir/client/client.go | 14 ++-- internal/mimir/client/client_test.go | 82 +++++++++++++++++++ 4 files changed, 97 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1692bd1d88..e4bb8db9cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,8 @@ Main (unreleased) - Fixed issue where text labels displayed outside of component node's boundary. (@hainenber) +- In `mimir.rules.kubernetes`, fix an issue where unrecoverable errors from the Mimir API were retried. (@56quarters) + ### Other changes - Update `alloy-mixin` to use more specific alert group names (for example, diff --git a/internal/component/mimir/rules/kubernetes/events.go b/internal/component/mimir/rules/kubernetes/events.go index 1fa3da8f12..fed2ee6195 100644 --- a/internal/component/mimir/rules/kubernetes/events.go +++ b/internal/component/mimir/rules/kubernetes/events.go @@ -10,7 +10,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/alloy/internal/alloy/logging/level" "github.com/grafana/alloy/internal/component/common/kubernetes" - mimirClient "github.com/grafana/alloy/internal/mimir/client" + "github.com/grafana/alloy/internal/mimir/client" "github.com/hashicorp/go-multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" @@ -39,7 +39,7 @@ type eventProcessor struct { stopChan chan struct{} health healthReporter - mimirClient mimirClient.Interface + mimirClient client.Interface namespaceLister coreListers.NamespaceLister ruleLister promListers.PrometheusRuleLister namespaceSelector labels.Selector @@ -68,7 +68,7 @@ func (e *eventProcessor) run(ctx context.Context) { if err != nil { retries := e.queue.NumRequeues(evt) - if retries < 5 { + if retries < 5 && client.IsRecoverable(err) { e.metrics.eventsRetried.WithLabelValues(string(evt.Typ)).Inc() e.queue.AddRateLimited(evt) level.Error(e.logger).Log( @@ -80,7 +80,7 @@ func (e *eventProcessor) run(ctx context.Context) { } else { e.metrics.eventsFailed.WithLabelValues(string(evt.Typ)).Inc() level.Error(e.logger).Log( - "msg", "failed to process event, max retries exceeded", + "msg", "failed to process event, unrecoverable error or max retries exceeded", "retries", fmt.Sprintf("%d/5", retries), "err", err, ) diff --git a/internal/mimir/client/client.go b/internal/mimir/client/client.go index b3f86bf46b..299a50edbf 100644 --- a/internal/mimir/client/client.go +++ b/internal/mimir/client/client.go @@ -11,7 +11,7 @@ import ( "net/url" "strings" - log "github.com/go-kit/log" + "github.com/go-kit/log" "github.com/grafana/alloy/internal/mimir/client/internal" "github.com/grafana/alloy/internal/useragent" "github.com/grafana/dskit/instrument" @@ -22,10 +22,14 @@ import ( ) var ( - ErrNoConfig = errors.New("no config exists for this user") - ErrResourceNotFound = errors.New("requested resource not found") + ErrUnrecoverable = errors.New("unrecoverable error response") ) +// IsRecoverable returns true for errors from API requests that can be retried, false otherwise. +func IsRecoverable(err error) bool { + return !errors.Is(err, ErrUnrecoverable) +} + // Config is used to configure a MimirClient. type Config struct { ID string @@ -123,8 +127,8 @@ func checkResponse(r *http.Response) error { errMsg = fmt.Sprintf("server returned HTTP status %s: %s", r.Status, msg) } - if r.StatusCode == http.StatusNotFound { - return ErrResourceNotFound + if r.StatusCode/100 == 4 && r.StatusCode != http.StatusTooManyRequests { + return fmt.Errorf("%w: %s", ErrUnrecoverable, errMsg) } return errors.New(errMsg) diff --git a/internal/mimir/client/client_test.go b/internal/mimir/client/client_test.go index 262e9918a9..119bd41b02 100644 --- a/internal/mimir/client/client_test.go +++ b/internal/mimir/client/client_test.go @@ -1,8 +1,10 @@ package client import ( + "io" "net/http" "net/url" + "strings" "testing" "github.com/stretchr/testify/require" @@ -99,3 +101,83 @@ func TestBuildURL(t *testing.T) { }) } } + +func TestCheckResponseSuccess(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + }{ + { + name: "returns nil error for 200 response", + body: "200 message!", + statusCode: http.StatusOK, + }, + { + name: "returns nil error for 204 response", + body: "", + statusCode: http.StatusNoContent, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.NoError(t, err) + }) + } +} + +func TestCheckResponseErrors(t *testing.T) { + tc := []struct { + name string + body string + statusCode int + canRetry bool + }{ + { + name: "returns correct error for 400 response", + body: "400 message!", + statusCode: http.StatusBadRequest, + canRetry: false, + }, + { + name: "returns correct error for 404 response", + body: "404 message!", + statusCode: 404, + canRetry: false, + }, + { + name: "returns correct error for 429 response", + body: "429 message!", + statusCode: http.StatusTooManyRequests, + canRetry: true, + }, + { + name: "returns correct error for 500 response", + body: "500 message!", + statusCode: http.StatusInternalServerError, + canRetry: true, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + response := &http.Response{ + Status: http.StatusText(tt.statusCode), + StatusCode: tt.statusCode, + Body: io.NopCloser(strings.NewReader(tt.body)), + } + + err := checkResponse(response) + require.Error(t, err) + require.Equal(t, tt.canRetry, IsRecoverable(err), "%+v is not expected recoverable/unrecoverable", err) + }) + } +}