From 8c235caf6ebef5f471108b07fc843358b321e149 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 2 Dec 2022 15:24:26 -0600 Subject: [PATCH 01/40] Initial polling of rules from Grafana Cloud Client copied from cortextool --- component/all/all.go | 1 + component/mimir/rules/rules.go | 175 +++++++++++++++++++++++++ pkg/mimir/client/alerts.go | 76 +++++++++++ pkg/mimir/client/client.go | 221 ++++++++++++++++++++++++++++++++ pkg/mimir/client/client_test.go | 95 ++++++++++++++ pkg/mimir/client/rules.go | 121 +++++++++++++++++ pkg/mimir/client/rules_test.go | 76 +++++++++++ 7 files changed, 765 insertions(+) create mode 100644 component/mimir/rules/rules.go create mode 100644 pkg/mimir/client/alerts.go create mode 100644 pkg/mimir/client/client.go create mode 100644 pkg/mimir/client/client_test.go create mode 100644 pkg/mimir/client/rules.go create mode 100644 pkg/mimir/client/rules_test.go diff --git a/component/all/all.go b/component/all/all.go index 6a39c43c2573..877dfa768065 100644 --- a/component/all/all.go +++ b/component/all/all.go @@ -11,6 +11,7 @@ import ( _ "github.com/grafana/agent/component/loki/relabel" // Import loki.relabel _ "github.com/grafana/agent/component/loki/source/file" // Import loki.source.file _ "github.com/grafana/agent/component/loki/write" // Import loki.write + _ "github.com/grafana/agent/component/mimir/rules" // Import mimir.rules _ "github.com/grafana/agent/component/otelcol/auth/basic" // Import otelcol.auth.basic _ "github.com/grafana/agent/component/otelcol/auth/bearer" // Import otelcol.auth.bearer _ "github.com/grafana/agent/component/otelcol/auth/headers" // Import otelcol.auth.headers diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go new file mode 100644 index 000000000000..bfa9c6138cae --- /dev/null +++ b/component/mimir/rules/rules.go @@ -0,0 +1,175 @@ +package rules + +import ( + "context" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/grafana/agent/component" 
+ "github.com/grafana/agent/pkg/mimir/client" + "github.com/grafana/dskit/crypto/tls" + v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +func init() { + component.Register(component.Registration{ + Name: "mimir.rules", + Args: Arguments{}, + Exports: Exports{}, + Build: func(o component.Options, c component.Arguments) (component.Component, error) { + return NewComponent(o, c.(Arguments)) + }, + }) +} + +type Arguments struct { + ClientParams ClientArguments `river:"client,block"` + SyncInterval time.Duration `river:"sync_interval,attr,optional"` +} + +type ClientArguments struct { + User string `river:"user,attr,optional"` + Key string `river:"key,attr,optional"` + Address string `river:"address,attr"` + ID string `river:"id,attr,optional"` + TLS TLSArguments `river:"tls,block,optional"` + UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` + AuthToken string `river:"auth_token,attr,optional"` +} + +type TLSArguments struct { + CertPath string `river:"tls_cert_path,attr,optional"` + KeyPath string `river:"tls_key_path,attr,optional"` + CAPath string `river:"tls_ca_path,attr,optional"` + ServerName string `river:"tls_server_name,attr,optional"` + InsecureSkipVerify bool `river:"tls_insecure_skip_verify,attr,optional"` + CipherSuites string `river:"tls_cipher_suites,attr,optional"` + MinVersion string `river:"tls_min_version,attr,optional"` +} + +type Exports struct { +} + +type Component struct { + log log.Logger + opts component.Options + args Arguments + + client *client.MimirClient + ticker *time.Ticker +} + +var _ component.Component = (*Component)(nil) + +func NewComponent(o component.Options, c Arguments) (*Component, error) { + return &Component{ + log: o.Logger, + opts: o, + args: c, + }, nil +} + +func (c *Component) Run(ctx context.Context) error { + err := c.init() + if err != nil { + return err + } + + c.start(ctx) + + return nil +} + +func (c *Component) Update(newConfig component.Arguments) error { + 
c.args = newConfig.(Arguments) + return c.init() +} + +func (c *Component) init() error { + if c.args.SyncInterval == 0 { + c.args.SyncInterval = 30 * time.Second + } + + var err error + c.client, err = client.New(client.Config{ + User: c.args.ClientParams.User, + Key: c.args.ClientParams.Key, + Address: c.args.ClientParams.Address, + ID: c.args.ClientParams.ID, + TLS: tls.ClientConfig{ + CertPath: c.args.ClientParams.TLS.CertPath, + KeyPath: c.args.ClientParams.TLS.KeyPath, + CAPath: c.args.ClientParams.TLS.CAPath, + ServerName: c.args.ClientParams.TLS.ServerName, + InsecureSkipVerify: c.args.ClientParams.TLS.InsecureSkipVerify, + CipherSuites: c.args.ClientParams.TLS.CipherSuites, + MinVersion: c.args.ClientParams.TLS.MinVersion, + }, + UseLegacyRoutes: c.args.ClientParams.UseLegacyRoutes, + AuthToken: c.args.ClientParams.AuthToken, + }) + if err != nil { + return err + } + + c.ticker = time.NewTicker(c.args.SyncInterval) + + return nil +} + +func (c *Component) start(ctx context.Context) { + for { + select { + case <-c.ticker.C: + level.Info(c.log).Log("msg", "syncing rules") + err := c.syncRules(ctx) + if err != nil { + level.Error(c.log).Log("msg", "failed to sync rules", "err", err) + } + case <-ctx.Done(): + level.Info(c.log).Log("msg", "shutting down") + return + } + } +} + +func (c *Component) syncRules(ctx context.Context) error { + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + desiredState, err := c.discoverRuleCRDs(ctx) + if err != nil { + return err + } + level.Debug(c.log).Log("msg", "found rule crds", "num_crds", len(desiredState)) + + actualState, err := c.loadActiveRules(ctx) + if err != nil { + return err + } + level.Debug(c.log).Log("msg", "found active rules", "num_namespaces", len(actualState)) + + diff := c.diffRuleStates(desiredState, actualState) + + return c.applyChanges(ctx, diff) +} + +func (c *Component) discoverRuleCRDs(ctx context.Context) ([]v1.PrometheusRule, error) { + return nil, nil +} + +func (c 
*Component) loadActiveRules(ctx context.Context) (map[string][]client.RuleGroup, error) { + return c.client.ListRules(ctx, "") +} + +type RuleGroupDiff struct { +} + +func (c *Component) diffRuleStates(desired []v1.PrometheusRule, actual map[string][]client.RuleGroup) []RuleGroupDiff { + return nil +} + +func (c *Component) applyChanges(ctx context.Context, diff []RuleGroupDiff) error { + return nil +} diff --git a/pkg/mimir/client/alerts.go b/pkg/mimir/client/alerts.go new file mode 100644 index 000000000000..61e4f41e0457 --- /dev/null +++ b/pkg/mimir/client/alerts.go @@ -0,0 +1,76 @@ +package client + +import ( + "context" + "io/ioutil" + + "github.com/pkg/errors" + log "github.com/sirupsen/logrus" + "gopkg.in/yaml.v3" +) + +const alertmanagerAPIPath = "/api/v1/alerts" + +type configCompat struct { + TemplateFiles map[string]string `yaml:"template_files"` + AlertmanagerConfig string `yaml:"alertmanager_config"` +} + +// CreateAlertmanagerConfig creates a new alertmanager config +func (r *CortexClient) CreateAlertmanagerConfig(ctx context.Context, cfg string, templates map[string]string) error { + payload, err := yaml.Marshal(&configCompat{ + TemplateFiles: templates, + AlertmanagerConfig: cfg, + }) + if err != nil { + return err + } + + res, err := r.doRequest(alertmanagerAPIPath, "POST", payload) + if err != nil { + return err + } + + res.Body.Close() + + return nil +} + +// DeleteAlermanagerConfig deletes the users alertmanagerconfig +func (r *CortexClient) DeleteAlermanagerConfig(ctx context.Context) error { + res, err := r.doRequest(alertmanagerAPIPath, "DELETE", nil) + if err != nil { + return err + } + + res.Body.Close() + + return nil +} + +// GetAlertmanagerConfig retrieves a rule group +func (r *CortexClient) GetAlertmanagerConfig(ctx context.Context) (string, map[string]string, error) { + res, err := r.doRequest(alertmanagerAPIPath, "GET", nil) + if err != nil { + log.Debugln("no alert config present in response") + return "", nil, err + } + + defer 
res.Body.Close() + body, err := ioutil.ReadAll(res.Body) + if err != nil { + return "", nil, err + } + + compat := configCompat{} + err = yaml.Unmarshal(body, &compat) + if err != nil { + log.WithFields(log.Fields{ + "body": string(body), + }).Debugln("failed to unmarshal rule group from response") + + return "", nil, errors.Wrap(err, "unable to unmarshal response") + } + + return compat.AlertmanagerConfig, compat.TemplateFiles, nil +} diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go new file mode 100644 index 000000000000..c6603abca552 --- /dev/null +++ b/pkg/mimir/client/client.go @@ -0,0 +1,221 @@ +package client + +import ( + "bufio" + "bytes" + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/grafana/dskit/crypto/tls" + "github.com/pkg/errors" + log "github.com/sirupsen/logrus" +) + +const ( + rulerAPIPath = "/api/v1/rules" + legacyAPIPath = "/api/prom/rules" +) + +var ( + ErrNoConfig = errors.New("No config exists for this user") + ErrResourceNotFound = errors.New("requested resource not found") +) + +// Config is used to configure a Ruler Client +type Config struct { + User string `yaml:"user"` + Key string `yaml:"key"` + Address string `yaml:"address"` + ID string `yaml:"id"` + TLS tls.ClientConfig + UseLegacyRoutes bool `yaml:"use_legacy_routes"` + AuthToken string `yaml:"auth_token"` +} + +// CortexClient is used to get and load rules into a cortex ruler +type CortexClient struct { + user string + key string + id string + endpoint *url.URL + Client http.Client + apiPath string + authToken string +} + +// New returns a new Client +func New(cfg Config) (*CortexClient, error) { + endpoint, err := url.Parse(cfg.Address) + if err != nil { + return nil, err + } + + log.WithFields(log.Fields{ + "address": cfg.Address, + "id": cfg.ID, + }).Debugln("New ruler client created") + + client := http.Client{} + + // Setup TLS client + tlsConfig, err := cfg.TLS.GetTLSConfig() + if err != nil { + 
log.WithError(err).WithFields(log.Fields{ + "tls-ca": cfg.TLS.CAPath, + "tls-cert": cfg.TLS.CertPath, + "tls-key": cfg.TLS.KeyPath, + }).Errorf("error loading tls files") + return nil, fmt.Errorf("client initialization unsuccessful") + } + + if tlsConfig != nil { + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + TLSClientConfig: tlsConfig, + } + client = http.Client{Transport: transport} + } + + path := rulerAPIPath + if cfg.UseLegacyRoutes { + path = legacyAPIPath + } + + return &CortexClient{ + user: cfg.User, + key: cfg.Key, + id: cfg.ID, + endpoint: endpoint, + Client: client, + apiPath: path, + authToken: cfg.AuthToken, + }, nil +} + +// Query executes a PromQL query against the Cortex cluster. +func (r *CortexClient) Query(ctx context.Context, query string) (*http.Response, error) { + + query = fmt.Sprintf("query=%s&time=%d", query, time.Now().Unix()) + escapedQuery := url.PathEscape(query) + + res, err := r.doRequest("/api/prom/api/v1/query?"+escapedQuery, "GET", nil) + if err != nil { + return nil, err + } + + return res, nil +} + +func (r *CortexClient) doRequest(path, method string, payload []byte) (*http.Response, error) { + req, err := buildRequest(path, method, *r.endpoint, payload) + if err != nil { + return nil, err + } + + if (r.user != "" || r.key != "") && r.authToken != "" { + err := errors.New("atmost one of basic auth or auth token should be configured") + log.WithFields(log.Fields{ + "url": req.URL.String(), + "method": req.Method, + "error": err, + }).Errorln("error during request to cortex api") + return nil, err + } + + if r.user != "" { + req.SetBasicAuth(r.user, r.key) + } else if r.key != "" { + req.SetBasicAuth(r.id, r.key) + } + + if r.authToken != "" { + req.Header.Add("Authorization", "Bearer "+r.authToken) + } + + req.Header.Add("X-Scope-OrgID", r.id) + + log.WithFields(log.Fields{ + "url": req.URL.String(), + "method": req.Method, + }).Debugln("sending request to cortex api") + + resp, err := r.Client.Do(req) + 
if err != nil { + log.WithFields(log.Fields{ + "url": req.URL.String(), + "method": req.Method, + "error": err.Error(), + }).Errorln("error during request to cortex api") + return nil, err + } + + err = checkResponse(resp) + if err != nil { + return nil, err + } + + return resp, nil +} + +// checkResponse checks the API response for errors +func checkResponse(r *http.Response) error { + log.WithFields(log.Fields{ + "status": r.Status, + }).Debugln("checking response") + if 200 <= r.StatusCode && r.StatusCode <= 299 { + return nil + } + + var msg, errMsg string + scanner := bufio.NewScanner(io.LimitReader(r.Body, 512)) + if scanner.Scan() { + msg = scanner.Text() + } + + if msg == "" { + errMsg = fmt.Sprintf("server returned HTTP status %s", r.Status) + } else { + errMsg = fmt.Sprintf("server returned HTTP status %s: %s", r.Status, msg) + } + + if r.StatusCode == http.StatusNotFound { + log.WithFields(log.Fields{ + "status": r.Status, + "msg": msg, + }).Debugln(errMsg) + return ErrResourceNotFound + } + + log.WithFields(log.Fields{ + "status": r.Status, + "msg": msg, + }).Errorln(errMsg) + + return errors.New(errMsg) +} + +func joinPath(baseURLPath, targetPath string) string { + // trim exactly one slash at the end of the base URL, this expects target + // path to always start with a slash + return strings.TrimSuffix(baseURLPath, "/") + targetPath +} + +func buildRequest(p, m string, endpoint url.URL, payload []byte) (*http.Request, error) { + // parse path parameter again (as it already contains escaped path information + pURL, err := url.Parse(p) + if err != nil { + return nil, err + } + + // if path or endpoint contains escaping that requires RawPath to be populated, also join rawPath + if pURL.RawPath != "" || endpoint.RawPath != "" { + endpoint.RawPath = joinPath(endpoint.EscapedPath(), pURL.EscapedPath()) + } + endpoint.Path = joinPath(endpoint.Path, pURL.Path) + return http.NewRequest(m, endpoint.String(), bytes.NewBuffer(payload)) +} diff --git 
a/pkg/mimir/client/client_test.go b/pkg/mimir/client/client_test.go new file mode 100644 index 000000000000..11b7233a8860 --- /dev/null +++ b/pkg/mimir/client/client_test.go @@ -0,0 +1,95 @@ +package client + +import ( + "net/http" + "net/url" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestBuildURL(t *testing.T) { + tc := []struct { + name string + path string + method string + url string + resultURL string + }{ + { + name: "builds the correct URL with a trailing slash", + path: "/api/v1/rules", + method: http.MethodPost, + url: "http://cortexurl.com/", + resultURL: "http://cortexurl.com/api/v1/rules", + }, + { + name: "builds the correct URL without a trailing slash", + path: "/api/v1/rules", + method: http.MethodPost, + url: "http://cortexurl.com", + resultURL: "http://cortexurl.com/api/v1/rules", + }, + { + name: "builds the correct URL when the base url has a path", + path: "/api/v1/rules", + method: http.MethodPost, + url: "http://cortexurl.com/apathto", + resultURL: "http://cortexurl.com/apathto/api/v1/rules", + }, + { + name: "builds the correct URL when the base url has a path with trailing slash", + path: "/api/v1/rules", + method: http.MethodPost, + url: "http://cortexurl.com/apathto/", + resultURL: "http://cortexurl.com/apathto/api/v1/rules", + }, + { + name: "builds the correct URL with a trailing slash and the target path contains special characters", + path: "/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + method: http.MethodPost, + url: "http://cortexurl.com/", + resultURL: "http://cortexurl.com/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + }, + { + name: "builds the correct URL without a trailing slash and the target path contains special characters", + path: "/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + method: http.MethodPost, + url: "http://cortexurl.com", + resultURL: "http://cortexurl.com/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + }, + { + name: "builds the correct URL when the base url has a path and the target path contains special 
characters", + path: "/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + method: http.MethodPost, + url: "http://cortexurl.com/apathto", + resultURL: "http://cortexurl.com/apathto/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + }, + { + name: "builds the correct URL when the base url has a path and the target path starts with a escaped slash", + path: "/api/v1/rules/%2F-first-char-slash", + method: http.MethodPost, + url: "http://cortexurl.com/apathto", + resultURL: "http://cortexurl.com/apathto/api/v1/rules/%2F-first-char-slash", + }, + { + name: "builds the correct URL when the base url has a path and the target path ends with a escaped slash", + path: "/api/v1/rules/last-char-slash%2F", + method: http.MethodPost, + url: "http://cortexurl.com/apathto", + resultURL: "http://cortexurl.com/apathto/api/v1/rules/last-char-slash%2F", + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + url, err := url.Parse(tt.url) + require.NoError(t, err) + + req, err := buildRequest(tt.path, tt.method, *url, []byte{}) + require.NoError(t, err) + require.Equal(t, tt.resultURL, req.URL.String()) + }) + } + +} diff --git a/pkg/mimir/client/rules.go b/pkg/mimir/client/rules.go new file mode 100644 index 000000000000..daca51e0134c --- /dev/null +++ b/pkg/mimir/client/rules.go @@ -0,0 +1,121 @@ +package client + +import ( + "context" + "fmt" + "io/ioutil" + "net/url" + + "github.com/pkg/errors" + "github.com/prometheus/prometheus/model/rulefmt" + log "github.com/sirupsen/logrus" + "gopkg.in/yaml.v3" +) + +// RuleGroup is a list of sequentially evaluated recording and alerting rules. 
+type RuleGroup struct { + rulefmt.RuleGroup `yaml:",inline"` + // RWConfigs is used by the remote write forwarding ruler + RWConfigs []RemoteWriteConfig `yaml:"remote_write,omitempty"` +} + +// RemoteWriteConfig is used to specify a remote write endpoint +type RemoteWriteConfig struct { + URL string `json:"url,omitempty"` +} + +// CreateRuleGroup creates a new rule group +func (r *CortexClient) CreateRuleGroup(ctx context.Context, namespace string, rg RuleGroup) error { + payload, err := yaml.Marshal(&rg) + if err != nil { + return err + } + + escapedNamespace := url.PathEscape(namespace) + path := r.apiPath + "/" + escapedNamespace + + res, err := r.doRequest(path, "POST", payload) + if err != nil { + return err + } + + res.Body.Close() + + return nil +} + +// DeleteRuleGroup creates a new rule group +func (r *CortexClient) DeleteRuleGroup(ctx context.Context, namespace, groupName string) error { + escapedNamespace := url.PathEscape(namespace) + escapedGroupName := url.PathEscape(groupName) + path := r.apiPath + "/" + escapedNamespace + "/" + escapedGroupName + + res, err := r.doRequest(path, "DELETE", nil) + if err != nil { + return err + } + + res.Body.Close() + + return nil +} + +// GetRuleGroup retrieves a rule group +func (r *CortexClient) GetRuleGroup(ctx context.Context, namespace, groupName string) (*RuleGroup, error) { + escapedNamespace := url.PathEscape(namespace) + escapedGroupName := url.PathEscape(groupName) + path := r.apiPath + "/" + escapedNamespace + "/" + escapedGroupName + + fmt.Println(path) + res, err := r.doRequest(path, "GET", nil) + if err != nil { + return nil, err + } + + defer res.Body.Close() + body, err := ioutil.ReadAll(res.Body) + + if err != nil { + return nil, err + } + + rg := RuleGroup{} + err = yaml.Unmarshal(body, &rg) + if err != nil { + log.WithFields(log.Fields{ + "body": string(body), + }).Debugln("failed to unmarshal rule group from response") + + return nil, errors.Wrap(err, "unable to unmarshal response") + } + + 
return &rg, nil +} + +// ListRules retrieves a rule group +func (r *CortexClient) ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) { + path := r.apiPath + if namespace != "" { + path = path + "/" + namespace + } + + res, err := r.doRequest(path, "GET", nil) + if err != nil { + return nil, err + } + + defer res.Body.Close() + body, err := ioutil.ReadAll(res.Body) + + if err != nil { + return nil, err + } + + ruleSet := map[string][]RuleGroup{} + err = yaml.Unmarshal(body, &ruleSet) + if err != nil { + return nil, err + } + + return ruleSet, nil +} diff --git a/pkg/mimir/client/rules_test.go b/pkg/mimir/client/rules_test.go new file mode 100644 index 000000000000..3ac13c9d4bd0 --- /dev/null +++ b/pkg/mimir/client/rules_test.go @@ -0,0 +1,76 @@ +package client + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestCortexClient_X(t *testing.T) { + requestCh := make(chan *http.Request, 1) + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCh <- r + fmt.Fprintln(w, "hello") + })) + defer ts.Close() + + client, err := New(Config{ + Address: ts.URL, + ID: "my-id", + Key: "my-key", + }) + require.NoError(t, err) + + for _, tc := range []struct { + test string + namespace string + name string + expURLPath string + }{ + { + test: "regular-characters", + namespace: "my-namespace", + name: "my-name", + expURLPath: "/api/v1/rules/my-namespace/my-name", + }, + { + test: "special-characters-spaces", + namespace: "My: Namespace", + name: "My: Name", + expURLPath: "/api/v1/rules/My:%20Namespace/My:%20Name", + }, + { + test: "special-characters-slashes", + namespace: "My/Namespace", + name: "My/Name", + expURLPath: "/api/v1/rules/My%2FNamespace/My%2FName", + }, + { + test: "special-characters-slash-first", + namespace: "My/Namespace", + name: "/first-char-slash", + expURLPath: 
"/api/v1/rules/My%2FNamespace/%2Ffirst-char-slash", + }, + { + test: "special-characters-slash-first", + namespace: "My/Namespace", + name: "last-char-slash/", + expURLPath: "/api/v1/rules/My%2FNamespace/last-char-slash%2F", + }, + } { + t.Run(tc.test, func(t *testing.T) { + ctx := context.Background() + require.NoError(t, client.DeleteRuleGroup(ctx, tc.namespace, tc.name)) + + req := <-requestCh + require.Equal(t, tc.expURLPath, req.URL.EscapedPath()) + + }) + } + +} From d4a65eff1d47eaf4cf43664e74926b5a93e0d666 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Tue, 13 Dec 2022 17:14:51 -0600 Subject: [PATCH 02/40] Clean up CortexClient and rename to Mimir - Use newer ruler API urls - Remove usage of ioutil - Rename Cortex to Mimir --- pkg/mimir/client/alerts.go | 12 ++++---- pkg/mimir/client/client.go | 34 ++++++++++----------- pkg/mimir/client/client_test.go | 54 ++++++++++++++++----------------- pkg/mimir/client/rules.go | 16 +++++----- pkg/mimir/client/rules_test.go | 15 +++++---- 5 files changed, 64 insertions(+), 67 deletions(-) diff --git a/pkg/mimir/client/alerts.go b/pkg/mimir/client/alerts.go index 61e4f41e0457..47b049b0b17f 100644 --- a/pkg/mimir/client/alerts.go +++ b/pkg/mimir/client/alerts.go @@ -2,7 +2,7 @@ package client import ( "context" - "io/ioutil" + "io" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -17,7 +17,7 @@ type configCompat struct { } // CreateAlertmanagerConfig creates a new alertmanager config -func (r *CortexClient) CreateAlertmanagerConfig(ctx context.Context, cfg string, templates map[string]string) error { +func (r *MimirClient) CreateAlertmanagerConfig(ctx context.Context, cfg string, templates map[string]string) error { payload, err := yaml.Marshal(&configCompat{ TemplateFiles: templates, AlertmanagerConfig: cfg, @@ -37,7 +37,7 @@ func (r *CortexClient) CreateAlertmanagerConfig(ctx context.Context, cfg string, } // DeleteAlermanagerConfig deletes the users alertmanagerconfig -func (r *CortexClient) 
DeleteAlermanagerConfig(ctx context.Context) error { +func (r *MimirClient) DeleteAlermanagerConfig(ctx context.Context) error { res, err := r.doRequest(alertmanagerAPIPath, "DELETE", nil) if err != nil { return err @@ -48,8 +48,8 @@ func (r *CortexClient) DeleteAlermanagerConfig(ctx context.Context) error { return nil } -// GetAlertmanagerConfig retrieves a rule group -func (r *CortexClient) GetAlertmanagerConfig(ctx context.Context) (string, map[string]string, error) { +// GetAlertmanagerConfig retrieves a Mimir cluster's Alertmanager config. +func (r *MimirClient) GetAlertmanagerConfig(ctx context.Context) (string, map[string]string, error) { res, err := r.doRequest(alertmanagerAPIPath, "GET", nil) if err != nil { log.Debugln("no alert config present in response") @@ -57,7 +57,7 @@ func (r *CortexClient) GetAlertmanagerConfig(ctx context.Context) (string, map[s } defer res.Body.Close() - body, err := ioutil.ReadAll(res.Body) + body, err := io.ReadAll(res.Body) if err != nil { return "", nil, err } diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index c6603abca552..5a5a9fab36cb 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -17,8 +17,8 @@ import ( ) const ( - rulerAPIPath = "/api/v1/rules" - legacyAPIPath = "/api/prom/rules" + rulerAPIPath = "/prometheus/config/v1/rules" + legacyAPIPath = "/api/v1/rules" ) var ( @@ -26,7 +26,7 @@ var ( ErrResourceNotFound = errors.New("requested resource not found") ) -// Config is used to configure a Ruler Client +// Config is used to configure a MimirClient. type Config struct { User string `yaml:"user"` Key string `yaml:"key"` @@ -37,8 +37,8 @@ type Config struct { AuthToken string `yaml:"auth_token"` } -// CortexClient is used to get and load rules into a cortex ruler -type CortexClient struct { +// MimirClient is a client to the Mimir API. 
+type MimirClient struct { user string key string id string @@ -48,8 +48,8 @@ type CortexClient struct { authToken string } -// New returns a new Client -func New(cfg Config) (*CortexClient, error) { +// New returns a new MimirClient. +func New(cfg Config) (*MimirClient, error) { endpoint, err := url.Parse(cfg.Address) if err != nil { return nil, err @@ -86,7 +86,7 @@ func New(cfg Config) (*CortexClient, error) { path = legacyAPIPath } - return &CortexClient{ + return &MimirClient{ user: cfg.User, key: cfg.Key, id: cfg.ID, @@ -97,13 +97,11 @@ func New(cfg Config) (*CortexClient, error) { }, nil } -// Query executes a PromQL query against the Cortex cluster. -func (r *CortexClient) Query(ctx context.Context, query string) (*http.Response, error) { +// Query executes a PromQL query against the Mimir cluster. +func (r *MimirClient) Query(ctx context.Context, query string) (*http.Response, error) { + req := fmt.Sprintf("/prometheus/api/v1/query?query=%s&time=%d", url.QueryEscape(query), time.Now().Unix()) - query = fmt.Sprintf("query=%s&time=%d", query, time.Now().Unix()) - escapedQuery := url.PathEscape(query) - - res, err := r.doRequest("/api/prom/api/v1/query?"+escapedQuery, "GET", nil) + res, err := r.doRequest(req, "GET", nil) if err != nil { return nil, err } @@ -111,7 +109,7 @@ func (r *CortexClient) Query(ctx context.Context, query string) (*http.Response, return res, nil } -func (r *CortexClient) doRequest(path, method string, payload []byte) (*http.Response, error) { +func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Response, error) { req, err := buildRequest(path, method, *r.endpoint, payload) if err != nil { return nil, err @@ -123,7 +121,7 @@ func (r *CortexClient) doRequest(path, method string, payload []byte) (*http.Res "url": req.URL.String(), "method": req.Method, "error": err, - }).Errorln("error during request to cortex api") + }).Errorln("error during request to Mimir api") return nil, err } @@ -142,7 +140,7 @@ func (r 
*CortexClient) doRequest(path, method string, payload []byte) (*http.Res log.WithFields(log.Fields{ "url": req.URL.String(), "method": req.Method, - }).Debugln("sending request to cortex api") + }).Debugln("sending request to Mimir api") resp, err := r.Client.Do(req) if err != nil { @@ -150,7 +148,7 @@ func (r *CortexClient) doRequest(path, method string, payload []byte) (*http.Res "url": req.URL.String(), "method": req.Method, "error": err.Error(), - }).Errorln("error during request to cortex api") + }).Errorln("error during request to Mimir api") return nil, err } diff --git a/pkg/mimir/client/client_test.go b/pkg/mimir/client/client_test.go index 11b7233a8860..1313d22a4569 100644 --- a/pkg/mimir/client/client_test.go +++ b/pkg/mimir/client/client_test.go @@ -18,66 +18,66 @@ func TestBuildURL(t *testing.T) { }{ { name: "builds the correct URL with a trailing slash", - path: "/api/v1/rules", + path: "/prometheus/config/v1/rules", method: http.MethodPost, - url: "http://cortexurl.com/", - resultURL: "http://cortexurl.com/api/v1/rules", + url: "http://mimir.local/", + resultURL: "http://mimir.local/prometheus/config/v1/rules", }, { name: "builds the correct URL without a trailing slash", - path: "/api/v1/rules", + path: "/prometheus/config/v1/rules", method: http.MethodPost, - url: "http://cortexurl.com", - resultURL: "http://cortexurl.com/api/v1/rules", + url: "http://mimir.local", + resultURL: "http://mimir.local/prometheus/config/v1/rules", }, { name: "builds the correct URL when the base url has a path", - path: "/api/v1/rules", + path: "/prometheus/config/v1/rules", method: http.MethodPost, - url: "http://cortexurl.com/apathto", - resultURL: "http://cortexurl.com/apathto/api/v1/rules", + url: "http://mimir.local/apathto", + resultURL: "http://mimir.local/apathto/prometheus/config/v1/rules", }, { name: "builds the correct URL when the base url has a path with trailing slash", - path: "/api/v1/rules", + path: "/prometheus/config/v1/rules", method: 
http.MethodPost, - url: "http://cortexurl.com/apathto/", - resultURL: "http://cortexurl.com/apathto/api/v1/rules", + url: "http://mimir.local/apathto/", + resultURL: "http://mimir.local/apathto/prometheus/config/v1/rules", }, { name: "builds the correct URL with a trailing slash and the target path contains special characters", - path: "/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + path: "/prometheus/config/v1/rules/%20%2Fspace%F0%9F%8D%BB", method: http.MethodPost, - url: "http://cortexurl.com/", - resultURL: "http://cortexurl.com/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + url: "http://mimir.local/", + resultURL: "http://mimir.local/prometheus/config/v1/rules/%20%2Fspace%F0%9F%8D%BB", }, { name: "builds the correct URL without a trailing slash and the target path contains special characters", - path: "/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + path: "/prometheus/config/v1/rules/%20%2Fspace%F0%9F%8D%BB", method: http.MethodPost, - url: "http://cortexurl.com", - resultURL: "http://cortexurl.com/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + url: "http://mimir.local", + resultURL: "http://mimir.local/prometheus/config/v1/rules/%20%2Fspace%F0%9F%8D%BB", }, { name: "builds the correct URL when the base url has a path and the target path contains special characters", - path: "/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + path: "/prometheus/config/v1/rules/%20%2Fspace%F0%9F%8D%BB", method: http.MethodPost, - url: "http://cortexurl.com/apathto", - resultURL: "http://cortexurl.com/apathto/api/v1/rules/%20%2Fspace%F0%9F%8D%BB", + url: "http://mimir.local/apathto", + resultURL: "http://mimir.local/apathto/prometheus/config/v1/rules/%20%2Fspace%F0%9F%8D%BB", }, { name: "builds the correct URL when the base url has a path and the target path starts with a escaped slash", - path: "/api/v1/rules/%2F-first-char-slash", + path: "/prometheus/config/v1/rules/%2F-first-char-slash", method: http.MethodPost, - url: "http://cortexurl.com/apathto", - resultURL: 
"http://cortexurl.com/apathto/api/v1/rules/%2F-first-char-slash", + url: "http://mimir.local/apathto", + resultURL: "http://mimir.local/apathto/prometheus/config/v1/rules/%2F-first-char-slash", }, { name: "builds the correct URL when the base url has a path and the target path ends with a escaped slash", - path: "/api/v1/rules/last-char-slash%2F", + path: "/prometheus/config/v1/rules/last-char-slash%2F", method: http.MethodPost, - url: "http://cortexurl.com/apathto", - resultURL: "http://cortexurl.com/apathto/api/v1/rules/last-char-slash%2F", + url: "http://mimir.local/apathto", + resultURL: "http://mimir.local/apathto/prometheus/config/v1/rules/last-char-slash%2F", }, } diff --git a/pkg/mimir/client/rules.go b/pkg/mimir/client/rules.go index daca51e0134c..41a2e577f1b6 100644 --- a/pkg/mimir/client/rules.go +++ b/pkg/mimir/client/rules.go @@ -3,7 +3,7 @@ package client import ( "context" "fmt" - "io/ioutil" + "io" "net/url" "github.com/pkg/errors" @@ -25,7 +25,7 @@ type RemoteWriteConfig struct { } // CreateRuleGroup creates a new rule group -func (r *CortexClient) CreateRuleGroup(ctx context.Context, namespace string, rg RuleGroup) error { +func (r *MimirClient) CreateRuleGroup(ctx context.Context, namespace string, rg RuleGroup) error { payload, err := yaml.Marshal(&rg) if err != nil { return err @@ -44,8 +44,8 @@ func (r *CortexClient) CreateRuleGroup(ctx context.Context, namespace string, rg return nil } -// DeleteRuleGroup creates a new rule group -func (r *CortexClient) DeleteRuleGroup(ctx context.Context, namespace, groupName string) error { +// DeleteRuleGroup deletes a rule group +func (r *MimirClient) DeleteRuleGroup(ctx context.Context, namespace, groupName string) error { escapedNamespace := url.PathEscape(namespace) escapedGroupName := url.PathEscape(groupName) path := r.apiPath + "/" + escapedNamespace + "/" + escapedGroupName @@ -61,7 +61,7 @@ func (r *CortexClient) DeleteRuleGroup(ctx context.Context, namespace, groupName } // GetRuleGroup retrieves 
a rule group -func (r *CortexClient) GetRuleGroup(ctx context.Context, namespace, groupName string) (*RuleGroup, error) { +func (r *MimirClient) GetRuleGroup(ctx context.Context, namespace, groupName string) (*RuleGroup, error) { escapedNamespace := url.PathEscape(namespace) escapedGroupName := url.PathEscape(groupName) path := r.apiPath + "/" + escapedNamespace + "/" + escapedGroupName @@ -73,7 +73,7 @@ func (r *CortexClient) GetRuleGroup(ctx context.Context, namespace, groupName st } defer res.Body.Close() - body, err := ioutil.ReadAll(res.Body) + body, err := io.ReadAll(res.Body) if err != nil { return nil, err @@ -93,7 +93,7 @@ func (r *CortexClient) GetRuleGroup(ctx context.Context, namespace, groupName st } // ListRules retrieves a rule group -func (r *CortexClient) ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) { +func (r *MimirClient) ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) { path := r.apiPath if namespace != "" { path = path + "/" + namespace @@ -105,7 +105,7 @@ func (r *CortexClient) ListRules(ctx context.Context, namespace string) (map[str } defer res.Body.Close() - body, err := ioutil.ReadAll(res.Body) + body, err := io.ReadAll(res.Body) if err != nil { return nil, err diff --git a/pkg/mimir/client/rules_test.go b/pkg/mimir/client/rules_test.go index 3ac13c9d4bd0..d98acf4b6512 100644 --- a/pkg/mimir/client/rules_test.go +++ b/pkg/mimir/client/rules_test.go @@ -10,7 +10,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestCortexClient_X(t *testing.T) { +func TestMimirClient_X(t *testing.T) { requestCh := make(chan *http.Request, 1) ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -36,31 +36,31 @@ func TestCortexClient_X(t *testing.T) { test: "regular-characters", namespace: "my-namespace", name: "my-name", - expURLPath: "/api/v1/rules/my-namespace/my-name", + expURLPath: "/prometheus/config/v1/rules/my-namespace/my-name", }, { 
test: "special-characters-spaces", namespace: "My: Namespace", name: "My: Name", - expURLPath: "/api/v1/rules/My:%20Namespace/My:%20Name", + expURLPath: "/prometheus/config/v1/rules/My:%20Namespace/My:%20Name", }, { test: "special-characters-slashes", namespace: "My/Namespace", name: "My/Name", - expURLPath: "/api/v1/rules/My%2FNamespace/My%2FName", + expURLPath: "/prometheus/config/v1/rules/My%2FNamespace/My%2FName", }, { test: "special-characters-slash-first", namespace: "My/Namespace", name: "/first-char-slash", - expURLPath: "/api/v1/rules/My%2FNamespace/%2Ffirst-char-slash", + expURLPath: "/prometheus/config/v1/rules/My%2FNamespace/%2Ffirst-char-slash", }, { - test: "special-characters-slash-first", + test: "special-characters-slash-last", namespace: "My/Namespace", name: "last-char-slash/", - expURLPath: "/api/v1/rules/My%2FNamespace/last-char-slash%2F", + expURLPath: "/prometheus/config/v1/rules/My%2FNamespace/last-char-slash%2F", }, } { t.Run(tc.test, func(t *testing.T) { @@ -69,7 +69,6 @@ func TestCortexClient_X(t *testing.T) { req := <-requestCh require.Equal(t, tc.expURLPath, req.URL.EscapedPath()) - }) } From 39d9a4749a3adba9b41a0a3401a136a25b1288db Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 2 Dec 2022 16:54:06 -0600 Subject: [PATCH 03/40] Initial implementation pulling from CRDs --- component/mimir/rules/rules.go | 147 ++++++++++++++++++++++++++++----- 1 file changed, 126 insertions(+), 21 deletions(-) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index bfa9c6138cae..d43e734f1dc6 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -2,14 +2,23 @@ package rules import ( "context" + "fmt" "time" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/agent/component" - "github.com/grafana/agent/pkg/mimir/client" + "github.com/grafana/agent/pkg/flow/rivertypes" + mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" - v1 
"github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + controller "sigs.k8s.io/controller-runtime" + k8sClient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) func init() { @@ -26,16 +35,30 @@ func init() { type Arguments struct { ClientParams ClientArguments `river:"client,block"` SyncInterval time.Duration `river:"sync_interval,attr,optional"` + + RuleSelector LabelSelector `river:"rule_selector,block,optional"` + RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` +} + +type LabelSelector struct { + MatchLabels map[string]string `river:"match_labels,attr"` + MatchExpressions []MatchExpression `river:"match_expressions,attr"` +} + +type MatchExpression struct { + Key string `river:"key,attr"` + Operator string `river:"operator,attr"` + Values []string `river:"values,attr"` } type ClientArguments struct { - User string `river:"user,attr,optional"` - Key string `river:"key,attr,optional"` - Address string `river:"address,attr"` - ID string `river:"id,attr,optional"` - TLS TLSArguments `river:"tls,block,optional"` - UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` - AuthToken string `river:"auth_token,attr,optional"` + User string `river:"user,attr,optional"` + Key rivertypes.Secret `river:"key,attr,optional"` + Address string `river:"address,attr"` + ID string `river:"id,attr,optional"` + TLS TLSArguments `river:"tls,block,optional"` + UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` + AuthToken rivertypes.Secret `river:"auth_token,attr,optional"` } type TLSArguments struct { @@ -56,11 +79,16 @@ type Component struct { opts component.Options args Arguments - client *client.MimirClient - ticker *time.Ticker 
+ mimirClient *mimirClient.MimirClient + k8sClient k8sClient.Client + ticker *time.Ticker + + namespaceSelector labels.Selector + ruleSelector labels.Selector } var _ component.Component = (*Component)(nil) +var _ reconcile.Reconciler = (*Component)(nil) func NewComponent(o component.Options, c Arguments) (*Component, error) { return &Component{ @@ -86,15 +114,36 @@ func (c *Component) Update(newConfig component.Arguments) error { return c.init() } +func (c *Component) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{}, nil +} + func (c *Component) init() error { + + // TODO: allow overriding some stuff in RestConfig and k8s client options? + restConfig := controller.GetConfigOrDie() + + scheme := runtime.NewScheme() + err := corev1.AddToScheme(scheme) + if err != nil { + return fmt.Errorf("failed to add prometheus operator scheme: %w", err) + } + err = promv1.AddToScheme(scheme) + if err != nil { + return fmt.Errorf("failed to add prometheus operator scheme: %w", err) + } + + c.k8sClient, err = k8sClient.New(restConfig, k8sClient.Options{ + Scheme: scheme, + }) + if c.args.SyncInterval == 0 { c.args.SyncInterval = 30 * time.Second } - var err error - c.client, err = client.New(client.Config{ + c.mimirClient, err = mimirClient.New(mimirClient.Config{ User: c.args.ClientParams.User, - Key: c.args.ClientParams.Key, + Key: string(c.args.ClientParams.Key), Address: c.args.ClientParams.Address, ID: c.args.ClientParams.ID, TLS: tls.ClientConfig{ @@ -107,7 +156,7 @@ func (c *Component) init() error { MinVersion: c.args.ClientParams.TLS.MinVersion, }, UseLegacyRoutes: c.args.ClientParams.UseLegacyRoutes, - AuthToken: c.args.ClientParams.AuthToken, + AuthToken: string(c.args.ClientParams.AuthToken), }) if err != nil { return err @@ -115,14 +164,28 @@ func (c *Component) init() error { c.ticker = time.NewTicker(c.args.SyncInterval) + c.namespaceSelector, err = 
convertSelectorToListOptions(c.args.RuleNamespaceSelector) + if err != nil { + return err + } + + c.ruleSelector, err = convertSelectorToListOptions(c.args.RuleSelector) + if err != nil { + return err + } + return nil } func (c *Component) start(ctx context.Context) { + err := c.syncRules(ctx) + if err != nil { + level.Error(c.log).Log("msg", "failed to sync rules", "err", err) + } + for { select { case <-c.ticker.C: - level.Info(c.log).Log("msg", "syncing rules") err := c.syncRules(ctx) if err != nil { level.Error(c.log).Log("msg", "failed to sync rules", "err", err) @@ -135,6 +198,8 @@ func (c *Component) start(ctx context.Context) { } func (c *Component) syncRules(ctx context.Context) error { + level.Info(c.log).Log("msg", "syncing rules") + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() @@ -155,18 +220,58 @@ func (c *Component) syncRules(ctx context.Context) error { return c.applyChanges(ctx, diff) } -func (c *Component) discoverRuleCRDs(ctx context.Context) ([]v1.PrometheusRule, error) { - return nil, nil +func convertSelectorToListOptions(selector LabelSelector) (labels.Selector, error) { + matchExpressions := []metav1.LabelSelectorRequirement{} + + for _, me := range selector.MatchExpressions { + matchExpressions = append(matchExpressions, metav1.LabelSelectorRequirement{ + Key: me.Key, + Operator: metav1.LabelSelectorOperator(me.Operator), + Values: me.Values, + }) + } + + return metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: selector.MatchLabels, + MatchExpressions: matchExpressions, + }) +} + +func (c *Component) discoverRuleCRDs(ctx context.Context) ([]*promv1.PrometheusRule, error) { + // List namespaces + var namespaces corev1.NamespaceList + err := c.k8sClient.List(ctx, &namespaces, &k8sClient.ListOptions{ + LabelSelector: c.namespaceSelector, + }) + if err != nil { + return nil, err + } + + var crds []*promv1.PrometheusRule + // List rules in each namespace + for _, namespace := range namespaces.Items { + 
var crdList promv1.PrometheusRuleList + err := c.k8sClient.List(ctx, &crdList, &k8sClient.ListOptions{ + LabelSelector: c.ruleSelector, + Namespace: namespace.Name, + }) + if err != nil { + return nil, err + } + + crds = append(crds, crdList.Items...) + } + return crds, nil } -func (c *Component) loadActiveRules(ctx context.Context) (map[string][]client.RuleGroup, error) { - return c.client.ListRules(ctx, "") +func (c *Component) loadActiveRules(ctx context.Context) (map[string][]mimirClient.RuleGroup, error) { + return c.mimirClient.ListRules(ctx, "") } type RuleGroupDiff struct { } -func (c *Component) diffRuleStates(desired []v1.PrometheusRule, actual map[string][]client.RuleGroup) []RuleGroupDiff { +func (c *Component) diffRuleStates(desired []*promv1.PrometheusRule, actual map[string][]mimirClient.RuleGroup) []RuleGroupDiff { return nil } From ee18ab4ad6f8e087b9447b8de9fe75a0177e5470 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 2 Dec 2022 17:08:48 -0600 Subject: [PATCH 04/40] Add diffing algorithm to update rules --- component/mimir/rules/rules.go | 99 ++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 15 deletions(-) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index d43e734f1dc6..9601e79c294c 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -33,8 +33,9 @@ func init() { } type Arguments struct { - ClientParams ClientArguments `river:"client,block"` - SyncInterval time.Duration `river:"sync_interval,attr,optional"` + ClientParams ClientArguments `river:"client,block"` + SyncInterval time.Duration `river:"sync_interval,attr,optional"` + MimirRuleNamespace string `river:"mimir_rule_namespace,attr"` RuleSelector LabelSelector `river:"rule_selector,block,optional"` RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` @@ -119,6 +120,9 @@ func (c *Component) Reconcile(ctx context.Context, req reconcile.Request) (recon } func (c 
*Component) init() error { + if c.args.SyncInterval == 0 { + c.args.SyncInterval = 30 * time.Second + } // TODO: allow overriding some stuff in RestConfig and k8s client options? restConfig := controller.GetConfigOrDie() @@ -137,10 +141,6 @@ func (c *Component) init() error { Scheme: scheme, }) - if c.args.SyncInterval == 0 { - c.args.SyncInterval = 30 * time.Second - } - c.mimirClient, err = mimirClient.New(mimirClient.Config{ User: c.args.ClientParams.User, Key: string(c.args.ClientParams.Key), @@ -207,17 +207,15 @@ func (c *Component) syncRules(ctx context.Context) error { if err != nil { return err } - level.Debug(c.log).Log("msg", "found rule crds", "num_crds", len(desiredState)) actualState, err := c.loadActiveRules(ctx) if err != nil { return err } - level.Debug(c.log).Log("msg", "found active rules", "num_namespaces", len(actualState)) - diff := c.diffRuleStates(desiredState, actualState) + diffs := c.diffRuleStates(desiredState, actualState) - return c.applyChanges(ctx, diff) + return c.applyChanges(ctx, diffs) } func convertSelectorToListOptions(selector LabelSelector) (labels.Selector, error) { @@ -264,17 +262,88 @@ func (c *Component) discoverRuleCRDs(ctx context.Context) ([]*promv1.PrometheusR return crds, nil } -func (c *Component) loadActiveRules(ctx context.Context) (map[string][]mimirClient.RuleGroup, error) { - return c.mimirClient.ListRules(ctx, "") +func (c *Component) loadActiveRules(ctx context.Context) ([]mimirClient.RuleGroup, error) { + rulesByNamespace, err := c.mimirClient.ListRules(ctx, c.args.MimirRuleNamespace) + if err != nil { + return nil, err + } + + return rulesByNamespace[c.args.MimirRuleNamespace], nil } +type RuleGroupDiffKind string + +const ( + RuleGroupDiffKindAdd RuleGroupDiffKind = "add" + RuleGroupDiffKindRemove RuleGroupDiffKind = "remove" + RuleGroupDiffKindUpdate RuleGroupDiffKind = "update" +) + type RuleGroupDiff struct { + Kind RuleGroupDiffKind + Actual mimirClient.RuleGroup + Desired promv1.RuleGroup } -func (c 
*Component) diffRuleStates(desired []*promv1.PrometheusRule, actual map[string][]mimirClient.RuleGroup) []RuleGroupDiff { - return nil +func (c *Component) diffRuleStates(desired []*promv1.PrometheusRule, actual []mimirClient.RuleGroup) []RuleGroupDiff { + var diff []RuleGroupDiff + + seenGroups := map[string]bool{} + + for _, desiredRule := range desired { + desiredGroups: + for _, desiredRuleGroup := range desiredRule.Spec.Groups { + for _, actualRuleGroup := range actual { + if desiredRuleGroup.Name == actualRuleGroup.Name { + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindUpdate, + Actual: actualRuleGroup, + Desired: desiredRuleGroup, + }) + continue desiredGroups + } + } + + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindAdd, + Desired: desiredRuleGroup, + }) + } + } + + for _, actualRuleGroup := range actual { + if seenGroups[actualRuleGroup.Name] { + continue + } + + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindRemove, + Actual: actualRuleGroup, + }) + } + + return diff } -func (c *Component) applyChanges(ctx context.Context, diff []RuleGroupDiff) error { +func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) error { + if len(diffs) == 0 { + return nil + } + + level.Info(c.log).Log("msg", "applying rule changes", "num_changes", len(diffs)) + + for _, diff := range diffs { + switch diff.Kind { + case RuleGroupDiffKindAdd: + level.Info(c.log).Log("msg", "adding rule group", "group", diff.Desired.Name) + case RuleGroupDiffKindRemove: + level.Info(c.log).Log("msg", "removing rule group", "group", diff.Actual.Name) + case RuleGroupDiffKindUpdate: + level.Info(c.log).Log("msg", "updating rule group", "group", diff.Desired.Name) + default: + level.Error(c.log).Log("msg", "unknown rule group diff kind", "kind", diff.Kind) + } + } + return nil } From 257dcd5783e5eb594b45c4549cd8b709b29cd83e Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Sat, 3 Dec 2022 14:35:40 -0600 Subject: [PATCH 05/40] Apply 
diffs to Mimir --- component/mimir/rules/rules.go | 64 +++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 9601e79c294c..e062ff3c43db 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -5,13 +5,16 @@ import ( "fmt" "time" + "github.com/ghodss/yaml" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/agent/component" "github.com/grafana/agent/pkg/flow/rivertypes" mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" + "github.com/grafana/dskit/multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/rulefmt" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -140,6 +143,9 @@ func (c *Component) init() error { c.k8sClient, err = k8sClient.New(restConfig, k8sClient.Options{ Scheme: scheme, }) + if err != nil { + return fmt.Errorf("failed to create k8s client: %w", err) + } c.mimirClient, err = mimirClient.New(mimirClient.Config{ User: c.args.ClientParams.User, @@ -198,7 +204,7 @@ func (c *Component) start(ctx context.Context) { } func (c *Component) syncRules(ctx context.Context) error { - level.Info(c.log).Log("msg", "syncing rules") + level.Debug(c.log).Log("msg", "syncing rules") ctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() @@ -213,7 +219,10 @@ func (c *Component) syncRules(ctx context.Context) error { return err } - diffs := c.diffRuleStates(desiredState, actualState) + diffs, err := c.diffRuleStates(desiredState, actualState) + if err != nil { + return err + } return c.applyChanges(ctx, diffs) } @@ -282,23 +291,36 @@ const ( type RuleGroupDiff struct { Kind RuleGroupDiffKind Actual mimirClient.RuleGroup - Desired promv1.RuleGroup + Desired mimirClient.RuleGroup } -func (c *Component) 
diffRuleStates(desired []*promv1.PrometheusRule, actual []mimirClient.RuleGroup) []RuleGroupDiff { +func (c *Component) diffRuleStates(desired []*promv1.PrometheusRule, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { var diff []RuleGroupDiff seenGroups := map[string]bool{} for _, desiredRule := range desired { + translatedRuleGroups, err := convertCRDRuleGroupToRuleGroup(desiredRule.Spec) + if err != nil { + return nil, err + } + desiredGroups: - for _, desiredRuleGroup := range desiredRule.Spec.Groups { + for _, desiredRuleGroup := range translatedRuleGroups.Groups { + mimirRuleGroup := mimirClient.RuleGroup{ + RuleGroup: desiredRuleGroup, + // TODO: allow setting the remote write configs? + // RWConfigs: , + } + + seenGroups[desiredRuleGroup.Name] = true + for _, actualRuleGroup := range actual { if desiredRuleGroup.Name == actualRuleGroup.Name { diff = append(diff, RuleGroupDiff{ Kind: RuleGroupDiffKindUpdate, Actual: actualRuleGroup, - Desired: desiredRuleGroup, + Desired: mimirRuleGroup, }) continue desiredGroups } @@ -306,7 +328,7 @@ func (c *Component) diffRuleStates(desired []*promv1.PrometheusRule, actual []mi diff = append(diff, RuleGroupDiff{ Kind: RuleGroupDiffKindAdd, - Desired: desiredRuleGroup, + Desired: mimirRuleGroup, }) } } @@ -322,7 +344,7 @@ func (c *Component) diffRuleStates(desired []*promv1.PrometheusRule, actual []mi }) } - return diff + return diff, nil } func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) error { @@ -336,10 +358,22 @@ func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) err switch diff.Kind { case RuleGroupDiffKindAdd: level.Info(c.log).Log("msg", "adding rule group", "group", diff.Desired.Name) + err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) + if err != nil { + return err + } case RuleGroupDiffKindRemove: level.Info(c.log).Log("msg", "removing rule group", "group", diff.Actual.Name) + err := c.mimirClient.DeleteRuleGroup(ctx, 
c.args.MimirRuleNamespace, diff.Actual.Name) + if err != nil { + return err + } case RuleGroupDiffKindUpdate: level.Info(c.log).Log("msg", "updating rule group", "group", diff.Desired.Name) + err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) + if err != nil { + return err + } default: level.Error(c.log).Log("msg", "unknown rule group diff kind", "kind", diff.Kind) } @@ -347,3 +381,17 @@ func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) err return nil } + +func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) (*rulefmt.RuleGroups, error) { + buf, err := yaml.Marshal(crd) + if err != nil { + return &rulefmt.RuleGroups{}, err + } + + groups, errs := rulefmt.Parse(buf) + if len(errs) > 0 { + return &rulefmt.RuleGroups{}, multierror.New(errs...).Err() + } + + return groups, nil +} From 4454c1e3280cb36789d40510aea66581111bfe9b Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Mon, 5 Dec 2022 15:10:01 -0600 Subject: [PATCH 06/40] Rewrite reconciliation to use k8s informer pattern --- component/mimir/rules/rules.go | 385 ++++++++++++++++++++-------- component/mimir/rules/rules_test.go | 13 + go.mod | 1 + go.sum | 2 + 4 files changed, 289 insertions(+), 112 deletions(-) create mode 100644 component/mimir/rules/rules_test.go diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index e062ff3c43db..e2cea989acce 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -13,15 +13,21 @@ import ( mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/multierror" + "github.com/pkg/errors" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" controller "sigs.k8s.io/controller-runtime" k8sClient "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/reconcile" + + promExternalVersions "github.com/prometheus-operator/prometheus-operator/pkg/client/informers/externalversions" + promVersioned "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" ) func init() { @@ -83,49 +89,105 @@ type Component struct { opts component.Options args Arguments - mimirClient *mimirClient.MimirClient - k8sClient k8sClient.Client - ticker *time.Ticker + mimirClient *mimirClient.MimirClient + k8sClient k8sClient.Client + promClient promVersioned.Interface + ruleLister promListers.PrometheusRuleLister + ruleInformer cache.SharedIndexInformer + informerStopChan chan struct{} + ticker *time.Ticker + + queue workqueue.RateLimitingInterface + configUpdates chan ConfigUpdate namespaceSelector labels.Selector ruleSelector labels.Selector + + currentState []mimirClient.RuleGroup +} + +type ConfigUpdate struct { + args Arguments + err chan error } var _ component.Component = (*Component)(nil) -var _ reconcile.Reconciler = (*Component)(nil) func NewComponent(o component.Options, c Arguments) (*Component, error) { + setDefaultArguments(&c) return &Component{ - log: o.Logger, - opts: o, - args: c, + log: o.Logger, + opts: o, + args: c, + configUpdates: make(chan ConfigUpdate), + ticker: time.NewTicker(c.SyncInterval), }, nil } func (c *Component) Run(ctx context.Context) error { - err := c.init() + err := c.startup(ctx) if err != nil { return err } - c.start(ctx) + for { + select { + case update := <-c.configUpdates: + c.shutdown() + + c.args = update.args + err := c.startup(ctx) + update.err <- err + if err != nil { + return err + } + case <-ctx.Done(): + c.shutdown() + return nil + case <-c.ticker.C: + c.queue.Add(Event{ + Type: EventTypeSyncMimir, + }) + } 
+ } +} + +func (c *Component) startup(ctx context.Context) error { + err := c.init() + if err != nil { + return err + } + c.startRuleInformer() + c.syncMimir(ctx) + go c.eventLoop(ctx) return nil } +func (c *Component) shutdown() { + close(c.informerStopChan) + c.queue.ShutDownWithDrain() +} + func (c *Component) Update(newConfig component.Arguments) error { - c.args = newConfig.(Arguments) - return c.init() + errChan := make(chan error) + c.configUpdates <- ConfigUpdate{ + args: newConfig.(Arguments), + err: errChan, + } + return <-errChan } -func (c *Component) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - return reconcile.Result{}, nil +func setDefaultArguments(args *Arguments) { + if args.SyncInterval == 0 { + args.SyncInterval = 30 * time.Second + } } func (c *Component) init() error { - if c.args.SyncInterval == 0 { - c.args.SyncInterval = 30 * time.Second - } + level.Info(c.log).Log("msg", "initializing with new configuration") + + setDefaultArguments(&c.args) // TODO: allow overriding some stuff in RestConfig and k8s client options? 
restConfig := controller.GetConfigOrDie() @@ -147,6 +209,11 @@ func (c *Component) init() error { return fmt.Errorf("failed to create k8s client: %w", err) } + c.promClient, err = promVersioned.NewForConfig(restConfig) + if err != nil { + return fmt.Errorf("failed to create prometheus operator client: %w", err) + } + c.mimirClient, err = mimirClient.New(mimirClient.Config{ User: c.args.ClientParams.User, Key: string(c.args.ClientParams.Key), @@ -168,7 +235,7 @@ func (c *Component) init() error { return err } - c.ticker = time.NewTicker(c.args.SyncInterval) + c.ticker.Reset(c.args.SyncInterval) c.namespaceSelector, err = convertSelectorToListOptions(c.args.RuleNamespaceSelector) if err != nil { @@ -183,43 +250,26 @@ func (c *Component) init() error { return nil } -func (c *Component) start(ctx context.Context) { - err := c.syncRules(ctx) - if err != nil { - level.Error(c.log).Log("msg", "failed to sync rules", "err", err) - } - - for { - select { - case <-c.ticker.C: - err := c.syncRules(ctx) - if err != nil { - level.Error(c.log).Log("msg", "failed to sync rules", "err", err) - } - case <-ctx.Done(): - level.Info(c.log).Log("msg", "shutting down") - return - } - } -} - -func (c *Component) syncRules(ctx context.Context) error { - level.Debug(c.log).Log("msg", "syncing rules") - +func (c *Component) reconcileState(ctx context.Context) error { ctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() - desiredState, err := c.discoverRuleCRDs(ctx) + crdState, err := c.ruleLister.List(c.ruleSelector) if err != nil { - return err + return fmt.Errorf("failed to list rules: %w", err) } - actualState, err := c.loadActiveRules(ctx) - if err != nil { - return err + desiredState := []rulefmt.RuleGroup{} + for _, pr := range crdState { + groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) + if err != nil { + return fmt.Errorf("failed to convert rule group: %w", err) + } + + desiredState = append(desiredState, groups.Groups...) 
} - diffs, err := c.diffRuleStates(desiredState, actualState) + diffs, err := c.diffRuleStates(desiredState, c.currentState) if err != nil { return err } @@ -244,42 +294,6 @@ func convertSelectorToListOptions(selector LabelSelector) (labels.Selector, erro }) } -func (c *Component) discoverRuleCRDs(ctx context.Context) ([]*promv1.PrometheusRule, error) { - // List namespaces - var namespaces corev1.NamespaceList - err := c.k8sClient.List(ctx, &namespaces, &k8sClient.ListOptions{ - LabelSelector: c.namespaceSelector, - }) - if err != nil { - return nil, err - } - - var crds []*promv1.PrometheusRule - // List rules in each namespace - for _, namespace := range namespaces.Items { - var crdList promv1.PrometheusRuleList - err := c.k8sClient.List(ctx, &crdList, &k8sClient.ListOptions{ - LabelSelector: c.ruleSelector, - Namespace: namespace.Name, - }) - if err != nil { - return nil, err - } - - crds = append(crds, crdList.Items...) - } - return crds, nil -} - -func (c *Component) loadActiveRules(ctx context.Context) ([]mimirClient.RuleGroup, error) { - rulesByNamespace, err := c.mimirClient.ListRules(ctx, c.args.MimirRuleNamespace) - if err != nil { - return nil, err - } - - return rulesByNamespace[c.args.MimirRuleNamespace], nil -} - type RuleGroupDiffKind string const ( @@ -294,43 +308,37 @@ type RuleGroupDiff struct { Desired mimirClient.RuleGroup } -func (c *Component) diffRuleStates(desired []*promv1.PrometheusRule, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { +func (c *Component) diffRuleStates(desired []rulefmt.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { var diff []RuleGroupDiff seenGroups := map[string]bool{} - for _, desiredRule := range desired { - translatedRuleGroups, err := convertCRDRuleGroupToRuleGroup(desiredRule.Spec) - if err != nil { - return nil, err +desiredGroups: + for _, desiredRuleGroup := range desired { + mimirRuleGroup := mimirClient.RuleGroup{ + RuleGroup: desiredRuleGroup, + // TODO: allow setting the 
remote write configs? + // RWConfigs: , } - desiredGroups: - for _, desiredRuleGroup := range translatedRuleGroups.Groups { - mimirRuleGroup := mimirClient.RuleGroup{ - RuleGroup: desiredRuleGroup, - // TODO: allow setting the remote write configs? - // RWConfigs: , - } - - seenGroups[desiredRuleGroup.Name] = true - - for _, actualRuleGroup := range actual { - if desiredRuleGroup.Name == actualRuleGroup.Name { - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindUpdate, - Actual: actualRuleGroup, - Desired: mimirRuleGroup, - }) - continue desiredGroups - } + seenGroups[desiredRuleGroup.Name] = true + + for _, actualRuleGroup := range actual { + if desiredRuleGroup.Name == actualRuleGroup.Name { + // TODO: check if the rules are the same + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindUpdate, + Actual: actualRuleGroup, + Desired: mimirRuleGroup, + }) + continue desiredGroups } - - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindAdd, - Desired: mimirRuleGroup, - }) } + + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindAdd, + Desired: mimirRuleGroup, + }) } for _, actualRuleGroup := range actual { @@ -379,6 +387,8 @@ func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) err } } + c.syncMimir(ctx) + return nil } @@ -395,3 +405,154 @@ func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) (*rulefmt.Rul return groups, nil } + +func (c *Component) startRuleInformer() { + c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) + factory := promExternalVersions.NewSharedInformerFactory(c.promClient, 24*time.Hour) + + promRules := factory.Monitoring().V1().PrometheusRules() + c.ruleLister = promRules.Lister() + c.ruleInformer = promRules.Informer() + c.ruleInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from 
object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeAddRule, + NewRules: key, + }) + }, + UpdateFunc: func(oldObj, newObj interface{}) { + oldKey, err := cache.MetaNamespaceKeyFunc(oldObj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from object", "err", err) + return + } + + newKey, err := cache.MetaNamespaceKeyFunc(newObj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeUpdateRule, + NewRules: newKey, + OldRules: oldKey, + }) + }, + DeleteFunc: func(obj interface{}) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeDeleteRule, + OldRules: key, + }) + }, + }) + + c.informerStopChan = make(chan struct{}) + factory.Start(c.informerStopChan) + factory.WaitForCacheSync(c.informerStopChan) +} + +func (c *Component) eventLoop(ctx context.Context) { + for { + event, shutdown := c.queue.Get() + if shutdown { + level.Info(c.log).Log("msg", "shutting down event loop") + return + } + + evt := event.(Event) + err := c.processEvent(ctx, evt) + + if err != nil { + // TODO: retry limits? 
+ level.Error(c.log).Log("msg", "failed to process event", "err", err) + // c.queue.AddRateLimited(event) + } else { + c.queue.Forget(event) + c.queue.Done(event) + } + } +} + +func (c *Component) getRuleGroupsFromKey(key string) (*rulefmt.RuleGroups, error) { + obj, _, err := c.ruleInformer.GetIndexer().GetByKey(key) + if err != nil { + return nil, errors.Wrap(err, "failed to get rule from informer") + } + + groups, err := convertCRDRuleGroupToRuleGroup(obj.(*promv1.PrometheusRule).Spec) + if err != nil { + return nil, errors.Wrap(err, "failed to convert CRD rule group to rule group") + } + + return groups, nil +} + +func (c *Component) processEvent(ctx context.Context, e Event) error { + switch e.Type { + case EventTypeAddRule: + level.Info(c.log).Log("msg", "processing add rule event", "key", e.NewRules) + case EventTypeUpdateRule: + level.Info(c.log).Log("msg", "processing update rule event", "key", e.NewRules) + case EventTypeDeleteRule: + level.Info(c.log).Log("msg", "processing delete rule event", "key", e.OldRules) + case EventTypeAddNamespace: + case EventTypeDeleteNamespace: + case EventTypeUpdateNamespace: + case EventTypeSyncMimir: + level.Debug(c.log).Log("msg", "syncing current state from ruler") + c.syncMimir(ctx) + default: + return fmt.Errorf("unknown event type: %s", e.Type) + } + + return c.reconcileState(ctx) +} + +func (c *Component) syncMimir(ctx context.Context) { + rulesByNamespace, err := c.mimirClient.ListRules(ctx, c.args.MimirRuleNamespace) + if err != nil { + level.Error(c.log).Log("msg", "failed to list rules from mimir", "err", err) + return + } + + c.currentState = rulesByNamespace[c.args.MimirRuleNamespace] +} + +// This type must be hashable, so it is kept simple. The indexer will maintain a +// cache of current state, so this is only used for logging. 
+type Event struct { + Type EventType + + NewRules string + OldRules string + + NewNamespace string + OldNamespace string +} + +type EventType string + +const ( + EventTypeAddRule EventType = "add-rule" + EventTypeUpdateRule EventType = "update-rule" + EventTypeDeleteRule EventType = "delete-rule" + + EventTypeAddNamespace EventType = "add-namespace" + EventTypeUpdateNamespace EventType = "update-namespace" + EventTypeDeleteNamespace EventType = "delete-namespace" + + EventTypeSyncMimir EventType = "sync-mimir" +) diff --git a/component/mimir/rules/rules_test.go b/component/mimir/rules/rules_test.go new file mode 100644 index 000000000000..f2e41821942e --- /dev/null +++ b/component/mimir/rules/rules_test.go @@ -0,0 +1,13 @@ +package rules + +import ( + "testing" + + "k8s.io/client-go/util/workqueue" +) + +func TestEventTypeIsHashable(t *testing.T) { + // This test is here to ensure that the EventType type is hashable according to the workqueue implementation + queue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) + queue.AddRateLimited(Event{}) +} diff --git a/go.mod b/go.mod index e57b143b6c10..af226e61129c 100644 --- a/go.mod +++ b/go.mod @@ -442,6 +442,7 @@ require ( github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/power-devops/perfstat v0.0.0-20220216144756-c35f1ee13d7c // indirect + github.com/prometheus-operator/prometheus-operator/pkg/client v0.61.1 // indirect github.com/prometheus/alertmanager v0.24.0 // indirect github.com/prometheus/common/sigv4 v0.1.0 // indirect github.com/prometheus/exporter-toolkit v0.8.2 // indirect diff --git a/go.sum b/go.sum index 82ef7abf4a2d..3b49154a9b86 100644 --- a/go.sum +++ b/go.sum @@ -2511,6 +2511,8 @@ github.com/prometheus-operator/prometheus-operator v0.61.1 h1:byPe1OQHzTQ2js3hjc github.com/prometheus-operator/prometheus-operator v0.61.1/go.mod h1:fNWiLmBou1oPiL8JEU0N9Qykm585HxU9bAebmjBalmM= 
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.61.1 h1:ViIkBYnAUumtx9D7PiVPc1n8kNvwm+WMepDZWTZCBPc= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.61.1/go.mod h1:j51242bf6LQwvJ1JPKWApzTnifmCwcQq0i1p29ylWiM= +github.com/prometheus-operator/prometheus-operator/pkg/client v0.61.1 h1:y5ILBCB26Jztm/lgPwm7EcIPxfG20NbY8irIvCIZfKg= +github.com/prometheus-operator/prometheus-operator/pkg/client v0.61.1/go.mod h1:hnvR2Lm/j9sLB1mZHl9gwnuzHuC3iyX4eUPx1SVogF8= github.com/prometheus/alertmanager v0.23.1-0.20210914172521-e35efbddb66a/go.mod h1:U7pGu+z7A9ZKhK8lq1MvIOp5GdVlZjwOYk+S0h3LSbA= github.com/prometheus/alertmanager v0.24.0 h1:HBWR3lk4uy3ys+naDZthDdV7yEsxpaNeZuUS+hJgrOw= github.com/prometheus/alertmanager v0.24.0/go.mod h1:r6fy/D7FRuZh5YbnX6J3MBY0eI4Pb5yPYS7/bPSXXqI= From f9656a223f689c515c27eeaa1e6a4194b2bd3453 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Mon, 5 Dec 2022 15:24:32 -0600 Subject: [PATCH 07/40] Only update rule groups that actually change --- component/mimir/rules/rules.go | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index e2cea989acce..6f511c361a5f 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -1,6 +1,7 @@ package rules import ( + "bytes" "context" "fmt" "time" @@ -17,6 +18,7 @@ import ( promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" + yamlv3 "gopkg.in/yaml.v3" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -325,6 +327,10 @@ desiredGroups: for _, actualRuleGroup := range actual { if desiredRuleGroup.Name == actualRuleGroup.Name { + if equalRuleGroups(desiredRuleGroup, actualRuleGroup.RuleGroup) { + continue desiredGroups 
+ } + // TODO: check if the rules are the same diff = append(diff, RuleGroupDiff{ Kind: RuleGroupDiffKindUpdate, @@ -556,3 +562,26 @@ const ( EventTypeSyncMimir EventType = "sync-mimir" ) + +func equalRuleGroups(a, b rulefmt.RuleGroup) bool { + aBuf, err := yamlv3.Marshal(a) + if err != nil { + return false + } + bBuf, err := yamlv3.Marshal(b) + if err != nil { + return false + } + + if !bytes.Equal(aBuf, bBuf) { + + fmt.Println("----") + fmt.Println(string(aBuf)) + fmt.Println("----") + fmt.Println(string(bBuf)) + + return false + } + + return bytes.Equal(aBuf, bBuf) +} From 1523a481996d9d09d0a431e0cc0d8aa69c978f15 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Mon, 5 Dec 2022 15:36:29 -0600 Subject: [PATCH 08/40] Move arguments into a separate file --- component/mimir/rules/arguments.go | 47 ++++++++++++++++++++++++++++++ component/mimir/rules/rules.go | 41 -------------------------- 2 files changed, 47 insertions(+), 41 deletions(-) create mode 100644 component/mimir/rules/arguments.go diff --git a/component/mimir/rules/arguments.go b/component/mimir/rules/arguments.go new file mode 100644 index 000000000000..98581845952a --- /dev/null +++ b/component/mimir/rules/arguments.go @@ -0,0 +1,47 @@ +package rules + +import ( + "time" + + "github.com/grafana/agent/pkg/flow/rivertypes" +) + +type Arguments struct { + ClientParams ClientArguments `river:"client,block"` + SyncInterval time.Duration `river:"sync_interval,attr,optional"` + MimirRuleNamespace string `river:"mimir_rule_namespace,attr"` + + RuleSelector LabelSelector `river:"rule_selector,block,optional"` + RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` +} + +type LabelSelector struct { + MatchLabels map[string]string `river:"match_labels,attr,optional"` + MatchExpressions []MatchExpression `river:"match_expressions,attr,optional"` +} + +type MatchExpression struct { + Key string `river:"key,attr"` + Operator string `river:"operator,attr"` + Values []string 
`river:"values,attr"` +} + +type ClientArguments struct { + User string `river:"user,attr,optional"` + Key rivertypes.Secret `river:"key,attr,optional"` + Address string `river:"address,attr"` + ID string `river:"id,attr,optional"` + TLS TLSArguments `river:"tls,block,optional"` + UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` + AuthToken rivertypes.Secret `river:"auth_token,attr,optional"` +} + +type TLSArguments struct { + CertPath string `river:"tls_cert_path,attr,optional"` + KeyPath string `river:"tls_key_path,attr,optional"` + CAPath string `river:"tls_ca_path,attr,optional"` + ServerName string `river:"tls_server_name,attr,optional"` + InsecureSkipVerify bool `river:"tls_insecure_skip_verify,attr,optional"` + CipherSuites string `river:"tls_cipher_suites,attr,optional"` + MinVersion string `river:"tls_min_version,attr,optional"` +} diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 6f511c361a5f..6421bd6f2305 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -10,7 +10,6 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/agent/component" - "github.com/grafana/agent/pkg/flow/rivertypes" mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/multierror" @@ -43,46 +42,6 @@ func init() { }) } -type Arguments struct { - ClientParams ClientArguments `river:"client,block"` - SyncInterval time.Duration `river:"sync_interval,attr,optional"` - MimirRuleNamespace string `river:"mimir_rule_namespace,attr"` - - RuleSelector LabelSelector `river:"rule_selector,block,optional"` - RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` -} - -type LabelSelector struct { - MatchLabels map[string]string `river:"match_labels,attr"` - MatchExpressions []MatchExpression `river:"match_expressions,attr"` -} - -type MatchExpression struct { - Key string `river:"key,attr"` - 
Operator string `river:"operator,attr"` - Values []string `river:"values,attr"` -} - -type ClientArguments struct { - User string `river:"user,attr,optional"` - Key rivertypes.Secret `river:"key,attr,optional"` - Address string `river:"address,attr"` - ID string `river:"id,attr,optional"` - TLS TLSArguments `river:"tls,block,optional"` - UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` - AuthToken rivertypes.Secret `river:"auth_token,attr,optional"` -} - -type TLSArguments struct { - CertPath string `river:"tls_cert_path,attr,optional"` - KeyPath string `river:"tls_key_path,attr,optional"` - CAPath string `river:"tls_ca_path,attr,optional"` - ServerName string `river:"tls_server_name,attr,optional"` - InsecureSkipVerify bool `river:"tls_insecure_skip_verify,attr,optional"` - CipherSuites string `river:"tls_cipher_suites,attr,optional"` - MinVersion string `river:"tls_min_version,attr,optional"` -} - type Exports struct { } From 4a94e595570a739103805bff83e8e23c93e74839 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Mon, 5 Dec 2022 16:04:23 -0600 Subject: [PATCH 09/40] Split implementation into smaller files --- component/mimir/rules/diff.go | 88 +++++++++++ component/mimir/rules/events.go | 169 ++++++++++++++++++++ component/mimir/rules/rules.go | 271 +------------------------------- 3 files changed, 260 insertions(+), 268 deletions(-) create mode 100644 component/mimir/rules/diff.go create mode 100644 component/mimir/rules/events.go diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go new file mode 100644 index 000000000000..011dc1b0306e --- /dev/null +++ b/component/mimir/rules/diff.go @@ -0,0 +1,88 @@ +package rules + +import ( + "bytes" + + mimirClient "github.com/grafana/agent/pkg/mimir/client" + "github.com/prometheus/prometheus/model/rulefmt" + + "gopkg.in/yaml.v3" +) + +type RuleGroupDiffKind string + +const ( + RuleGroupDiffKindAdd RuleGroupDiffKind = "add" + RuleGroupDiffKindRemove RuleGroupDiffKind = "remove" + 
RuleGroupDiffKindUpdate RuleGroupDiffKind = "update" +) + +type RuleGroupDiff struct { + Kind RuleGroupDiffKind + Actual mimirClient.RuleGroup + Desired mimirClient.RuleGroup +} + +func diffRuleStates(desired []rulefmt.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { + var diff []RuleGroupDiff + + seenGroups := map[string]bool{} + +desiredGroups: + for _, desiredRuleGroup := range desired { + mimirRuleGroup := mimirClient.RuleGroup{ + RuleGroup: desiredRuleGroup, + // TODO: allow setting the remote write configs? + // RWConfigs: , + } + + seenGroups[desiredRuleGroup.Name] = true + + for _, actualRuleGroup := range actual { + if desiredRuleGroup.Name == actualRuleGroup.Name { + if equalRuleGroups(desiredRuleGroup, actualRuleGroup.RuleGroup) { + continue desiredGroups + } + + // TODO: check if the rules are the same + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindUpdate, + Actual: actualRuleGroup, + Desired: mimirRuleGroup, + }) + continue desiredGroups + } + } + + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindAdd, + Desired: mimirRuleGroup, + }) + } + + for _, actualRuleGroup := range actual { + if seenGroups[actualRuleGroup.Name] { + continue + } + + diff = append(diff, RuleGroupDiff{ + Kind: RuleGroupDiffKindRemove, + Actual: actualRuleGroup, + }) + } + + return diff, nil +} + +func equalRuleGroups(a, b rulefmt.RuleGroup) bool { + aBuf, err := yaml.Marshal(a) + if err != nil { + return false + } + bBuf, err := yaml.Marshal(b) + if err != nil { + return false + } + + return bytes.Equal(aBuf, bBuf) +} diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go new file mode 100644 index 000000000000..96d73e048beb --- /dev/null +++ b/component/mimir/rules/events.go @@ -0,0 +1,169 @@ +package rules + +import ( + "context" + "fmt" + "time" + + "github.com/ghodss/yaml" + "github.com/go-kit/log/level" + "github.com/grafana/dskit/multierror" + promv1 
"github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/rulefmt" +) + +// This type must be hashable, so it is kept simple. The indexer will maintain a +// cache of current state, so this is only used for logging. +type Event struct { + Type EventType + + NewRules string + OldRules string + + NewNamespace string + OldNamespace string +} + +type EventType string + +const ( + EventTypeAddRule EventType = "add-rule" + EventTypeUpdateRule EventType = "update-rule" + EventTypeDeleteRule EventType = "delete-rule" + + EventTypeAddNamespace EventType = "add-namespace" + EventTypeUpdateNamespace EventType = "update-namespace" + EventTypeDeleteNamespace EventType = "delete-namespace" + + EventTypeSyncMimir EventType = "sync-mimir" +) + +func (c *Component) eventLoop(ctx context.Context) { + for { + event, shutdown := c.queue.Get() + if shutdown { + level.Info(c.log).Log("msg", "shutting down event loop") + return + } + + evt := event.(Event) + err := c.processEvent(ctx, evt) + + if err != nil { + // TODO: retry limits? 
+ level.Error(c.log).Log("msg", "failed to process event", "err", err) + // c.queue.AddRateLimited(event) + } else { + c.queue.Forget(event) + c.queue.Done(event) + } + } +} +func (c *Component) processEvent(ctx context.Context, e Event) error { + switch e.Type { + case EventTypeAddRule: + level.Info(c.log).Log("msg", "processing add rule event", "key", e.NewRules) + case EventTypeUpdateRule: + level.Info(c.log).Log("msg", "processing update rule event", "key", e.NewRules) + case EventTypeDeleteRule: + level.Info(c.log).Log("msg", "processing delete rule event", "key", e.OldRules) + case EventTypeAddNamespace: + case EventTypeDeleteNamespace: + case EventTypeUpdateNamespace: + case EventTypeSyncMimir: + level.Debug(c.log).Log("msg", "syncing current state from ruler") + c.syncMimir(ctx) + default: + return fmt.Errorf("unknown event type: %s", e.Type) + } + + return c.reconcileState(ctx) +} + +func (c *Component) syncMimir(ctx context.Context) { + rulesByNamespace, err := c.mimirClient.ListRules(ctx, c.args.MimirRuleNamespace) + if err != nil { + level.Error(c.log).Log("msg", "failed to list rules from mimir", "err", err) + return + } + + c.currentState = rulesByNamespace[c.args.MimirRuleNamespace] +} + +func (c *Component) reconcileState(ctx context.Context) error { + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + crdState, err := c.ruleLister.List(c.ruleSelector) + if err != nil { + return fmt.Errorf("failed to list rules: %w", err) + } + + desiredState := []rulefmt.RuleGroup{} + for _, pr := range crdState { + groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) + if err != nil { + return fmt.Errorf("failed to convert rule group: %w", err) + } + + desiredState = append(desiredState, groups.Groups...) 
+ } + + diffs, err := diffRuleStates(desiredState, c.currentState) + if err != nil { + return err + } + + return c.applyChanges(ctx, diffs) +} + +func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) (*rulefmt.RuleGroups, error) { + buf, err := yaml.Marshal(crd) + if err != nil { + return &rulefmt.RuleGroups{}, err + } + + groups, errs := rulefmt.Parse(buf) + if len(errs) > 0 { + return &rulefmt.RuleGroups{}, multierror.New(errs...).Err() + } + + return groups, nil +} + +func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) error { + if len(diffs) == 0 { + return nil + } + + level.Info(c.log).Log("msg", "applying rule changes", "num_changes", len(diffs)) + + for _, diff := range diffs { + switch diff.Kind { + case RuleGroupDiffKindAdd: + level.Info(c.log).Log("msg", "adding rule group", "group", diff.Desired.Name) + err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) + if err != nil { + return err + } + case RuleGroupDiffKindRemove: + level.Info(c.log).Log("msg", "removing rule group", "group", diff.Actual.Name) + err := c.mimirClient.DeleteRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Actual.Name) + if err != nil { + return err + } + case RuleGroupDiffKindUpdate: + level.Info(c.log).Log("msg", "updating rule group", "group", diff.Desired.Name) + err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) + if err != nil { + return err + } + default: + level.Error(c.log).Log("msg", "unknown rule group diff kind", "kind", diff.Kind) + } + } + + c.syncMimir(ctx) + + return nil +} diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 6421bd6f2305..69a7f04ad702 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -1,23 +1,17 @@ package rules import ( - "bytes" "context" "fmt" "time" - "github.com/ghodss/yaml" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/agent/component" mimirClient 
"github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" - "github.com/grafana/dskit/multierror" - "github.com/pkg/errors" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" - "github.com/prometheus/prometheus/model/rulefmt" - yamlv3 "gopkg.in/yaml.v3" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -139,12 +133,6 @@ func (c *Component) Update(newConfig component.Arguments) error { return <-errChan } -func setDefaultArguments(args *Arguments) { - if args.SyncInterval == 0 { - args.SyncInterval = 30 * time.Second - } -} - func (c *Component) init() error { level.Info(c.log).Log("msg", "initializing with new configuration") @@ -211,33 +199,6 @@ func (c *Component) init() error { return nil } -func (c *Component) reconcileState(ctx context.Context) error { - ctx, cancel := context.WithTimeout(ctx, 5*time.Second) - defer cancel() - - crdState, err := c.ruleLister.List(c.ruleSelector) - if err != nil { - return fmt.Errorf("failed to list rules: %w", err) - } - - desiredState := []rulefmt.RuleGroup{} - for _, pr := range crdState { - groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) - if err != nil { - return fmt.Errorf("failed to convert rule group: %w", err) - } - - desiredState = append(desiredState, groups.Groups...) 
- } - - diffs, err := c.diffRuleStates(desiredState, c.currentState) - if err != nil { - return err - } - - return c.applyChanges(ctx, diffs) -} - func convertSelectorToListOptions(selector LabelSelector) (labels.Selector, error) { matchExpressions := []metav1.LabelSelectorRequirement{} @@ -255,122 +216,6 @@ func convertSelectorToListOptions(selector LabelSelector) (labels.Selector, erro }) } -type RuleGroupDiffKind string - -const ( - RuleGroupDiffKindAdd RuleGroupDiffKind = "add" - RuleGroupDiffKindRemove RuleGroupDiffKind = "remove" - RuleGroupDiffKindUpdate RuleGroupDiffKind = "update" -) - -type RuleGroupDiff struct { - Kind RuleGroupDiffKind - Actual mimirClient.RuleGroup - Desired mimirClient.RuleGroup -} - -func (c *Component) diffRuleStates(desired []rulefmt.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { - var diff []RuleGroupDiff - - seenGroups := map[string]bool{} - -desiredGroups: - for _, desiredRuleGroup := range desired { - mimirRuleGroup := mimirClient.RuleGroup{ - RuleGroup: desiredRuleGroup, - // TODO: allow setting the remote write configs? 
- // RWConfigs: , - } - - seenGroups[desiredRuleGroup.Name] = true - - for _, actualRuleGroup := range actual { - if desiredRuleGroup.Name == actualRuleGroup.Name { - if equalRuleGroups(desiredRuleGroup, actualRuleGroup.RuleGroup) { - continue desiredGroups - } - - // TODO: check if the rules are the same - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindUpdate, - Actual: actualRuleGroup, - Desired: mimirRuleGroup, - }) - continue desiredGroups - } - } - - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindAdd, - Desired: mimirRuleGroup, - }) - } - - for _, actualRuleGroup := range actual { - if seenGroups[actualRuleGroup.Name] { - continue - } - - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindRemove, - Actual: actualRuleGroup, - }) - } - - return diff, nil -} - -func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) error { - if len(diffs) == 0 { - return nil - } - - level.Info(c.log).Log("msg", "applying rule changes", "num_changes", len(diffs)) - - for _, diff := range diffs { - switch diff.Kind { - case RuleGroupDiffKindAdd: - level.Info(c.log).Log("msg", "adding rule group", "group", diff.Desired.Name) - err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) - if err != nil { - return err - } - case RuleGroupDiffKindRemove: - level.Info(c.log).Log("msg", "removing rule group", "group", diff.Actual.Name) - err := c.mimirClient.DeleteRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Actual.Name) - if err != nil { - return err - } - case RuleGroupDiffKindUpdate: - level.Info(c.log).Log("msg", "updating rule group", "group", diff.Desired.Name) - err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) - if err != nil { - return err - } - default: - level.Error(c.log).Log("msg", "unknown rule group diff kind", "kind", diff.Kind) - } - } - - c.syncMimir(ctx) - - return nil -} - -func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) 
(*rulefmt.RuleGroups, error) { - buf, err := yaml.Marshal(crd) - if err != nil { - return &rulefmt.RuleGroups{}, err - } - - groups, errs := rulefmt.Parse(buf) - if len(errs) > 0 { - return &rulefmt.RuleGroups{}, multierror.New(errs...).Err() - } - - return groups, nil -} - func (c *Component) startRuleInformer() { c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) factory := promExternalVersions.NewSharedInformerFactory(c.promClient, 24*time.Hour) @@ -429,118 +274,8 @@ func (c *Component) startRuleInformer() { factory.WaitForCacheSync(c.informerStopChan) } -func (c *Component) eventLoop(ctx context.Context) { - for { - event, shutdown := c.queue.Get() - if shutdown { - level.Info(c.log).Log("msg", "shutting down event loop") - return - } - - evt := event.(Event) - err := c.processEvent(ctx, evt) - - if err != nil { - // TODO: retry limits? - level.Error(c.log).Log("msg", "failed to process event", "err", err) - // c.queue.AddRateLimited(event) - } else { - c.queue.Forget(event) - c.queue.Done(event) - } - } -} - -func (c *Component) getRuleGroupsFromKey(key string) (*rulefmt.RuleGroups, error) { - obj, _, err := c.ruleInformer.GetIndexer().GetByKey(key) - if err != nil { - return nil, errors.Wrap(err, "failed to get rule from informer") - } - - groups, err := convertCRDRuleGroupToRuleGroup(obj.(*promv1.PrometheusRule).Spec) - if err != nil { - return nil, errors.Wrap(err, "failed to convert CRD rule group to rule group") - } - - return groups, nil -} - -func (c *Component) processEvent(ctx context.Context, e Event) error { - switch e.Type { - case EventTypeAddRule: - level.Info(c.log).Log("msg", "processing add rule event", "key", e.NewRules) - case EventTypeUpdateRule: - level.Info(c.log).Log("msg", "processing update rule event", "key", e.NewRules) - case EventTypeDeleteRule: - level.Info(c.log).Log("msg", "processing delete rule event", "key", e.OldRules) - case EventTypeAddNamespace: - case EventTypeDeleteNamespace: - case 
EventTypeUpdateNamespace: - case EventTypeSyncMimir: - level.Debug(c.log).Log("msg", "syncing current state from ruler") - c.syncMimir(ctx) - default: - return fmt.Errorf("unknown event type: %s", e.Type) - } - - return c.reconcileState(ctx) -} - -func (c *Component) syncMimir(ctx context.Context) { - rulesByNamespace, err := c.mimirClient.ListRules(ctx, c.args.MimirRuleNamespace) - if err != nil { - level.Error(c.log).Log("msg", "failed to list rules from mimir", "err", err) - return - } - - c.currentState = rulesByNamespace[c.args.MimirRuleNamespace] -} - -// This type must be hashable, so it is kept simple. The indexer will maintain a -// cache of current state, so this is only used for logging. -type Event struct { - Type EventType - - NewRules string - OldRules string - - NewNamespace string - OldNamespace string -} - -type EventType string - -const ( - EventTypeAddRule EventType = "add-rule" - EventTypeUpdateRule EventType = "update-rule" - EventTypeDeleteRule EventType = "delete-rule" - - EventTypeAddNamespace EventType = "add-namespace" - EventTypeUpdateNamespace EventType = "update-namespace" - EventTypeDeleteNamespace EventType = "delete-namespace" - - EventTypeSyncMimir EventType = "sync-mimir" -) - -func equalRuleGroups(a, b rulefmt.RuleGroup) bool { - aBuf, err := yamlv3.Marshal(a) - if err != nil { - return false - } - bBuf, err := yamlv3.Marshal(b) - if err != nil { - return false - } - - if !bytes.Equal(aBuf, bBuf) { - - fmt.Println("----") - fmt.Println(string(aBuf)) - fmt.Println("----") - fmt.Println(string(bBuf)) - - return false +func setDefaultArguments(args *Arguments) { + if args.SyncInterval == 0 { + args.SyncInterval = 30 * time.Second } - - return bytes.Equal(aBuf, bBuf) } From a690b92aa73ba2fd46d5dc2e855f70004d4d3aca Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Mon, 5 Dec 2022 16:43:09 -0600 Subject: [PATCH 10/40] Add namespace informer - Also filter resources in the informer to avoid unnecessary events and allocations ---
component/mimir/rules/events.go | 58 ++++++++------- component/mimir/rules/rules.go | 127 ++++++++++++++++++++++---------- 2 files changed, 117 insertions(+), 68 deletions(-) diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 96d73e048beb..a03bc6e3608d 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -13,15 +13,10 @@ import ( ) // This type must be hashable, so it is kept simple. The indexer will maintain a -// cache of current state, so this is only used for logging. +// cache of current state, so this is mostly used for logging. type Event struct { - Type EventType - - NewRules string - OldRules string - - NewNamespace string - OldNamespace string + Type EventType + ObjectKey string } type EventType string @@ -61,15 +56,9 @@ func (c *Component) eventLoop(ctx context.Context) { } func (c *Component) processEvent(ctx context.Context, e Event) error { switch e.Type { - case EventTypeAddRule: - level.Info(c.log).Log("msg", "processing add rule event", "key", e.NewRules) - case EventTypeUpdateRule: - level.Info(c.log).Log("msg", "processing update rule event", "key", e.NewRules) - case EventTypeDeleteRule: - level.Info(c.log).Log("msg", "processing delete rule event", "key", e.OldRules) - case EventTypeAddNamespace: - case EventTypeDeleteNamespace: - case EventTypeUpdateNamespace: + case EventTypeAddRule, EventTypeUpdateRule, EventTypeDeleteRule, + EventTypeAddNamespace, EventTypeUpdateNamespace, EventTypeDeleteNamespace: + level.Info(c.log).Log("msg", "processing event", "type", e.Type, "key", e.ObjectKey) case EventTypeSyncMimir: level.Debug(c.log).Log("msg", "syncing current state from ruler") c.syncMimir(ctx) @@ -94,27 +83,40 @@ func (c *Component) reconcileState(ctx context.Context) error { ctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() - crdState, err := c.ruleLister.List(c.ruleSelector) + desiredState, err := c.loadStateFromK8s() + + diffs, err := 
diffRuleStates(desiredState, c.currentState) if err != nil { - return fmt.Errorf("failed to list rules: %w", err) + return err + } + + return c.applyChanges(ctx, diffs) +} + +func (c *Component) loadStateFromK8s() ([]rulefmt.RuleGroup, error) { + matchedNamespaces, err := c.namespaceLister.List(c.namespaceSelector) + if err != nil { + return nil, fmt.Errorf("failed to list namespaces: %w", err) } desiredState := []rulefmt.RuleGroup{} - for _, pr := range crdState { - groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) + for _, ns := range matchedNamespaces { + crdState, err := c.ruleLister.PrometheusRules(ns.Name).List(c.ruleSelector) if err != nil { - return fmt.Errorf("failed to convert rule group: %w", err) + return nil, fmt.Errorf("failed to list rules: %w", err) } - desiredState = append(desiredState, groups.Groups...) - } + for _, pr := range crdState { + groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) + if err != nil { + return nil, fmt.Errorf("failed to convert rule group: %w", err) + } - diffs, err := diffRuleStates(desiredState, c.currentState) - if err != nil { - return err + desiredState = append(desiredState, groups.Groups...) 
+ } } - return c.applyChanges(ctx, diffs) + return desiredState, nil } func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) (*rulefmt.RuleGroups, error) { diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 69a7f04ad702..5a13bfa9b5e1 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -10,16 +10,15 @@ import ( "github.com/grafana/agent/component" mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + coreListers "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" controller "sigs.k8s.io/controller-runtime" - k8sClient "sigs.k8s.io/controller-runtime/pkg/client" promExternalVersions "github.com/prometheus-operator/prometheus-operator/pkg/client/informers/externalversions" promVersioned "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" @@ -44,13 +43,16 @@ type Component struct { opts component.Options args Arguments - mimirClient *mimirClient.MimirClient - k8sClient k8sClient.Client - promClient promVersioned.Interface - ruleLister promListers.PrometheusRuleLister - ruleInformer cache.SharedIndexInformer - informerStopChan chan struct{} - ticker *time.Ticker + mimirClient *mimirClient.MimirClient + k8sClient kubernetes.Interface + promClient promVersioned.Interface + ruleLister promListers.PrometheusRuleLister + ruleInformer cache.SharedIndexInformer + + namespaceLister coreListers.NamespaceLister + namespaceInformer cache.SharedIndexInformer + informerStopChan chan struct{} + ticker *time.Ticker 
queue workqueue.RateLimitingInterface configUpdates chan ConfigUpdate @@ -113,6 +115,10 @@ func (c *Component) startup(ctx context.Context) error { return err } + c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) + c.informerStopChan = make(chan struct{}) + + c.startNamespaceInformer() c.startRuleInformer() c.syncMimir(ctx) go c.eventLoop(ctx) @@ -141,19 +147,8 @@ func (c *Component) init() error { // TODO: allow overriding some stuff in RestConfig and k8s client options? restConfig := controller.GetConfigOrDie() - scheme := runtime.NewScheme() - err := corev1.AddToScheme(scheme) - if err != nil { - return fmt.Errorf("failed to add prometheus operator scheme: %w", err) - } - err = promv1.AddToScheme(scheme) - if err != nil { - return fmt.Errorf("failed to add prometheus operator scheme: %w", err) - } - - c.k8sClient, err = k8sClient.New(restConfig, k8sClient.Options{ - Scheme: scheme, - }) + var err error + c.k8sClient, err = kubernetes.NewForConfig(restConfig) if err != nil { return fmt.Errorf("failed to create k8s client: %w", err) } @@ -216,9 +211,69 @@ func convertSelectorToListOptions(selector LabelSelector) (labels.Selector, erro }) } +func (c *Component) startNamespaceInformer() { + factory := informers.NewSharedInformerFactoryWithOptions( + c.k8sClient, + 24*time.Hour, + informers.WithTweakListOptions(func(lo *metav1.ListOptions) { + lo.LabelSelector = c.namespaceSelector.String() + }), + ) + + namespaces := factory.Core().V1().Namespaces() + c.namespaceLister = namespaces.Lister() + c.namespaceInformer = namespaces.Informer() + c.namespaceInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeAddNamespace, + ObjectKey: key, + }) + }, + UpdateFunc: func(oldObj, newObj interface{}) { + 
newKey, err := cache.MetaNamespaceKeyFunc(newObj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeUpdateNamespace, + ObjectKey: newKey, + }) + }, + DeleteFunc: func(obj interface{}) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key from object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeDeleteNamespace, + ObjectKey: key, + }) + }, + }) + + factory.Start(c.informerStopChan) + factory.WaitForCacheSync(c.informerStopChan) +} + func (c *Component) startRuleInformer() { - c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) - factory := promExternalVersions.NewSharedInformerFactory(c.promClient, 24*time.Hour) + factory := promExternalVersions.NewSharedInformerFactoryWithOptions( + c.promClient, + 24*time.Hour, + promExternalVersions.WithTweakListOptions(func(lo *metav1.ListOptions) { + lo.LabelSelector = c.ruleSelector.String() + }), + ) promRules := factory.Monitoring().V1().PrometheusRules() c.ruleLister = promRules.Lister() @@ -232,17 +287,11 @@ func (c *Component) startRuleInformer() { } c.queue.AddRateLimited(Event{ - Type: EventTypeAddRule, - NewRules: key, + Type: EventTypeAddRule, + ObjectKey: key, }) }, UpdateFunc: func(oldObj, newObj interface{}) { - oldKey, err := cache.MetaNamespaceKeyFunc(oldObj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } - newKey, err := cache.MetaNamespaceKeyFunc(newObj) if err != nil { level.Error(c.log).Log("msg", "failed to get key from object", "err", err) @@ -250,9 +299,8 @@ func (c *Component) startRuleInformer() { } c.queue.AddRateLimited(Event{ - Type: EventTypeUpdateRule, - NewRules: newKey, - OldRules: oldKey, + Type: EventTypeUpdateRule, + ObjectKey: newKey, }) }, DeleteFunc: func(obj interface{}) { @@ -263,13 +311,12 @@ func 
(c *Component) startRuleInformer() { } c.queue.AddRateLimited(Event{ - Type: EventTypeDeleteRule, - OldRules: key, + Type: EventTypeDeleteRule, + ObjectKey: key, }) }, }) - c.informerStopChan = make(chan struct{}) factory.Start(c.informerStopChan) factory.WaitForCacheSync(c.informerStopChan) } From 5e65d59829b41cc6fccc3b17e51b0ab22a3d70dc Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Tue, 6 Dec 2022 21:56:08 -0600 Subject: [PATCH 11/40] Map PrometheusRule 1:1 to Mimir rule namespaces Only namespaces matching the expected naming convention are reconciled --- component/mimir/rules/arguments.go | 5 ++- component/mimir/rules/diff.go | 37 +++++++++++++++++++++- component/mimir/rules/events.go | 50 +++++++++++++++++++++++------- component/mimir/rules/rules.go | 3 +- 4 files changed, 79 insertions(+), 16 deletions(-) diff --git a/component/mimir/rules/arguments.go b/component/mimir/rules/arguments.go index 98581845952a..ad7b8e7e5883 100644 --- a/component/mimir/rules/arguments.go +++ b/component/mimir/rules/arguments.go @@ -7,9 +7,8 @@ import ( ) type Arguments struct { - ClientParams ClientArguments `river:"client,block"` - SyncInterval time.Duration `river:"sync_interval,attr,optional"` - MimirRuleNamespace string `river:"mimir_rule_namespace,attr"` + ClientParams ClientArguments `river:"client,block"` + SyncInterval time.Duration `river:"sync_interval,attr,optional"` RuleSelector LabelSelector `river:"rule_selector,block,optional"` RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index 011dc1b0306e..8be895357829 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -23,7 +23,42 @@ type RuleGroupDiff struct { Desired mimirClient.RuleGroup } -func diffRuleStates(desired []rulefmt.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { +func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual 
map[string][]mimirClient.RuleGroup) (map[string][]RuleGroupDiff, error) { + seen := map[string]bool{} + + diff := make(map[string][]RuleGroupDiff) + + for namespace, desiredRuleGroups := range desired { + seen[namespace] = true + + actualRuleGroups := actual[namespace] + subDiff, err := diffRuleNamespaceState(desiredRuleGroups, actualRuleGroups) + if err != nil { + return nil, err + } + diff[namespace] = subDiff + } + + for namespace, actualRuleGroups := range actual { + if seen[namespace] { + continue + } + + if !isManagedMimirNamespace(namespace) { + continue + } + + subDiff, err := diffRuleNamespaceState(nil, actualRuleGroups) + if err != nil { + return nil, err + } + diff[namespace] = subDiff + } + + return diff, nil +} + +func diffRuleNamespaceState(desired []rulefmt.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { var diff []RuleGroupDiff seenGroups := map[string]bool{} diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index a03bc6e3608d..7c9bf3dc2453 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -3,6 +3,7 @@ package rules import ( "context" "fmt" + "regexp" "time" "github.com/ghodss/yaml" @@ -70,13 +71,13 @@ func (c *Component) processEvent(ctx context.Context, e Event) error { } func (c *Component) syncMimir(ctx context.Context) { - rulesByNamespace, err := c.mimirClient.ListRules(ctx, c.args.MimirRuleNamespace) + rulesByNamespace, err := c.mimirClient.ListRules(ctx, "") if err != nil { level.Error(c.log).Log("msg", "failed to list rules from mimir", "err", err) return } - c.currentState = rulesByNamespace[c.args.MimirRuleNamespace] + c.currentState = rulesByNamespace } func (c *Component) reconcileState(ctx context.Context) error { @@ -85,21 +86,29 @@ func (c *Component) reconcileState(ctx context.Context) error { desiredState, err := c.loadStateFromK8s() - diffs, err := diffRuleStates(desiredState, c.currentState) + diffs, err := diffRuleState(desiredState, 
c.currentState) if err != nil { return err } - return c.applyChanges(ctx, diffs) + for ns, diff := range diffs { + err = c.applyChanges(ctx, ns, diff) + if err != nil { + level.Error(c.log).Log("msg", "failed to apply changes", "mimir-namespace", ns, "err", err) + continue + } + } + + return nil } -func (c *Component) loadStateFromK8s() ([]rulefmt.RuleGroup, error) { +func (c *Component) loadStateFromK8s() (map[string][]rulefmt.RuleGroup, error) { matchedNamespaces, err := c.namespaceLister.List(c.namespaceSelector) if err != nil { return nil, fmt.Errorf("failed to list namespaces: %w", err) } - desiredState := []rulefmt.RuleGroup{} + desiredState := map[string][]rulefmt.RuleGroup{} for _, ns := range matchedNamespaces { crdState, err := c.ruleLister.PrometheusRules(ns.Name).List(c.ruleSelector) if err != nil { @@ -107,12 +116,14 @@ func (c *Component) loadStateFromK8s() ([]rulefmt.RuleGroup, error) { } for _, pr := range crdState { + mimirNs := mimirNamespaceForRuleCRD(pr) + groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) if err != nil { return nil, fmt.Errorf("failed to convert rule group: %w", err) } - desiredState = append(desiredState, groups.Groups...) 
+ desiredState[mimirNs] = groups.Groups } } @@ -133,7 +144,7 @@ func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) (*rulefmt.Rul return groups, nil } -func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) error { +func (c *Component) applyChanges(ctx context.Context, namespace string, diffs []RuleGroupDiff) error { if len(diffs) == 0 { return nil } @@ -144,19 +155,19 @@ func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) err switch diff.Kind { case RuleGroupDiffKindAdd: level.Info(c.log).Log("msg", "adding rule group", "group", diff.Desired.Name) - err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) + err := c.mimirClient.CreateRuleGroup(ctx, namespace, diff.Desired) if err != nil { return err } case RuleGroupDiffKindRemove: level.Info(c.log).Log("msg", "removing rule group", "group", diff.Actual.Name) - err := c.mimirClient.DeleteRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Actual.Name) + err := c.mimirClient.DeleteRuleGroup(ctx, namespace, diff.Actual.Name) if err != nil { return err } case RuleGroupDiffKindUpdate: level.Info(c.log).Log("msg", "updating rule group", "group", diff.Desired.Name) - err := c.mimirClient.CreateRuleGroup(ctx, c.args.MimirRuleNamespace, diff.Desired) + err := c.mimirClient.CreateRuleGroup(ctx, namespace, diff.Desired) if err != nil { return err } @@ -169,3 +180,20 @@ func (c *Component) applyChanges(ctx context.Context, diffs []RuleGroupDiff) err return nil } + +// mimirNamespaceForRuleCRD returns the namespace that the rule CRD should be +// stored in mimir. This function, along with isManagedNamespace, is used to +// determine if a rule CRD is managed by the agent. +func mimirNamespaceForRuleCRD(pr *promv1.PrometheusRule) string { + return fmt.Sprintf("agent/%s/%s/%s", pr.Namespace, pr.Name, pr.UID) +} + +// isManagedMimirNamespace returns true if the namespace is managed by the agent. 
+// Unmanaged namespaces are left as is by the operator. +func isManagedMimirNamespace(namespace string) bool { + namespacePart := `.+` + namePart := `.+` + uuidPart := `[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}` + managedNamespaceRegex := regexp.MustCompile(fmt.Sprintf("^(agent/)?%s/%s/%s$", namespacePart, namePart, uuidPart)) + return managedNamespaceRegex.MatchString(namespace) +} diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 5a13bfa9b5e1..84bda73912cb 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -60,7 +60,7 @@ type Component struct { namespaceSelector labels.Selector ruleSelector labels.Selector - currentState []mimirClient.RuleGroup + currentState map[string][]mimirClient.RuleGroup } type ConfigUpdate struct { @@ -69,6 +69,7 @@ type ConfigUpdate struct { } var _ component.Component = (*Component)(nil) +var _ component.DebugComponent = (*Component)(nil) func NewComponent(o component.Options, c Arguments) (*Component, error) { setDefaultArguments(&c) From 4d1cf80b8d2bfa699fc8fd7502b6d45208a62431 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Tue, 6 Dec 2022 21:56:36 -0600 Subject: [PATCH 12/40] Implement the DebugComponent interface --- component/mimir/rules/debug.go | 64 ++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 component/mimir/rules/debug.go diff --git a/component/mimir/rules/debug.go b/component/mimir/rules/debug.go new file mode 100644 index 000000000000..09ec5175b185 --- /dev/null +++ b/component/mimir/rules/debug.go @@ -0,0 +1,64 @@ +package rules + +import "fmt" + +type DebugInfo struct { + Error string `river:"error,attr,optional"` + PrometheusRules []DebugK8sPrometheusRule `river:"prometheusRules,attr,optional"` + MimirRuleNamespaces []DebugMimirNamespace `river:"mimirRuleNamespaces,attr,optional"` +} + +type DebugK8sPrometheusRule struct { + Namespace string `river:"namespace,attr"` + 
Name string `river:"name,attr"` + UID string `river:"uid,attr"` + NumRuleGroups int `river:"numRuleGroups,attr"` +} + +type DebugMimirNamespace struct { + Name string `river:"name,attr"` + NumRuleGroups int `river:"numRuleGroups,attr"` +} + +func (c *Component) DebugInfo() interface{} { + var output DebugInfo + for ns := range c.currentState { + if !isManagedMimirNamespace(ns) { + continue + } + + output.MimirRuleNamespaces = append(output.MimirRuleNamespaces, DebugMimirNamespace{ + Name: ns, + NumRuleGroups: len(c.currentState[ns]), + }) + } + + // This should load from the informer cache, so it shouldn't fail under normal circumstances. + managedK8sNamespaces, err := c.namespaceLister.List(c.namespaceSelector) + if err != nil { + return DebugInfo{ + Error: fmt.Sprintf("failed to list namespaces: %v", err), + } + } + + for _, n := range managedK8sNamespaces { + // This should load from the informer cache, so it shouldn't fail under normal circumstances. + rules, err := c.ruleLister.PrometheusRules(n.Name).List(c.ruleSelector) + if err != nil { + return DebugInfo{ + Error: fmt.Sprintf("failed to list rules: %v", err), + } + } + + for _, r := range rules { + output.PrometheusRules = append(output.PrometheusRules, DebugK8sPrometheusRule{ + Namespace: n.Name, + Name: r.Name, + UID: string(r.UID), + NumRuleGroups: len(r.Spec.Groups), + }) + } + } + + return output +} From 4a740aa9e1d9f49be7f5226f3ed0f7d3898b6727 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Tue, 6 Dec 2022 22:02:52 -0600 Subject: [PATCH 13/40] Remove outdated comment --- component/mimir/rules/diff.go | 1 - 1 file changed, 1 deletion(-) diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index 8be895357829..c0cb4f6ac655 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -79,7 +79,6 @@ desiredGroups: continue desiredGroups } - // TODO: check if the rules are the same diff = append(diff, RuleGroupDiff{ Kind: RuleGroupDiffKindUpdate, Actual: 
actualRuleGroup, From d88ee5141f70ba53daf9940e44e09f682b0b8df6 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Tue, 6 Dec 2022 22:38:45 -0600 Subject: [PATCH 14/40] Backfill tests for core diffing algorithm --- component/mimir/rules/diff.go | 25 ++-- component/mimir/rules/diff_test.go | 190 +++++++++++++++++++++++++++++ component/mimir/rules/events.go | 23 ++-- 3 files changed, 218 insertions(+), 20 deletions(-) create mode 100644 component/mimir/rules/diff_test.go diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index c0cb4f6ac655..cd459f4a70db 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -4,7 +4,6 @@ import ( "bytes" mimirClient "github.com/grafana/agent/pkg/mimir/client" - "github.com/prometheus/prometheus/model/rulefmt" "gopkg.in/yaml.v3" ) @@ -23,7 +22,7 @@ type RuleGroupDiff struct { Desired mimirClient.RuleGroup } -func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual map[string][]mimirClient.RuleGroup) (map[string][]RuleGroupDiff, error) { +func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string][]mimirClient.RuleGroup) (map[string][]RuleGroupDiff, error) { seen := map[string]bool{} diff := make(map[string][]RuleGroupDiff) @@ -36,6 +35,11 @@ func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual map[string][]m if err != nil { return nil, err } + + if len(subDiff) == 0 { + continue + } + diff[namespace] = subDiff } @@ -52,37 +56,32 @@ func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual map[string][]m if err != nil { return nil, err } + diff[namespace] = subDiff } return diff, nil } -func diffRuleNamespaceState(desired []rulefmt.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { +func diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { var diff []RuleGroupDiff seenGroups := map[string]bool{} desiredGroups: for _, desiredRuleGroup := range desired { - 
mimirRuleGroup := mimirClient.RuleGroup{ - RuleGroup: desiredRuleGroup, - // TODO: allow setting the remote write configs? - // RWConfigs: , - } - seenGroups[desiredRuleGroup.Name] = true for _, actualRuleGroup := range actual { if desiredRuleGroup.Name == actualRuleGroup.Name { - if equalRuleGroups(desiredRuleGroup, actualRuleGroup.RuleGroup) { + if equalRuleGroups(desiredRuleGroup, actualRuleGroup) { continue desiredGroups } diff = append(diff, RuleGroupDiff{ Kind: RuleGroupDiffKindUpdate, Actual: actualRuleGroup, - Desired: mimirRuleGroup, + Desired: desiredRuleGroup, }) continue desiredGroups } @@ -90,7 +89,7 @@ desiredGroups: diff = append(diff, RuleGroupDiff{ Kind: RuleGroupDiffKindAdd, - Desired: mimirRuleGroup, + Desired: desiredRuleGroup, }) } @@ -108,7 +107,7 @@ desiredGroups: return diff, nil } -func equalRuleGroups(a, b rulefmt.RuleGroup) bool { +func equalRuleGroups(a, b mimirClient.RuleGroup) bool { aBuf, err := yaml.Marshal(a) if err != nil { return false diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/diff_test.go new file mode 100644 index 000000000000..3fa350872035 --- /dev/null +++ b/component/mimir/rules/diff_test.go @@ -0,0 +1,190 @@ +package rules + +import ( + "fmt" + "testing" + + mimirClient "github.com/grafana/agent/pkg/mimir/client" + "github.com/prometheus/prometheus/model/rulefmt" + "github.com/stretchr/testify/require" +) + +func parseRuleGroups(t *testing.T, buf []byte) []mimirClient.RuleGroup { + t.Helper() + + groups, errs := rulefmt.Parse(buf) + require.Empty(t, errs) + + var result []mimirClient.RuleGroup + for _, g := range groups.Groups { + result = append(result, mimirClient.RuleGroup{RuleGroup: g}) + } + return result +} + +func TestDiffRuleState(t *testing.T) { + ruleGroupsA := parseRuleGroups(t, []byte(` +groups: +- name: rule-group-a + interval: 1m + rules: + - record: rule_a + expr: 1 +`)) + + ruleGroupsB := parseRuleGroups(t, []byte(` +groups: +- name: rule-group-b + interval: 1m + rules: + - 
record: rule_b + expr: 2 +`)) + + ruleGroupsAModified := parseRuleGroups(t, []byte(` +groups: +- name: rule-group-a + interval: 1m + rules: + - record: rule_a + expr: 3 +`)) + + managedNamespace := "agent/namespace/name/12345678-1234-1234-1234-123456789012" + unmanagedNamespace := "integrations/kubernetes" + + _ = ruleGroupsB + _ = ruleGroupsA + _ = ruleGroupsAModified + _ = unmanagedNamespace + + type testCase struct { + name string + desired map[string][]mimirClient.RuleGroup + actual map[string][]mimirClient.RuleGroup + expected map[string][]RuleGroupDiff + } + + testCases := []testCase{ + { + name: "empty sets", + desired: map[string][]mimirClient.RuleGroup{}, + actual: map[string][]mimirClient.RuleGroup{}, + expected: map[string][]RuleGroupDiff{}, + }, + { + name: "add rule group", + desired: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + }, + actual: map[string][]mimirClient.RuleGroup{}, + expected: map[string][]RuleGroupDiff{ + managedNamespace: { + { + Kind: RuleGroupDiffKindAdd, + Desired: ruleGroupsA[0], + }, + }, + }, + }, + { + name: "remove rule group", + desired: map[string][]mimirClient.RuleGroup{}, + actual: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + }, + expected: map[string][]RuleGroupDiff{ + managedNamespace: { + { + Kind: RuleGroupDiffKindRemove, + Actual: ruleGroupsA[0], + }, + }, + }, + }, + { + name: "update rule group", + desired: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + }, + actual: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsAModified, + }, + expected: map[string][]RuleGroupDiff{ + managedNamespace: { + { + Kind: RuleGroupDiffKindUpdate, + Desired: ruleGroupsA[0], + Actual: ruleGroupsAModified[0], + }, + }, + }, + }, + { + name: "unchanged rule groups", + desired: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + }, + actual: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + }, + expected: 
map[string][]RuleGroupDiff{}, + }, + { + name: "unmanaged namespaces", + desired: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + }, + actual: map[string][]mimirClient.RuleGroup{ + managedNamespace: ruleGroupsA, + unmanagedNamespace: ruleGroupsB, + }, + expected: map[string][]RuleGroupDiff{}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual, err := diffRuleState(tc.desired, tc.actual) + require.NoError(t, err) + requireEqualRuleDiffs(t, tc.expected, actual) + }) + } +} + +func requireEqualRuleDiffs(t *testing.T, expected, actual map[string][]RuleGroupDiff) { + require.Equal(t, len(expected), len(actual)) + + var summarizeDiff = func(diff RuleGroupDiff) string { + switch diff.Kind { + case RuleGroupDiffKindAdd: + return fmt.Sprintf("add: %s", diff.Desired.Name) + case RuleGroupDiffKindRemove: + return fmt.Sprintf("remove: %s", diff.Actual.Name) + case RuleGroupDiffKindUpdate: + return fmt.Sprintf("update: %s", diff.Desired.Name) + } + panic("unreachable") + } + + for namespace, expectedDiffs := range expected { + actualDiffs, ok := actual[namespace] + require.True(t, ok) + + require.Equal(t, len(expectedDiffs), len(actualDiffs)) + + for i, expectedDiff := range expectedDiffs { + actualDiff := actualDiffs[i] + + if expectedDiff.Kind != actualDiff.Kind || + !equalRuleGroups(expectedDiff.Desired, actualDiff.Desired) || + !equalRuleGroups(expectedDiff.Actual, actualDiff.Actual) { + + t.Logf("expected diff: %s", summarizeDiff(expectedDiff)) + t.Logf("actual diff: %s", summarizeDiff(actualDiff)) + t.Fail() + } + + } + } +} diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 7c9bf3dc2453..54acccaf13a7 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -8,6 +8,7 @@ import ( "github.com/ghodss/yaml" "github.com/go-kit/log/level" + mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/multierror" promv1 
"github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" @@ -102,13 +103,13 @@ func (c *Component) reconcileState(ctx context.Context) error { return nil } -func (c *Component) loadStateFromK8s() (map[string][]rulefmt.RuleGroup, error) { +func (c *Component) loadStateFromK8s() (map[string][]mimirClient.RuleGroup, error) { matchedNamespaces, err := c.namespaceLister.List(c.namespaceSelector) if err != nil { return nil, fmt.Errorf("failed to list namespaces: %w", err) } - desiredState := map[string][]rulefmt.RuleGroup{} + desiredState := map[string][]mimirClient.RuleGroup{} for _, ns := range matchedNamespaces { crdState, err := c.ruleLister.PrometheusRules(ns.Name).List(c.ruleSelector) if err != nil { @@ -123,25 +124,33 @@ func (c *Component) loadStateFromK8s() (map[string][]rulefmt.RuleGroup, error) { return nil, fmt.Errorf("failed to convert rule group: %w", err) } - desiredState[mimirNs] = groups.Groups + desiredState[mimirNs] = groups } } return desiredState, nil } -func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) (*rulefmt.RuleGroups, error) { +func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) ([]mimirClient.RuleGroup, error) { buf, err := yaml.Marshal(crd) if err != nil { - return &rulefmt.RuleGroups{}, err + return nil, err } groups, errs := rulefmt.Parse(buf) if len(errs) > 0 { - return &rulefmt.RuleGroups{}, multierror.New(errs...).Err() + return nil, multierror.New(errs...).Err() } - return groups, nil + mimirGroups := make([]mimirClient.RuleGroup, len(groups.Groups)) + for i, g := range groups.Groups { + mimirGroups[i] = mimirClient.RuleGroup{ + RuleGroup: g, + // TODO: allow setting remote write configs? 
+ } + } + + return mimirGroups, nil } func (c *Component) applyChanges(ctx context.Context, namespace string, diffs []RuleGroupDiff) error { From 00b3e40a3a9dd991f37027ca4d38749b9d55659d Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 09:37:16 -0600 Subject: [PATCH 15/40] Add configurable namespace prefix - This allows multiple agents to manage groups of namespaces without conflicting --- component/mimir/rules/arguments.go | 15 +++++++++++++-- component/mimir/rules/debug.go | 2 +- component/mimir/rules/diff.go | 4 ---- component/mimir/rules/diff_test.go | 26 -------------------------- component/mimir/rules/events.go | 17 +++++++++++++---- component/mimir/rules/rules.go | 6 ------ 6 files changed, 27 insertions(+), 43 deletions(-) diff --git a/component/mimir/rules/arguments.go b/component/mimir/rules/arguments.go index ad7b8e7e5883..32898973c94d 100644 --- a/component/mimir/rules/arguments.go +++ b/component/mimir/rules/arguments.go @@ -7,13 +7,24 @@ import ( ) type Arguments struct { - ClientParams ClientArguments `river:"client,block"` - SyncInterval time.Duration `river:"sync_interval,attr,optional"` + ClientParams ClientArguments `river:"client,block"` + SyncInterval time.Duration `river:"sync_interval,attr,optional"` + MimirNameSpacePrefix string `river:"mimir_namespace_prefix,attr,optional"` RuleSelector LabelSelector `river:"rule_selector,block,optional"` RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` } +func setDefaultArguments(args *Arguments) { + if args.SyncInterval == 0 { + args.SyncInterval = 30 * time.Second + } + + if args.MimirNameSpacePrefix == "" { + args.MimirNameSpacePrefix = "agent" + } +} + type LabelSelector struct { MatchLabels map[string]string `river:"match_labels,attr,optional"` MatchExpressions []MatchExpression `river:"match_expressions,attr,optional"` diff --git a/component/mimir/rules/debug.go b/component/mimir/rules/debug.go index 09ec5175b185..29cfcddc7c28 100644 --- 
a/component/mimir/rules/debug.go +++ b/component/mimir/rules/debug.go @@ -23,7 +23,7 @@ type DebugMimirNamespace struct { func (c *Component) DebugInfo() interface{} { var output DebugInfo for ns := range c.currentState { - if !isManagedMimirNamespace(ns) { + if !isManagedMimirNamespace(c.args.MimirNameSpacePrefix, ns) { continue } diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index cd459f4a70db..9739b582ce73 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -48,10 +48,6 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string continue } - if !isManagedMimirNamespace(namespace) { - continue - } - subDiff, err := diffRuleNamespaceState(nil, actualRuleGroups) if err != nil { return nil, err diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/diff_test.go index 3fa350872035..9cab0daac863 100644 --- a/component/mimir/rules/diff_test.go +++ b/component/mimir/rules/diff_test.go @@ -32,15 +32,6 @@ groups: expr: 1 `)) - ruleGroupsB := parseRuleGroups(t, []byte(` -groups: -- name: rule-group-b - interval: 1m - rules: - - record: rule_b - expr: 2 -`)) - ruleGroupsAModified := parseRuleGroups(t, []byte(` groups: - name: rule-group-a @@ -51,12 +42,6 @@ groups: `)) managedNamespace := "agent/namespace/name/12345678-1234-1234-1234-123456789012" - unmanagedNamespace := "integrations/kubernetes" - - _ = ruleGroupsB - _ = ruleGroupsA - _ = ruleGroupsAModified - _ = unmanagedNamespace type testCase struct { name string @@ -130,17 +115,6 @@ groups: }, expected: map[string][]RuleGroupDiff{}, }, - { - name: "unmanaged namespaces", - desired: map[string][]mimirClient.RuleGroup{ - managedNamespace: ruleGroupsA, - }, - actual: map[string][]mimirClient.RuleGroup{ - managedNamespace: ruleGroupsA, - unmanagedNamespace: ruleGroupsB, - }, - expected: map[string][]RuleGroupDiff{}, - }, } for _, tc := range testCases { diff --git a/component/mimir/rules/events.go 
b/component/mimir/rules/events.go index 54acccaf13a7..a66a3cc3db08 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -78,6 +78,12 @@ func (c *Component) syncMimir(ctx context.Context) { return } + for ns := range rulesByNamespace { + if !isManagedMimirNamespace(c.args.MimirNameSpacePrefix, ns) { + delete(rulesByNamespace, ns) + } + } + c.currentState = rulesByNamespace } @@ -117,7 +123,7 @@ func (c *Component) loadStateFromK8s() (map[string][]mimirClient.RuleGroup, erro } for _, pr := range crdState { - mimirNs := mimirNamespaceForRuleCRD(pr) + mimirNs := mimirNamespaceForRuleCRD(c.args.MimirNameSpacePrefix, pr) groups, err := convertCRDRuleGroupToRuleGroup(pr.Spec) if err != nil { @@ -193,16 +199,19 @@ func (c *Component) applyChanges(ctx context.Context, namespace string, diffs [] // mimirNamespaceForRuleCRD returns the namespace that the rule CRD should be // stored in mimir. This function, along with isManagedNamespace, is used to // determine if a rule CRD is managed by the agent. -func mimirNamespaceForRuleCRD(pr *promv1.PrometheusRule) string { +func mimirNamespaceForRuleCRD(prefix string, pr *promv1.PrometheusRule) string { return fmt.Sprintf("agent/%s/%s/%s", pr.Namespace, pr.Name, pr.UID) } // isManagedMimirNamespace returns true if the namespace is managed by the agent. // Unmanaged namespaces are left as is by the operator. 
-func isManagedMimirNamespace(namespace string) bool { +func isManagedMimirNamespace(prefix, namespace string) bool { + prefixPart := regexp.QuoteMeta(prefix) namespacePart := `.+` namePart := `.+` uuidPart := `[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}` - managedNamespaceRegex := regexp.MustCompile(fmt.Sprintf("^(agent/)?%s/%s/%s$", namespacePart, namePart, uuidPart)) + managedNamespaceRegex := regexp.MustCompile( + fmt.Sprintf("^%s/%s/%s/%s$", prefixPart, namespacePart, namePart, uuidPart), + ) return managedNamespaceRegex.MatchString(namespace) } diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 84bda73912cb..040d0dfc47c1 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -321,9 +321,3 @@ func (c *Component) startRuleInformer() { factory.Start(c.informerStopChan) factory.WaitForCacheSync(c.informerStopChan) } - -func setDefaultArguments(args *Arguments) { - if args.SyncInterval == 0 { - args.SyncInterval = 30 * time.Second - } -} From aa7d2d77937488497088146bcf818b64abc31264 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 13:58:04 -0600 Subject: [PATCH 16/40] Rename arguments.go to types.go --- component/mimir/rules/rules.go | 3 --- component/mimir/rules/{arguments.go => types.go} | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) rename component/mimir/rules/{arguments.go => types.go} (98%) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 040d0dfc47c1..5db1f0828602 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -35,9 +35,6 @@ func init() { }) } -type Exports struct { -} - type Component struct { log log.Logger opts component.Options diff --git a/component/mimir/rules/arguments.go b/component/mimir/rules/types.go similarity index 98% rename from component/mimir/rules/arguments.go rename to component/mimir/rules/types.go index 32898973c94d..b2d30149dbce 100644 
--- a/component/mimir/rules/arguments.go +++ b/component/mimir/rules/types.go @@ -55,3 +55,6 @@ type TLSArguments struct { CipherSuites string `river:"tls_cipher_suites,attr,optional"` MinVersion string `river:"tls_min_version,attr,optional"` } + +type Exports struct { +} From 6c4c34d9693c1e0d986f20ceb49536e4e02fe8e7 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 14:29:39 -0600 Subject: [PATCH 17/40] Simplify event handler to use shared implementation --- component/mimir/rules/events.go | 14 +--- component/mimir/rules/events_test.go | 47 +++++++++++++ component/mimir/rules/rules.go | 101 +++++++-------------------- 3 files changed, 76 insertions(+), 86 deletions(-) create mode 100644 component/mimir/rules/events_test.go diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index a66a3cc3db08..2d906f0efae5 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -24,15 +24,8 @@ type Event struct { type EventType string const ( - EventTypeAddRule EventType = "add-rule" - EventTypeUpdateRule EventType = "update-rule" - EventTypeDeleteRule EventType = "delete-rule" - - EventTypeAddNamespace EventType = "add-namespace" - EventTypeUpdateNamespace EventType = "update-namespace" - EventTypeDeleteNamespace EventType = "delete-namespace" - - EventTypeSyncMimir EventType = "sync-mimir" + EventTypeResourceChanged EventType = "resource-changed" + EventTypeSyncMimir EventType = "sync-mimir" ) func (c *Component) eventLoop(ctx context.Context) { @@ -58,8 +51,7 @@ func (c *Component) eventLoop(ctx context.Context) { } func (c *Component) processEvent(ctx context.Context, e Event) error { switch e.Type { - case EventTypeAddRule, EventTypeUpdateRule, EventTypeDeleteRule, - EventTypeAddNamespace, EventTypeUpdateNamespace, EventTypeDeleteNamespace: + case EventTypeResourceChanged: level.Info(c.log).Log("msg", "processing event", "type", e.Type, "key", e.ObjectKey) case EventTypeSyncMimir: 
level.Debug(c.log).Log("msg", "syncing current state from ruler") diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go new file mode 100644 index 000000000000..cfc768dd6f83 --- /dev/null +++ b/component/mimir/rules/events_test.go @@ -0,0 +1,47 @@ +package rules + +import ( + "os" + "testing" + + "github.com/go-kit/log" + v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/workqueue" +) + +func TestQueueEventHandler(t *testing.T) { + handler := Component{ + log: log.NewLogfmtLogger(os.Stdout), + queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), + } + + ns := &v1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "name", + Namespace: "namespace", + }, + } + + handler.OnAdd(ns) + event, _ := handler.queue.Get() + require.Equal(t, EventTypeResourceChanged, event.(Event).Type) + require.Equal(t, "namespace/name", event.(Event).ObjectKey) + handler.queue.Forget(event) + handler.queue.Done(event) + + handler.OnDelete(ns) + event, _ = handler.queue.Get() + require.Equal(t, EventTypeResourceChanged, event.(Event).Type) + require.Equal(t, "namespace/name", event.(Event).ObjectKey) + handler.queue.Forget(event) + handler.queue.Done(event) + + handler.OnUpdate(ns, ns) + event, _ = handler.queue.Get() + require.Equal(t, EventTypeResourceChanged, event.(Event).Type) + require.Equal(t, "namespace/name", event.(Event).ObjectKey) + handler.queue.Forget(event) + handler.queue.Done(event) +} diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 5db1f0828602..a25653baf53a 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -221,44 +221,7 @@ func (c *Component) startNamespaceInformer() { namespaces := factory.Core().V1().Namespaces() c.namespaceLister = namespaces.Lister() c.namespaceInformer = namespaces.Informer() 
- c.namespaceInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - key, err := cache.MetaNamespaceKeyFunc(obj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } - - c.queue.AddRateLimited(Event{ - Type: EventTypeAddNamespace, - ObjectKey: key, - }) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - newKey, err := cache.MetaNamespaceKeyFunc(newObj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } - - c.queue.AddRateLimited(Event{ - Type: EventTypeUpdateNamespace, - ObjectKey: newKey, - }) - }, - DeleteFunc: func(obj interface{}) { - key, err := cache.MetaNamespaceKeyFunc(obj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } - - c.queue.AddRateLimited(Event{ - Type: EventTypeDeleteNamespace, - ObjectKey: key, - }) - }, - }) + c.namespaceInformer.AddEventHandler(c) factory.Start(c.informerStopChan) factory.WaitForCacheSync(c.informerStopChan) @@ -276,45 +239,33 @@ func (c *Component) startRuleInformer() { promRules := factory.Monitoring().V1().PrometheusRules() c.ruleLister = promRules.Lister() c.ruleInformer = promRules.Informer() - c.ruleInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - key, err := cache.MetaNamespaceKeyFunc(obj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } + c.ruleInformer.AddEventHandler(c) - c.queue.AddRateLimited(Event{ - Type: EventTypeAddRule, - ObjectKey: key, - }) - }, - UpdateFunc: func(oldObj, newObj interface{}) { - newKey, err := cache.MetaNamespaceKeyFunc(newObj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } + factory.Start(c.informerStopChan) + factory.WaitForCacheSync(c.informerStopChan) +} - c.queue.AddRateLimited(Event{ - Type: EventTypeUpdateRule, 
- ObjectKey: newKey, - }) - }, - DeleteFunc: func(obj interface{}) { - key, err := cache.MetaNamespaceKeyFunc(obj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key from object", "err", err) - return - } +func (c *Component) OnAdd(obj interface{}) { + c.publishEvent(obj) +} - c.queue.AddRateLimited(Event{ - Type: EventTypeDeleteRule, - ObjectKey: key, - }) - }, - }) +func (c *Component) OnUpdate(oldObj, newObj interface{}) { + c.publishEvent(newObj) +} - factory.Start(c.informerStopChan) - factory.WaitForCacheSync(c.informerStopChan) +func (c *Component) OnDelete(obj interface{}) { + c.publishEvent(obj) +} + +func (c *Component) publishEvent(obj interface{}) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key for object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeResourceChanged, + ObjectKey: key, + }) } From 96d684360978d65992d6e984e740d7156e8da640 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 15:57:54 -0600 Subject: [PATCH 18/40] Add integration tests for event pipeline --- component/mimir/rules/events.go | 26 +++++ component/mimir/rules/events_test.go | 164 +++++++++++++++++++++++---- component/mimir/rules/rules.go | 27 +---- pkg/mimir/client/client.go | 6 + 4 files changed, 172 insertions(+), 51 deletions(-) diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 2d906f0efae5..0bd6ec271eca 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -12,6 +12,7 @@ import ( "github.com/grafana/dskit/multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" + "k8s.io/client-go/tools/cache" ) // This type must be hashable, so it is kept simple. 
The indexer will maintain a @@ -28,6 +29,31 @@ const ( EventTypeSyncMimir EventType = "sync-mimir" ) +func (c *Component) OnAdd(obj interface{}) { + c.publishEvent(obj) +} + +func (c *Component) OnUpdate(oldObj, newObj interface{}) { + c.publishEvent(newObj) +} + +func (c *Component) OnDelete(obj interface{}) { + c.publishEvent(obj) +} + +func (c *Component) publishEvent(obj interface{}) { + key, err := cache.MetaNamespaceKeyFunc(obj) + if err != nil { + level.Error(c.log).Log("msg", "failed to get key for object", "err", err) + return + } + + c.queue.AddRateLimited(Event{ + Type: EventTypeResourceChanged, + ObjectKey: key, + }) +} + func (c *Component) eventLoop(ctx context.Context) { for { event, shutdown := c.queue.Get() diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go index cfc768dd6f83..cb7bef7d0318 100644 --- a/component/mimir/rules/events_test.go +++ b/component/mimir/rules/events_test.go @@ -1,47 +1,161 @@ package rules import ( + "context" "os" "testing" + "time" "github.com/go-kit/log" + mimirClient "github.com/grafana/agent/pkg/mimir/client" v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + coreListers "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" ) -func TestQueueEventHandler(t *testing.T) { - handler := Component{ - log: log.NewLogfmtLogger(os.Stdout), - queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), +type fakeMimirClient struct { + rules map[string][]mimirClient.RuleGroup +} + +var _ mimirClient.Interface = &fakeMimirClient{} + +func newFakeMimirClient() *fakeMimirClient { + 
return &fakeMimirClient{ + rules: make(map[string][]mimirClient.RuleGroup), + } +} + +func (m *fakeMimirClient) CreateRuleGroup(ctx context.Context, namespace string, rule mimirClient.RuleGroup) error { + m.DeleteRuleGroup(ctx, namespace, rule.Name) + m.rules[namespace] = append(m.rules[namespace], rule) + return nil +} + +func (m *fakeMimirClient) DeleteRuleGroup(ctx context.Context, namespace, group string) error { + for ns, v := range m.rules { + for i, g := range v { + if g.Name == group { + m.rules[ns] = append(m.rules[ns][:i], m.rules[ns][i+1:]...) + + if len(m.rules[ns]) == 0 { + delete(m.rules, ns) + } + + return nil + } + } + } + return nil +} + +func (m *fakeMimirClient) ListRules(ctx context.Context, namespace string) (map[string][]mimirClient.RuleGroup, error) { + output := make(map[string][]mimirClient.RuleGroup) + for ns, v := range m.rules { + if namespace != "" && namespace != ns { + continue + } + output[ns] = v + } + return output, nil +} + +func TestEventLoop(t *testing.T) { + nsIndexer := cache.NewIndexer( + cache.DeletionHandlingMetaNamespaceKeyFunc, + cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, + ) + nsLister := coreListers.NewNamespaceLister(nsIndexer) + + ruleIndexer := cache.NewIndexer( + cache.DeletionHandlingMetaNamespaceKeyFunc, + cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, + ) + ruleLister := promListers.NewPrometheusRuleLister(ruleIndexer) + + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "namespace", + UID: types.UID("33f8860c-bd06-4c0d-a0b1-a114d6b9937b"), + }, } - ns := &v1.PrometheusRule{ + rule := &v1.PrometheusRule{ ObjectMeta: metav1.ObjectMeta{ Name: "name", Namespace: "namespace", + UID: types.UID("64aab764-c95e-4ee9-a932-cd63ba57e6cf"), + }, + Spec: v1.PrometheusRuleSpec{ + Groups: []v1.RuleGroup{ + { + Name: "group", + Rules: []v1.Rule{ + { + Alert: "alert", + Expr: intstr.FromString("expr"), + }, + }, + }, + }, }, } - handler.OnAdd(ns) - event, _ := 
handler.queue.Get() - require.Equal(t, EventTypeResourceChanged, event.(Event).Type) - require.Equal(t, "namespace/name", event.(Event).ObjectKey) - handler.queue.Forget(event) - handler.queue.Done(event) - - handler.OnDelete(ns) - event, _ = handler.queue.Get() - require.Equal(t, EventTypeResourceChanged, event.(Event).Type) - require.Equal(t, "namespace/name", event.(Event).ObjectKey) - handler.queue.Forget(event) - handler.queue.Done(event) - - handler.OnUpdate(ns, ns) - event, _ = handler.queue.Get() - require.Equal(t, EventTypeResourceChanged, event.(Event).Type) - require.Equal(t, "namespace/name", event.(Event).ObjectKey) - handler.queue.Forget(event) - handler.queue.Done(event) + handler := Component{ + log: log.NewLogfmtLogger(os.Stdout), + queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), + namespaceLister: nsLister, + namespaceSelector: labels.Everything(), + ruleLister: ruleLister, + ruleSelector: labels.Everything(), + mimirClient: newFakeMimirClient(), + args: Arguments{MimirNameSpacePrefix: "agent"}, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go handler.eventLoop(ctx) + + // Add a namespace and rule to kubernetes + nsIndexer.Add(ns) + ruleIndexer.Add(rule) + handler.OnAdd(rule) + + // Wait for the rule to be added to mimir + require.Eventually(t, func() bool { + return len(handler.currentState) == 1 + }, time.Second, 10*time.Millisecond) + handler.queue.AddRateLimited(Event{Type: EventTypeSyncMimir}) + + // Update the rule in kubernetes + rule.Spec.Groups[0].Rules = append(rule.Spec.Groups[0].Rules, v1.Rule{ + Alert: "alert2", + Expr: intstr.FromString("expr2"), + }) + ruleIndexer.Update(rule) + handler.OnUpdate(rule, rule) + + // Wait for the rule to be updated in mimir + require.Eventually(t, func() bool { + rules := handler.currentState[mimirNamespaceForRuleCRD("agent", rule)][0].Rules + return len(rules) == 2 + }, time.Second, 10*time.Millisecond) + 
handler.queue.AddRateLimited(Event{Type: EventTypeSyncMimir}) + + // Remove the rule from kubernetes + ruleIndexer.Delete(rule) + handler.OnDelete(rule) + + // Wait for the rule to be removed from mimir + require.Eventually(t, func() bool { + return len(handler.currentState) == 0 + }, time.Second, 10*time.Millisecond) } diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index a25653baf53a..a606c76b52ec 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -40,7 +40,7 @@ type Component struct { opts component.Options args Arguments - mimirClient *mimirClient.MimirClient + mimirClient mimirClient.Interface k8sClient kubernetes.Interface promClient promVersioned.Interface ruleLister promListers.PrometheusRuleLister @@ -244,28 +244,3 @@ func (c *Component) startRuleInformer() { factory.Start(c.informerStopChan) factory.WaitForCacheSync(c.informerStopChan) } - -func (c *Component) OnAdd(obj interface{}) { - c.publishEvent(obj) -} - -func (c *Component) OnUpdate(oldObj, newObj interface{}) { - c.publishEvent(newObj) -} - -func (c *Component) OnDelete(obj interface{}) { - c.publishEvent(obj) -} - -func (c *Component) publishEvent(obj interface{}) { - key, err := cache.MetaNamespaceKeyFunc(obj) - if err != nil { - level.Error(c.log).Log("msg", "failed to get key for object", "err", err) - return - } - - c.queue.AddRateLimited(Event{ - Type: EventTypeResourceChanged, - ObjectKey: key, - }) -} diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index 5a5a9fab36cb..919fcb11bfc3 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -37,6 +37,12 @@ type Config struct { AuthToken string `yaml:"auth_token"` } +type Interface interface { + CreateRuleGroup(ctx context.Context, namespace string, rg RuleGroup) error + DeleteRuleGroup(ctx context.Context, namespace, groupName string) error + ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) +} + // MimirClient 
is a client to the Mimir API. type MimirClient struct { user string From 6e9fd949dfb85530c4f9f409134d3faa74aa8652 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 17:09:51 -0600 Subject: [PATCH 19/40] Simplify MimirClient - Remove methods copied from cortextool that are unused here --- pkg/mimir/client/alerts.go | 76 -------------------------------------- pkg/mimir/client/client.go | 13 ------- pkg/mimir/client/rules.go | 35 ------------------ 3 files changed, 124 deletions(-) delete mode 100644 pkg/mimir/client/alerts.go diff --git a/pkg/mimir/client/alerts.go b/pkg/mimir/client/alerts.go deleted file mode 100644 index 47b049b0b17f..000000000000 --- a/pkg/mimir/client/alerts.go +++ /dev/null @@ -1,76 +0,0 @@ -package client - -import ( - "context" - "io" - - "github.com/pkg/errors" - log "github.com/sirupsen/logrus" - "gopkg.in/yaml.v3" -) - -const alertmanagerAPIPath = "/api/v1/alerts" - -type configCompat struct { - TemplateFiles map[string]string `yaml:"template_files"` - AlertmanagerConfig string `yaml:"alertmanager_config"` -} - -// CreateAlertmanagerConfig creates a new alertmanager config -func (r *MimirClient) CreateAlertmanagerConfig(ctx context.Context, cfg string, templates map[string]string) error { - payload, err := yaml.Marshal(&configCompat{ - TemplateFiles: templates, - AlertmanagerConfig: cfg, - }) - if err != nil { - return err - } - - res, err := r.doRequest(alertmanagerAPIPath, "POST", payload) - if err != nil { - return err - } - - res.Body.Close() - - return nil -} - -// DeleteAlermanagerConfig deletes the users alertmanagerconfig -func (r *MimirClient) DeleteAlermanagerConfig(ctx context.Context) error { - res, err := r.doRequest(alertmanagerAPIPath, "DELETE", nil) - if err != nil { - return err - } - - res.Body.Close() - - return nil -} - -// GetAlertmanagerConfig retrieves a Mimir cluster's Alertmanager config. 
-func (r *MimirClient) GetAlertmanagerConfig(ctx context.Context) (string, map[string]string, error) { - res, err := r.doRequest(alertmanagerAPIPath, "GET", nil) - if err != nil { - log.Debugln("no alert config present in response") - return "", nil, err - } - - defer res.Body.Close() - body, err := io.ReadAll(res.Body) - if err != nil { - return "", nil, err - } - - compat := configCompat{} - err = yaml.Unmarshal(body, &compat) - if err != nil { - log.WithFields(log.Fields{ - "body": string(body), - }).Debugln("failed to unmarshal rule group from response") - - return "", nil, errors.Wrap(err, "unable to unmarshal response") - } - - return compat.AlertmanagerConfig, compat.TemplateFiles, nil -} diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index 919fcb11bfc3..6fdb6aa397aa 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -9,7 +9,6 @@ import ( "net/http" "net/url" "strings" - "time" "github.com/grafana/dskit/crypto/tls" "github.com/pkg/errors" @@ -103,18 +102,6 @@ func New(cfg Config) (*MimirClient, error) { }, nil } -// Query executes a PromQL query against the Mimir cluster. 
-func (r *MimirClient) Query(ctx context.Context, query string) (*http.Response, error) { - req := fmt.Sprintf("/prometheus/api/v1/query?query=%s&time=%d", url.QueryEscape(query), time.Now().Unix()) - - res, err := r.doRequest(req, "GET", nil) - if err != nil { - return nil, err - } - - return res, nil -} - func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Response, error) { req, err := buildRequest(path, method, *r.endpoint, payload) if err != nil { diff --git a/pkg/mimir/client/rules.go b/pkg/mimir/client/rules.go index 41a2e577f1b6..4eea951a7f2a 100644 --- a/pkg/mimir/client/rules.go +++ b/pkg/mimir/client/rules.go @@ -2,13 +2,10 @@ package client import ( "context" - "fmt" "io" "net/url" - "github.com/pkg/errors" "github.com/prometheus/prometheus/model/rulefmt" - log "github.com/sirupsen/logrus" "gopkg.in/yaml.v3" ) @@ -60,38 +57,6 @@ func (r *MimirClient) DeleteRuleGroup(ctx context.Context, namespace, groupName return nil } -// GetRuleGroup retrieves a rule group -func (r *MimirClient) GetRuleGroup(ctx context.Context, namespace, groupName string) (*RuleGroup, error) { - escapedNamespace := url.PathEscape(namespace) - escapedGroupName := url.PathEscape(groupName) - path := r.apiPath + "/" + escapedNamespace + "/" + escapedGroupName - - fmt.Println(path) - res, err := r.doRequest(path, "GET", nil) - if err != nil { - return nil, err - } - - defer res.Body.Close() - body, err := io.ReadAll(res.Body) - - if err != nil { - return nil, err - } - - rg := RuleGroup{} - err = yaml.Unmarshal(body, &rg) - if err != nil { - log.WithFields(log.Fields{ - "body": string(body), - }).Debugln("failed to unmarshal rule group from response") - - return nil, errors.Wrap(err, "unable to unmarshal response") - } - - return &rg, nil -} - // ListRules retrieves a rule group func (r *MimirClient) ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) { path := r.apiPath From 3b498031353396ffc35d7c09ea54d441c2eb686c Mon Sep 17 
00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 17:16:01 -0600 Subject: [PATCH 20/40] Remove logrus from mimir client Use go-kit/log instead --- component/mimir/rules/rules.go | 2 +- pkg/mimir/client/client.go | 81 ++++++++++++++++------------------ pkg/mimir/client/rules_test.go | 3 +- 3 files changed, 42 insertions(+), 44 deletions(-) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index a606c76b52ec..9746de913a2a 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -156,7 +156,7 @@ func (c *Component) init() error { return fmt.Errorf("failed to create prometheus operator client: %w", err) } - c.mimirClient, err = mimirClient.New(mimirClient.Config{ + c.mimirClient, err = mimirClient.New(c.log, mimirClient.Config{ User: c.args.ClientParams.User, Key: string(c.args.ClientParams.Key), Address: c.args.ClientParams.Address, diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index 6fdb6aa397aa..e39c813f209e 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -10,9 +10,10 @@ import ( "net/url" "strings" + log "github.com/go-kit/log" + "github.com/go-kit/log/level" "github.com/grafana/dskit/crypto/tls" "github.com/pkg/errors" - log "github.com/sirupsen/logrus" ) const ( @@ -51,31 +52,31 @@ type MimirClient struct { Client http.Client apiPath string authToken string + logger log.Logger } // New returns a new MimirClient. 
-func New(cfg Config) (*MimirClient, error) { +func New(logger log.Logger, cfg Config) (*MimirClient, error) { endpoint, err := url.Parse(cfg.Address) if err != nil { return nil, err } - log.WithFields(log.Fields{ - "address": cfg.Address, - "id": cfg.ID, - }).Debugln("New ruler client created") + level.Debug(logger).Log("msg", "New Mimir client created", "address", cfg.Address, "id", cfg.ID) client := http.Client{} // Setup TLS client tlsConfig, err := cfg.TLS.GetTLSConfig() if err != nil { - log.WithError(err).WithFields(log.Fields{ - "tls-ca": cfg.TLS.CAPath, - "tls-cert": cfg.TLS.CertPath, - "tls-key": cfg.TLS.KeyPath, - }).Errorf("error loading tls files") - return nil, fmt.Errorf("client initialization unsuccessful") + level.Error(logger).Log( + "msg", "error loading TLS files", + "tls-ca", cfg.TLS.CAPath, + "tls-cert", cfg.TLS.CertPath, + "tls-key", cfg.TLS.KeyPath, + "err", err, + ) + return nil, fmt.Errorf("Mimir client initialization unsuccessful") } if tlsConfig != nil { @@ -99,6 +100,7 @@ func New(cfg Config) (*MimirClient, error) { Client: client, apiPath: path, authToken: cfg.AuthToken, + logger: logger, }, nil } @@ -110,11 +112,12 @@ func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Resp if (r.user != "" || r.key != "") && r.authToken != "" { err := errors.New("atmost one of basic auth or auth token should be configured") - log.WithFields(log.Fields{ - "url": req.URL.String(), - "method": req.Method, - "error": err, - }).Errorln("error during request to Mimir api") + level.Error(r.logger).Log( + "msg", "error during setting up request to mimir api", + "url", req.URL.String(), + "method", req.Method, + "error", err, + ) return nil, err } @@ -130,34 +133,34 @@ func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Resp req.Header.Add("X-Scope-OrgID", r.id) - log.WithFields(log.Fields{ - "url": req.URL.String(), - "method": req.Method, - }).Debugln("sending request to Mimir api") + 
level.Debug(r.logger).Log( + "msg", "sending request to Grafana Mimir API", + "url", req.URL.String(), + "method", req.Method, + ) resp, err := r.Client.Do(req) if err != nil { - log.WithFields(log.Fields{ - "url": req.URL.String(), - "method": req.Method, - "error": err.Error(), - }).Errorln("error during request to Mimir api") + level.Error(r.logger).Log( + "msg", "error during request to Grafana Mimir API", + "url", req.URL.String(), + "method", req.Method, + "error", err, + ) return nil, err } - err = checkResponse(resp) - if err != nil { - return nil, err + if err := checkResponse(r.logger, resp); err != nil { + _ = resp.Body.Close() + return nil, errors.Wrapf(err, "%s request to %s failed", req.Method, req.URL.String()) } return resp, nil } -// checkResponse checks the API response for errors -func checkResponse(r *http.Response) error { - log.WithFields(log.Fields{ - "status": r.Status, - }).Debugln("checking response") +// checkResponse checks an API response for errors. +func checkResponse(logger log.Logger, r *http.Response) error { + level.Debug(logger).Log("msg", "checking response", "status", r.Status) if 200 <= r.StatusCode && r.StatusCode <= 299 { return nil } @@ -175,17 +178,11 @@ func checkResponse(r *http.Response) error { } if r.StatusCode == http.StatusNotFound { - log.WithFields(log.Fields{ - "status": r.Status, - "msg": msg, - }).Debugln(errMsg) + level.Debug(logger).Log("msg", msg, "status", r.Status) return ErrResourceNotFound } - log.WithFields(log.Fields{ - "status": r.Status, - "msg": msg, - }).Errorln(errMsg) + level.Error(logger).Log("msg", msg, "status", r.Status) return errors.New(errMsg) } diff --git a/pkg/mimir/client/rules_test.go b/pkg/mimir/client/rules_test.go index d98acf4b6512..606dda0d8882 100644 --- a/pkg/mimir/client/rules_test.go +++ b/pkg/mimir/client/rules_test.go @@ -7,6 +7,7 @@ import ( "net/http/httptest" "testing" + "github.com/go-kit/log" "github.com/stretchr/testify/require" ) @@ -19,7 +20,7 @@ func 
TestMimirClient_X(t *testing.T) { })) defer ts.Close() - client, err := New(Config{ + client, err := New(log.NewNopLogger(), Config{ Address: ts.URL, ID: "my-id", Key: "my-key", From 2002adde322ce7a6a6432dd4e9146f4e9e629f28 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 17:26:29 -0600 Subject: [PATCH 21/40] Remove redundant default setting --- component/mimir/rules/rules.go | 1 - 1 file changed, 1 deletion(-) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 9746de913a2a..3e3d9c4cb79b 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -69,7 +69,6 @@ var _ component.Component = (*Component)(nil) var _ component.DebugComponent = (*Component)(nil) func NewComponent(o component.Options, c Arguments) (*Component, error) { - setDefaultArguments(&c) return &Component{ log: o.Logger, opts: o, From 02bc89e18bed712010226b88286eb53f24451881 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 17:50:26 -0600 Subject: [PATCH 22/40] Remove extra logging from MimirClient --- pkg/mimir/client/client.go | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index e39c813f209e..cdb6f8cd9b3a 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -112,12 +112,6 @@ func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Resp if (r.user != "" || r.key != "") && r.authToken != "" { err := errors.New("atmost one of basic auth or auth token should be configured") - level.Error(r.logger).Log( - "msg", "error during setting up request to mimir api", - "url", req.URL.String(), - "method", req.Method, - "error", err, - ) return nil, err } @@ -133,24 +127,12 @@ func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Resp req.Header.Add("X-Scope-OrgID", r.id) - level.Debug(r.logger).Log( - "msg", "sending request to Grafana Mimir API", - 
"url", req.URL.String(), - "method", req.Method, - ) - resp, err := r.Client.Do(req) if err != nil { - level.Error(r.logger).Log( - "msg", "error during request to Grafana Mimir API", - "url", req.URL.String(), - "method", req.Method, - "error", err, - ) return nil, err } - if err := checkResponse(r.logger, resp); err != nil { + if err := checkResponse(resp); err != nil { _ = resp.Body.Close() return nil, errors.Wrapf(err, "%s request to %s failed", req.Method, req.URL.String()) } @@ -159,8 +141,7 @@ func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Resp } // checkResponse checks an API response for errors. -func checkResponse(logger log.Logger, r *http.Response) error { - level.Debug(logger).Log("msg", "checking response", "status", r.Status) +func checkResponse(r *http.Response) error { if 200 <= r.StatusCode && r.StatusCode <= 299 { return nil } @@ -178,12 +159,9 @@ func checkResponse(logger log.Logger, r *http.Response) error { } if r.StatusCode == http.StatusNotFound { - level.Debug(logger).Log("msg", msg, "status", r.Status) return ErrResourceNotFound } - level.Error(logger).Log("msg", msg, "status", r.Status) - return errors.New(errMsg) } From 93695715ff84cb53a18ac43772a40b74f914d60c Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 17:50:42 -0600 Subject: [PATCH 23/40] Replace default argument value setting --- component/mimir/rules/rules.go | 1 + 1 file changed, 1 insertion(+) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 3e3d9c4cb79b..9746de913a2a 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -69,6 +69,7 @@ var _ component.Component = (*Component)(nil) var _ component.DebugComponent = (*Component)(nil) func NewComponent(o component.Options, c Arguments) (*Component, error) { + setDefaultArguments(&c) return &Component{ log: o.Logger, opts: o, From e454ade9eeb236c69c6445203ce66e18c595011a Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun 
Date: Wed, 7 Dec 2022 17:51:07 -0600 Subject: [PATCH 24/40] Retry event processing up to 5 times --- component/mimir/rules/events.go | 46 ++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 0bd6ec271eca..58317414a2cf 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -62,20 +62,34 @@ func (c *Component) eventLoop(ctx context.Context) { return } - evt := event.(Event) - err := c.processEvent(ctx, evt) + err := c.processEvent(ctx, event.(Event)) if err != nil { - // TODO: retry limits? - level.Error(c.log).Log("msg", "failed to process event", "err", err) - // c.queue.AddRateLimited(event) - } else { - c.queue.Forget(event) - c.queue.Done(event) + retries := c.queue.NumRequeues(event) + if retries < 5 { + c.queue.AddRateLimited(event) + level.Error(c.log).Log( + "msg", "failed to process event, will retry", + "retries", fmt.Sprintf("%d/5", retries), + "err", err, + ) + continue + } else { + level.Error(c.log).Log( + "msg", "failed to process event, max retries exceeded", + "retries", fmt.Sprintf("%d/5", retries), + "err", err, + ) + } } + + c.queue.Forget(event) } } + func (c *Component) processEvent(ctx context.Context, e Event) error { + defer c.queue.Done(e) + switch e.Type { case EventTypeResourceChanged: level.Info(c.log).Log("msg", "processing event", "type", e.Type, "key", e.ObjectKey) @@ -103,6 +117,8 @@ func (c *Component) syncMimir(ctx context.Context) { } c.currentState = rulesByNamespace + + return } func (c *Component) reconcileState(ctx context.Context) error { @@ -116,15 +132,16 @@ func (c *Component) reconcileState(ctx context.Context) error { return err } + errs := multierror.New() for ns, diff := range diffs { err = c.applyChanges(ctx, ns, diff) if err != nil { - level.Error(c.log).Log("msg", "failed to apply changes", "mimir-namespace", ns, "err", err) + errs = append(errs, err) continue } } - 
return nil + return errs.Err() } func (c *Component) loadStateFromK8s() (map[string][]mimirClient.RuleGroup, error) { @@ -182,33 +199,32 @@ func (c *Component) applyChanges(ctx context.Context, namespace string, diffs [] return nil } - level.Info(c.log).Log("msg", "applying rule changes", "num_changes", len(diffs)) - for _, diff := range diffs { switch diff.Kind { case RuleGroupDiffKindAdd: - level.Info(c.log).Log("msg", "adding rule group", "group", diff.Desired.Name) err := c.mimirClient.CreateRuleGroup(ctx, namespace, diff.Desired) if err != nil { return err } + level.Info(c.log).Log("msg", "added rule group", "namespace", namespace, "group", diff.Desired.Name) case RuleGroupDiffKindRemove: - level.Info(c.log).Log("msg", "removing rule group", "group", diff.Actual.Name) err := c.mimirClient.DeleteRuleGroup(ctx, namespace, diff.Actual.Name) if err != nil { return err } + level.Info(c.log).Log("msg", "removed rule group", "namespace", namespace, "group", diff.Actual.Name) case RuleGroupDiffKindUpdate: - level.Info(c.log).Log("msg", "updating rule group", "group", diff.Desired.Name) err := c.mimirClient.CreateRuleGroup(ctx, namespace, diff.Desired) if err != nil { return err } + level.Info(c.log).Log("msg", "updated rule group", "namespace", namespace, "group", diff.Desired.Name) default: level.Error(c.log).Log("msg", "unknown rule group diff kind", "kind", diff.Kind) } } + // resync mimir state after applying changes c.syncMimir(ctx) return nil From 35d19d3c5894688f643a2cf6679195f09cf8a872 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 21:38:55 -0600 Subject: [PATCH 25/40] Add metrics for the workqueue and k8s client - These are namespaced `prometheus_sd_kubernetes` because the registration is global. 
--- component/mimir/rules/rules.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 9746de913a2a..f303948c0935 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -18,6 +18,7 @@ import ( coreListers "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" + _ "k8s.io/component-base/metrics/prometheus/workqueue" controller "sigs.k8s.io/controller-runtime" promExternalVersions "github.com/prometheus-operator/prometheus-operator/pkg/client/informers/externalversions" @@ -113,7 +114,7 @@ func (c *Component) startup(ctx context.Context) error { return err } - c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) + c.queue = workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "mimir.rules") c.informerStopChan = make(chan struct{}) c.startNamespaceInformer() From 92666a0d0cda8b48b67cd94b129335f70f01670b Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 7 Dec 2022 22:18:10 -0600 Subject: [PATCH 26/40] Add metrics for mimir client and event pipeline --- component/mimir/rules/events.go | 6 +++- component/mimir/rules/rules.go | 64 ++++++++++++++++++++++++++++++++- pkg/mimir/client/client.go | 30 +++++++++++----- pkg/mimir/client/client_test.go | 2 +- pkg/mimir/client/rules.go | 10 ++++-- pkg/mimir/client/rules_test.go | 4 ++- 6 files changed, 100 insertions(+), 16 deletions(-) diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 58317414a2cf..876f4acefaa8 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -62,11 +62,14 @@ func (c *Component) eventLoop(ctx context.Context) { return } - err := c.processEvent(ctx, event.(Event)) + evt := event.(Event) + c.metrics.eventsTotal.WithLabelValues(string(evt.Type)).Inc() + err := c.processEvent(ctx, evt) if err != nil { retries := 
c.queue.NumRequeues(event) if retries < 5 { + c.metrics.eventsRetried.WithLabelValues(string(evt.Type)).Inc() c.queue.AddRateLimited(event) level.Error(c.log).Log( "msg", "failed to process event, will retry", @@ -75,6 +78,7 @@ func (c *Component) eventLoop(ctx context.Context) { ) continue } else { + c.metrics.eventsFailed.WithLabelValues(string(evt.Type)).Inc() level.Error(c.log).Log( "msg", "failed to process event, max retries exceeded", "retries", fmt.Sprintf("%d/5", retries), diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index f303948c0935..9ce68867ed5f 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -11,6 +11,8 @@ import ( mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/crypto/tls" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" + "github.com/prometheus/client_golang/prometheus" + "github.com/weaveworks/common/instrument" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" @@ -59,6 +61,60 @@ type Component struct { ruleSelector labels.Selector currentState map[string][]mimirClient.RuleGroup + + metrics *metrics +} + +type metrics struct { + configUpdatesTotal prometheus.Counter + + eventsTotal *prometheus.CounterVec + eventsFailed *prometheus.CounterVec + eventsRetried *prometheus.CounterVec + + mimirClientTiming *prometheus.HistogramVec +} + +func (m *metrics) Register(r prometheus.Registerer) error { + r.MustRegister( + m.configUpdatesTotal, + m.eventsTotal, + m.eventsFailed, + m.eventsRetried, + m.mimirClientTiming, + ) + return nil +} + +func newMetrics() *metrics { + return &metrics{ + configUpdatesTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Subsystem: "mimir_rules", + Name: "config_updates_total", + Help: "Total number of times the configuration has been updated.", + }), + eventsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ + 
Subsystem: "mimir_rules", + Name: "events_total", + Help: "Total number of events processed, partitioned by event type.", + }, []string{"type"}), + eventsFailed: prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: "mimir_rules", + Name: "events_failed_total", + Help: "Total number of events that failed to be processed, even after retries, partitioned by event type.", + }, []string{"type"}), + eventsRetried: prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: "mimir_rules", + Name: "events_retried_total", + Help: "Total number of retries across all events, partitioned by event type.", + }, []string{"type"}), + mimirClientTiming: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: "mimir_rules", + Name: "mimir_client_request_duration_seconds", + Help: "Duration of requests to the Mimir API.", + Buckets: instrument.DefBuckets, + }, instrument.HistogramCollectorBuckets), + } } type ConfigUpdate struct { @@ -71,12 +127,17 @@ var _ component.DebugComponent = (*Component)(nil) func NewComponent(o component.Options, c Arguments) (*Component, error) { setDefaultArguments(&c) + + metrics := newMetrics() + metrics.Register(o.Registerer) + return &Component{ log: o.Logger, opts: o, args: c, configUpdates: make(chan ConfigUpdate), ticker: time.NewTicker(c.SyncInterval), + metrics: metrics, }, nil } @@ -89,6 +150,7 @@ func (c *Component) Run(ctx context.Context) error { for { select { case update := <-c.configUpdates: + c.metrics.configUpdatesTotal.Inc() c.shutdown() c.args = update.args @@ -173,7 +235,7 @@ func (c *Component) init() error { }, UseLegacyRoutes: c.args.ClientParams.UseLegacyRoutes, AuthToken: string(c.args.ClientParams.AuthToken), - }) + }, c.metrics.mimirClientTiming) if err != nil { return err } diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index cdb6f8cd9b3a..89fdece2474a 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -14,6 +14,9 @@ import ( "github.com/go-kit/log/level" 
"github.com/grafana/dskit/crypto/tls" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + weaveworksClient "github.com/weaveworks/common/http/client" + "github.com/weaveworks/common/instrument" ) const ( @@ -49,14 +52,14 @@ type MimirClient struct { key string id string endpoint *url.URL - Client http.Client + Client weaveworksClient.Requester apiPath string authToken string logger log.Logger } // New returns a new MimirClient. -func New(logger log.Logger, cfg Config) (*MimirClient, error) { +func New(logger log.Logger, cfg Config, timingHistogram *prometheus.HistogramVec) (*MimirClient, error) { endpoint, err := url.Parse(cfg.Address) if err != nil { return nil, err @@ -64,7 +67,7 @@ func New(logger log.Logger, cfg Config) (*MimirClient, error) { level.Debug(logger).Log("msg", "New Mimir client created", "address", cfg.Address, "id", cfg.ID) - client := http.Client{} + client := &http.Client{} // Setup TLS client tlsConfig, err := cfg.TLS.GetTLSConfig() @@ -84,7 +87,7 @@ func New(logger log.Logger, cfg Config) (*MimirClient, error) { Proxy: http.ProxyFromEnvironment, TLSClientConfig: tlsConfig, } - client = http.Client{Transport: transport} + client = &http.Client{Transport: transport} } path := rulerAPIPath @@ -92,20 +95,23 @@ func New(logger log.Logger, cfg Config) (*MimirClient, error) { path = legacyAPIPath } + collector := instrument.NewHistogramCollector(timingHistogram) + timedClient := weaveworksClient.NewTimedClient(client, collector) + return &MimirClient{ user: cfg.User, key: cfg.Key, id: cfg.ID, endpoint: endpoint, - Client: client, + Client: timedClient, apiPath: path, authToken: cfg.AuthToken, logger: logger, }, nil } -func (r *MimirClient) doRequest(path, method string, payload []byte) (*http.Response, error) { - req, err := buildRequest(path, method, *r.endpoint, payload) +func (r *MimirClient) doRequest(operation, path, method string, payload []byte) (*http.Response, error) { + req, err := buildRequest(operation, path, 
method, *r.endpoint, payload) if err != nil { return nil, err } @@ -171,7 +177,7 @@ func joinPath(baseURLPath, targetPath string) string { return strings.TrimSuffix(baseURLPath, "/") + targetPath } -func buildRequest(p, m string, endpoint url.URL, payload []byte) (*http.Request, error) { +func buildRequest(op, p, m string, endpoint url.URL, payload []byte) (*http.Request, error) { // parse path parameter again (as it already contains escaped path information pURL, err := url.Parse(p) if err != nil { @@ -183,5 +189,11 @@ func buildRequest(p, m string, endpoint url.URL, payload []byte) (*http.Request, endpoint.RawPath = joinPath(endpoint.EscapedPath(), pURL.EscapedPath()) } endpoint.Path = joinPath(endpoint.Path, pURL.Path) - return http.NewRequest(m, endpoint.String(), bytes.NewBuffer(payload)) + r, err := http.NewRequest(m, endpoint.String(), bytes.NewBuffer(payload)) + if err != nil { + return nil, err + } + r = r.WithContext(context.WithValue(r.Context(), weaveworksClient.OperationNameContextKey, op)) + + return r, nil } diff --git a/pkg/mimir/client/client_test.go b/pkg/mimir/client/client_test.go index 1313d22a4569..0777def46276 100644 --- a/pkg/mimir/client/client_test.go +++ b/pkg/mimir/client/client_test.go @@ -86,7 +86,7 @@ func TestBuildURL(t *testing.T) { url, err := url.Parse(tt.url) require.NoError(t, err) - req, err := buildRequest(tt.path, tt.method, *url, []byte{}) + req, err := buildRequest("op", tt.path, tt.method, *url, []byte{}) require.NoError(t, err) require.Equal(t, tt.resultURL, req.URL.String()) }) diff --git a/pkg/mimir/client/rules.go b/pkg/mimir/client/rules.go index 4eea951a7f2a..c82075b03876 100644 --- a/pkg/mimir/client/rules.go +++ b/pkg/mimir/client/rules.go @@ -30,8 +30,9 @@ func (r *MimirClient) CreateRuleGroup(ctx context.Context, namespace string, rg escapedNamespace := url.PathEscape(namespace) path := r.apiPath + "/" + escapedNamespace + op := r.apiPath + "/" + "" - res, err := r.doRequest(path, "POST", payload) + res, err := 
r.doRequest(op, path, "POST", payload) if err != nil { return err } @@ -46,8 +47,9 @@ func (r *MimirClient) DeleteRuleGroup(ctx context.Context, namespace, groupName escapedNamespace := url.PathEscape(namespace) escapedGroupName := url.PathEscape(groupName) path := r.apiPath + "/" + escapedNamespace + "/" + escapedGroupName + op := r.apiPath + "/" + "" + "/" + "" - res, err := r.doRequest(path, "DELETE", nil) + res, err := r.doRequest(op, path, "DELETE", nil) if err != nil { return err } @@ -60,11 +62,13 @@ func (r *MimirClient) DeleteRuleGroup(ctx context.Context, namespace, groupName // ListRules retrieves a rule group func (r *MimirClient) ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) { path := r.apiPath + op := r.apiPath if namespace != "" { path = path + "/" + namespace + op = op + "/" + "" } - res, err := r.doRequest(path, "GET", nil) + res, err := r.doRequest(op, path, "GET", nil) if err != nil { return nil, err } diff --git a/pkg/mimir/client/rules_test.go b/pkg/mimir/client/rules_test.go index 606dda0d8882..e8ff4677999c 100644 --- a/pkg/mimir/client/rules_test.go +++ b/pkg/mimir/client/rules_test.go @@ -8,7 +8,9 @@ import ( "testing" "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" + "github.com/weaveworks/common/instrument" ) func TestMimirClient_X(t *testing.T) { @@ -24,7 +26,7 @@ func TestMimirClient_X(t *testing.T) { Address: ts.URL, ID: "my-id", Key: "my-key", - }) + }, prometheus.NewHistogramVec(prometheus.HistogramOpts{}, instrument.HistogramCollectorBuckets)) require.NoError(t, err) for _, tc := range []struct { From bfa32b2063e2ca33a98fdab1d68e41cc5c579575 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 9 Dec 2022 16:18:43 -0600 Subject: [PATCH 27/40] Address PR feedback - Use snake case in river tags - Export minimum api from packages - Re-use existing config type for mimir client - Implement component health - Set defaults in Unmarshal 
method - Remove exports type - Use river blocks where appropriate - Rename to mimir.rules.kubernetes - Initialize with config during construction Co-authored-by: Robert Fratto --- component/mimir/rules/debug.go | 8 +-- component/mimir/rules/diff.go | 32 +++++----- component/mimir/rules/diff_test.go | 28 ++++----- component/mimir/rules/events.go | 71 +++++++++++++---------- component/mimir/rules/events_test.go | 5 +- component/mimir/rules/health.go | 32 ++++++++++ component/mimir/rules/rules.go | 87 +++++++++++++--------------- component/mimir/rules/rules_test.go | 2 +- component/mimir/rules/types.go | 58 +++++++++---------- pkg/mimir/client/client.go | 86 +++++++-------------------- pkg/mimir/client/rules_test.go | 2 - 11 files changed, 199 insertions(+), 212 deletions(-) create mode 100644 component/mimir/rules/health.go diff --git a/component/mimir/rules/debug.go b/component/mimir/rules/debug.go index 29cfcddc7c28..59b5103858dc 100644 --- a/component/mimir/rules/debug.go +++ b/component/mimir/rules/debug.go @@ -4,20 +4,20 @@ import "fmt" type DebugInfo struct { Error string `river:"error,attr,optional"` - PrometheusRules []DebugK8sPrometheusRule `river:"prometheusRules,attr,optional"` - MimirRuleNamespaces []DebugMimirNamespace `river:"mimirRuleNamespaces,attr,optional"` + PrometheusRules []DebugK8sPrometheusRule `river:"prometheus_rule,block,optional"` + MimirRuleNamespaces []DebugMimirNamespace `river:"mimir_rule_namespace,block,optional"` } type DebugK8sPrometheusRule struct { Namespace string `river:"namespace,attr"` Name string `river:"name,attr"` UID string `river:"uid,attr"` - NumRuleGroups int `river:"numRuleGroups,attr"` + NumRuleGroups int `river:"num_rule_groups,attr"` } type DebugMimirNamespace struct { Name string `river:"name,attr"` - NumRuleGroups int `river:"numRuleGroups,attr"` + NumRuleGroups int `river:"num_rule_groups,attr"` } func (c *Component) DebugInfo() interface{} { diff --git a/component/mimir/rules/diff.go 
b/component/mimir/rules/diff.go index 9739b582ce73..1a29233f221e 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -8,24 +8,24 @@ import ( "gopkg.in/yaml.v3" ) -type RuleGroupDiffKind string +type ruleGroupDiffKind string const ( - RuleGroupDiffKindAdd RuleGroupDiffKind = "add" - RuleGroupDiffKindRemove RuleGroupDiffKind = "remove" - RuleGroupDiffKindUpdate RuleGroupDiffKind = "update" + ruleGroupDiffKindAdd ruleGroupDiffKind = "add" + ruleGroupDiffKindRemove ruleGroupDiffKind = "remove" + ruleGroupDiffKindUpdate ruleGroupDiffKind = "update" ) -type RuleGroupDiff struct { - Kind RuleGroupDiffKind +type ruleGroupDiff struct { + Kind ruleGroupDiffKind Actual mimirClient.RuleGroup Desired mimirClient.RuleGroup } -func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string][]mimirClient.RuleGroup) (map[string][]RuleGroupDiff, error) { +func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string][]mimirClient.RuleGroup) (map[string][]ruleGroupDiff, error) { seen := map[string]bool{} - diff := make(map[string][]RuleGroupDiff) + diff := make(map[string][]ruleGroupDiff) for namespace, desiredRuleGroups := range desired { seen[namespace] = true @@ -59,8 +59,8 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string return diff, nil } -func diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) ([]RuleGroupDiff, error) { - var diff []RuleGroupDiff +func diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) ([]ruleGroupDiff, error) { + var diff []ruleGroupDiff seenGroups := map[string]bool{} @@ -74,8 +74,8 @@ desiredGroups: continue desiredGroups } - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindUpdate, + diff = append(diff, ruleGroupDiff{ + Kind: ruleGroupDiffKindUpdate, Actual: actualRuleGroup, Desired: desiredRuleGroup, }) @@ -83,8 +83,8 @@ desiredGroups: } } - diff = append(diff, 
RuleGroupDiff{ - Kind: RuleGroupDiffKindAdd, + diff = append(diff, ruleGroupDiff{ + Kind: ruleGroupDiffKindAdd, Desired: desiredRuleGroup, }) } @@ -94,8 +94,8 @@ desiredGroups: continue } - diff = append(diff, RuleGroupDiff{ - Kind: RuleGroupDiffKindRemove, + diff = append(diff, ruleGroupDiff{ + Kind: ruleGroupDiffKindRemove, Actual: actualRuleGroup, }) } diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/diff_test.go index 9cab0daac863..54da1a4bd8a0 100644 --- a/component/mimir/rules/diff_test.go +++ b/component/mimir/rules/diff_test.go @@ -47,7 +47,7 @@ groups: name string desired map[string][]mimirClient.RuleGroup actual map[string][]mimirClient.RuleGroup - expected map[string][]RuleGroupDiff + expected map[string][]ruleGroupDiff } testCases := []testCase{ @@ -55,7 +55,7 @@ groups: name: "empty sets", desired: map[string][]mimirClient.RuleGroup{}, actual: map[string][]mimirClient.RuleGroup{}, - expected: map[string][]RuleGroupDiff{}, + expected: map[string][]ruleGroupDiff{}, }, { name: "add rule group", @@ -63,10 +63,10 @@ groups: managedNamespace: ruleGroupsA, }, actual: map[string][]mimirClient.RuleGroup{}, - expected: map[string][]RuleGroupDiff{ + expected: map[string][]ruleGroupDiff{ managedNamespace: { { - Kind: RuleGroupDiffKindAdd, + Kind: ruleGroupDiffKindAdd, Desired: ruleGroupsA[0], }, }, @@ -78,10 +78,10 @@ groups: actual: map[string][]mimirClient.RuleGroup{ managedNamespace: ruleGroupsA, }, - expected: map[string][]RuleGroupDiff{ + expected: map[string][]ruleGroupDiff{ managedNamespace: { { - Kind: RuleGroupDiffKindRemove, + Kind: ruleGroupDiffKindRemove, Actual: ruleGroupsA[0], }, }, @@ -95,10 +95,10 @@ groups: actual: map[string][]mimirClient.RuleGroup{ managedNamespace: ruleGroupsAModified, }, - expected: map[string][]RuleGroupDiff{ + expected: map[string][]ruleGroupDiff{ managedNamespace: { { - Kind: RuleGroupDiffKindUpdate, + Kind: ruleGroupDiffKindUpdate, Desired: ruleGroupsA[0], Actual: ruleGroupsAModified[0], }, @@ -113,7 
+113,7 @@ groups: actual: map[string][]mimirClient.RuleGroup{ managedNamespace: ruleGroupsA, }, - expected: map[string][]RuleGroupDiff{}, + expected: map[string][]ruleGroupDiff{}, }, } @@ -126,16 +126,16 @@ groups: } } -func requireEqualRuleDiffs(t *testing.T, expected, actual map[string][]RuleGroupDiff) { +func requireEqualRuleDiffs(t *testing.T, expected, actual map[string][]ruleGroupDiff) { require.Equal(t, len(expected), len(actual)) - var summarizeDiff = func(diff RuleGroupDiff) string { + var summarizeDiff = func(diff ruleGroupDiff) string { switch diff.Kind { - case RuleGroupDiffKindAdd: + case ruleGroupDiffKindAdd: return fmt.Sprintf("add: %s", diff.Desired.Name) - case RuleGroupDiffKindRemove: + case ruleGroupDiffKindRemove: return fmt.Sprintf("remove: %s", diff.Actual.Name) - case RuleGroupDiffKindUpdate: + case ruleGroupDiffKindUpdate: return fmt.Sprintf("update: %s", diff.Desired.Name) } panic("unreachable") diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 876f4acefaa8..20ad767577b5 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -17,26 +17,29 @@ import ( // This type must be hashable, so it is kept simple. The indexer will maintain a // cache of current state, so this is mostly used for logging. -type Event struct { - Type EventType - ObjectKey string +type event struct { + typ eventType + objectKey string } -type EventType string +type eventType string const ( - EventTypeResourceChanged EventType = "resource-changed" - EventTypeSyncMimir EventType = "sync-mimir" + eventTypeResourceChanged eventType = "resource-changed" + eventTypeSyncMimir eventType = "sync-mimir" ) +// OnAdd implements the cache.ResourceEventHandler interface. func (c *Component) OnAdd(obj interface{}) { c.publishEvent(obj) } +// OnUpdate implements the cache.ResourceEventHandler interface. 
func (c *Component) OnUpdate(oldObj, newObj interface{}) { c.publishEvent(newObj) } +// OnDelete implements the cache.ResourceEventHandler interface. func (c *Component) OnDelete(obj interface{}) { c.publishEvent(obj) } @@ -48,29 +51,29 @@ func (c *Component) publishEvent(obj interface{}) { return } - c.queue.AddRateLimited(Event{ - Type: EventTypeResourceChanged, - ObjectKey: key, + c.queue.AddRateLimited(event{ + typ: eventTypeResourceChanged, + objectKey: key, }) } func (c *Component) eventLoop(ctx context.Context) { for { - event, shutdown := c.queue.Get() + eventInterface, shutdown := c.queue.Get() if shutdown { level.Info(c.log).Log("msg", "shutting down event loop") return } - evt := event.(Event) - c.metrics.eventsTotal.WithLabelValues(string(evt.Type)).Inc() + evt := eventInterface.(event) + c.metrics.eventsTotal.WithLabelValues(string(evt.typ)).Inc() err := c.processEvent(ctx, evt) if err != nil { - retries := c.queue.NumRequeues(event) + retries := c.queue.NumRequeues(evt) if retries < 5 { - c.metrics.eventsRetried.WithLabelValues(string(evt.Type)).Inc() - c.queue.AddRateLimited(event) + c.metrics.eventsRetried.WithLabelValues(string(evt.typ)).Inc() + c.queue.AddRateLimited(evt) level.Error(c.log).Log( "msg", "failed to process event, will retry", "retries", fmt.Sprintf("%d/5", retries), @@ -78,40 +81,46 @@ func (c *Component) eventLoop(ctx context.Context) { ) continue } else { - c.metrics.eventsFailed.WithLabelValues(string(evt.Type)).Inc() + c.metrics.eventsFailed.WithLabelValues(string(evt.typ)).Inc() level.Error(c.log).Log( "msg", "failed to process event, max retries exceeded", "retries", fmt.Sprintf("%d/5", retries), "err", err, ) + c.reportUnhealthy(err) } + } else { + c.reportHealthy() } - c.queue.Forget(event) + c.queue.Forget(evt) } } -func (c *Component) processEvent(ctx context.Context, e Event) error { +func (c *Component) processEvent(ctx context.Context, e event) error { defer c.queue.Done(e) - switch e.Type { - case 
EventTypeResourceChanged: - level.Info(c.log).Log("msg", "processing event", "type", e.Type, "key", e.ObjectKey) - case EventTypeSyncMimir: + switch e.typ { + case eventTypeResourceChanged: + level.Info(c.log).Log("msg", "processing event", "type", e.typ, "key", e.objectKey) + case eventTypeSyncMimir: level.Debug(c.log).Log("msg", "syncing current state from ruler") - c.syncMimir(ctx) + err := c.syncMimir(ctx) + if err != nil { + return err + } default: - return fmt.Errorf("unknown event type: %s", e.Type) + return fmt.Errorf("unknown event type: %s", e.typ) } return c.reconcileState(ctx) } -func (c *Component) syncMimir(ctx context.Context) { +func (c *Component) syncMimir(ctx context.Context) error { rulesByNamespace, err := c.mimirClient.ListRules(ctx, "") if err != nil { level.Error(c.log).Log("msg", "failed to list rules from mimir", "err", err) - return + return err } for ns := range rulesByNamespace { @@ -122,7 +131,7 @@ func (c *Component) syncMimir(ctx context.Context) { c.currentState = rulesByNamespace - return + return nil } func (c *Component) reconcileState(ctx context.Context) error { @@ -198,26 +207,26 @@ func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) ([]mimirClien return mimirGroups, nil } -func (c *Component) applyChanges(ctx context.Context, namespace string, diffs []RuleGroupDiff) error { +func (c *Component) applyChanges(ctx context.Context, namespace string, diffs []ruleGroupDiff) error { if len(diffs) == 0 { return nil } for _, diff := range diffs { switch diff.Kind { - case RuleGroupDiffKindAdd: + case ruleGroupDiffKindAdd: err := c.mimirClient.CreateRuleGroup(ctx, namespace, diff.Desired) if err != nil { return err } level.Info(c.log).Log("msg", "added rule group", "namespace", namespace, "group", diff.Desired.Name) - case RuleGroupDiffKindRemove: + case ruleGroupDiffKindRemove: err := c.mimirClient.DeleteRuleGroup(ctx, namespace, diff.Actual.Name) if err != nil { return err } level.Info(c.log).Log("msg", "removed rule 
group", "namespace", namespace, "group", diff.Actual.Name) - case RuleGroupDiffKindUpdate: + case ruleGroupDiffKindUpdate: err := c.mimirClient.CreateRuleGroup(ctx, namespace, diff.Desired) if err != nil { return err diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go index cb7bef7d0318..40857cd5ecef 100644 --- a/component/mimir/rules/events_test.go +++ b/component/mimir/rules/events_test.go @@ -117,6 +117,7 @@ func TestEventLoop(t *testing.T) { ruleSelector: labels.Everything(), mimirClient: newFakeMimirClient(), args: Arguments{MimirNameSpacePrefix: "agent"}, + metrics: newMetrics(), } ctx, cancel := context.WithCancel(context.Background()) @@ -133,7 +134,7 @@ func TestEventLoop(t *testing.T) { require.Eventually(t, func() bool { return len(handler.currentState) == 1 }, time.Second, 10*time.Millisecond) - handler.queue.AddRateLimited(Event{Type: EventTypeSyncMimir}) + handler.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) // Update the rule in kubernetes rule.Spec.Groups[0].Rules = append(rule.Spec.Groups[0].Rules, v1.Rule{ @@ -148,7 +149,7 @@ func TestEventLoop(t *testing.T) { rules := handler.currentState[mimirNamespaceForRuleCRD("agent", rule)][0].Rules return len(rules) == 2 }, time.Second, 10*time.Millisecond) - handler.queue.AddRateLimited(Event{Type: EventTypeSyncMimir}) + handler.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) // Remove the rule from kubernetes ruleIndexer.Delete(rule) diff --git a/component/mimir/rules/health.go b/component/mimir/rules/health.go new file mode 100644 index 000000000000..b48ffb4a547f --- /dev/null +++ b/component/mimir/rules/health.go @@ -0,0 +1,32 @@ +package rules + +import ( + "time" + + "github.com/grafana/agent/component" +) + +func (c *Component) reportUnhealthy(err error) { + c.healthMut.Lock() + defer c.healthMut.Unlock() + c.health = component.Health{ + Health: component.HealthTypeUnhealthy, + Message: err.Error(), + UpdateTime: time.Now(), + } +} + +func (c 
*Component) reportHealthy() { + c.healthMut.Lock() + defer c.healthMut.Unlock() + c.health = component.Health{ + Health: component.HealthTypeHealthy, + UpdateTime: time.Now(), + } +} + +func (c *Component) CurrentHealth() component.Health { + c.healthMut.RLock() + defer c.healthMut.RUnlock() + return c.health +} diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 9ce68867ed5f..4df1a5dcd361 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -3,13 +3,14 @@ package rules import ( "context" "fmt" + "sync" "time" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/agent/component" mimirClient "github.com/grafana/agent/pkg/mimir/client" - "github.com/grafana/dskit/crypto/tls" + "github.com/pkg/errors" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/prometheus/client_golang/prometheus" "github.com/weaveworks/common/instrument" @@ -29,9 +30,9 @@ import ( func init() { component.Register(component.Registration{ - Name: "mimir.rules", + Name: "mimir.rules.kubernetes", Args: Arguments{}, - Exports: Exports{}, + Exports: nil, Build: func(o component.Options, c component.Arguments) (component.Component, error) { return NewComponent(o, c.(Arguments)) }, @@ -62,7 +63,9 @@ type Component struct { currentState map[string][]mimirClient.RuleGroup - metrics *metrics + metrics *metrics + healthMut sync.RWMutex + health component.Health } type metrics struct { @@ -124,29 +127,32 @@ type ConfigUpdate struct { var _ component.Component = (*Component)(nil) var _ component.DebugComponent = (*Component)(nil) +var _ component.HealthComponent = (*Component)(nil) -func NewComponent(o component.Options, c Arguments) (*Component, error) { - setDefaultArguments(&c) - +func NewComponent(o component.Options, args Arguments) (*Component, error) { metrics := newMetrics() metrics.Register(o.Registerer) - return &Component{ + c := &Component{ log: o.Logger, 
opts: o, - args: c, + args: args, configUpdates: make(chan ConfigUpdate), - ticker: time.NewTicker(c.SyncInterval), + ticker: time.NewTicker(args.SyncInterval), metrics: metrics, - }, nil -} + } -func (c *Component) Run(ctx context.Context) error { - err := c.startup(ctx) + err := c.init() if err != nil { - return err + return nil, errors.Wrap(err, "initializing component") } + return c, nil +} + +func (c *Component) Run(ctx context.Context) error { + c.startup(ctx) + for { select { case update := <-c.configUpdates: @@ -154,36 +160,34 @@ func (c *Component) Run(ctx context.Context) error { c.shutdown() c.args = update.args - err := c.startup(ctx) + err := c.init() update.err <- err if err != nil { - return err + level.Error(c.log).Log("msg", "updating configuration failed", "err", err) + c.reportUnhealthy(err) } + + c.startup(ctx) case <-ctx.Done(): c.shutdown() return nil case <-c.ticker.C: - c.queue.Add(Event{ - Type: EventTypeSyncMimir, + c.queue.Add(event{ + typ: eventTypeSyncMimir, }) } } } -func (c *Component) startup(ctx context.Context) error { - err := c.init() - if err != nil { - return err - } - - c.queue = workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "mimir.rules") +// startup launches the informers and starts the event loop. +func (c *Component) startup(ctx context.Context) { + c.queue = workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "mimir.rules.kubernetes") c.informerStopChan = make(chan struct{}) c.startNamespaceInformer() c.startRuleInformer() c.syncMimir(ctx) go c.eventLoop(ctx) - return nil } func (c *Component) shutdown() { @@ -203,12 +207,12 @@ func (c *Component) Update(newConfig component.Arguments) error { func (c *Component) init() error { level.Info(c.log).Log("msg", "initializing with new configuration") - setDefaultArguments(&c.args) - // TODO: allow overriding some stuff in RestConfig and k8s client options? 
- restConfig := controller.GetConfigOrDie() + restConfig, err := controller.GetConfig() + if err != nil { + return fmt.Errorf("failed to get k8s config: %w", err) + } - var err error c.k8sClient, err = kubernetes.NewForConfig(restConfig) if err != nil { return fmt.Errorf("failed to create k8s client: %w", err) @@ -219,22 +223,13 @@ func (c *Component) init() error { return fmt.Errorf("failed to create prometheus operator client: %w", err) } + httpClient := c.args.HTTPClientConfig.Convert() + c.mimirClient, err = mimirClient.New(c.log, mimirClient.Config{ - User: c.args.ClientParams.User, - Key: string(c.args.ClientParams.Key), - Address: c.args.ClientParams.Address, - ID: c.args.ClientParams.ID, - TLS: tls.ClientConfig{ - CertPath: c.args.ClientParams.TLS.CertPath, - KeyPath: c.args.ClientParams.TLS.KeyPath, - CAPath: c.args.ClientParams.TLS.CAPath, - ServerName: c.args.ClientParams.TLS.ServerName, - InsecureSkipVerify: c.args.ClientParams.TLS.InsecureSkipVerify, - CipherSuites: c.args.ClientParams.TLS.CipherSuites, - MinVersion: c.args.ClientParams.TLS.MinVersion, - }, - UseLegacyRoutes: c.args.ClientParams.UseLegacyRoutes, - AuthToken: string(c.args.ClientParams.AuthToken), + ID: c.args.TenantID, + Address: c.args.Address, + UseLegacyRoutes: c.args.UseLegacyRoutes, + HTTPClientConfig: *httpClient, }, c.metrics.mimirClientTiming) if err != nil { return err diff --git a/component/mimir/rules/rules_test.go b/component/mimir/rules/rules_test.go index f2e41821942e..6193228ccbc1 100644 --- a/component/mimir/rules/rules_test.go +++ b/component/mimir/rules/rules_test.go @@ -9,5 +9,5 @@ import ( func TestEventTypeIsHashable(t *testing.T) { // This test is here to ensure that the EventType type is hashable according to the workqueue implementation queue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) - queue.AddRateLimited(Event{}) + queue.AddRateLimited(event{}) } diff --git a/component/mimir/rules/types.go b/component/mimir/rules/types.go 
index b2d30149dbce..8ea8adae4d87 100644 --- a/component/mimir/rules/types.go +++ b/component/mimir/rules/types.go @@ -1,33 +1,50 @@ package rules import ( + "fmt" "time" - "github.com/grafana/agent/pkg/flow/rivertypes" + "github.com/grafana/agent/component/common/config" ) type Arguments struct { - ClientParams ClientArguments `river:"client,block"` - SyncInterval time.Duration `river:"sync_interval,attr,optional"` - MimirNameSpacePrefix string `river:"mimir_namespace_prefix,attr,optional"` + Address string `river:"address,attr"` + TenantID string `river:"tenant_id,attr"` + UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` + HTTPClientConfig config.HTTPClientConfig `river:"client,block,optional"` + SyncInterval time.Duration `river:"sync_interval,attr,optional"` + MimirNameSpacePrefix string `river:"mimir_namespace_prefix,attr,optional"` RuleSelector LabelSelector `river:"rule_selector,block,optional"` RuleNamespaceSelector LabelSelector `river:"rule_namespace_selector,block,optional"` } -func setDefaultArguments(args *Arguments) { - if args.SyncInterval == 0 { - args.SyncInterval = 30 * time.Second +var DefaultArguments = Arguments{ + SyncInterval: 30 * time.Second, + MimirNameSpacePrefix: "agent", +} + +func (args *Arguments) UnmarshalRiver(f func(interface{}) error) error { + *args = DefaultArguments + + type arguments Arguments + if err := f((*arguments)(args)); err != nil { + return err } + if args.SyncInterval <= 0 { + return fmt.Errorf("sync_interval must be greater than 0") + } if args.MimirNameSpacePrefix == "" { - args.MimirNameSpacePrefix = "agent" + return fmt.Errorf("mimir_namespace_prefix must not be empty") } + + return nil } type LabelSelector struct { MatchLabels map[string]string `river:"match_labels,attr,optional"` - MatchExpressions []MatchExpression `river:"match_expressions,attr,optional"` + MatchExpressions []MatchExpression `river:"match_expression,block,optional"` } type MatchExpression struct { @@ -35,26 +52,3 @@ type 
MatchExpression struct { Operator string `river:"operator,attr"` Values []string `river:"values,attr"` } - -type ClientArguments struct { - User string `river:"user,attr,optional"` - Key rivertypes.Secret `river:"key,attr,optional"` - Address string `river:"address,attr"` - ID string `river:"id,attr,optional"` - TLS TLSArguments `river:"tls,block,optional"` - UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` - AuthToken rivertypes.Secret `river:"auth_token,attr,optional"` -} - -type TLSArguments struct { - CertPath string `river:"tls_cert_path,attr,optional"` - KeyPath string `river:"tls_key_path,attr,optional"` - CAPath string `river:"tls_ca_path,attr,optional"` - ServerName string `river:"tls_server_name,attr,optional"` - InsecureSkipVerify bool `river:"tls_insecure_skip_verify,attr,optional"` - CipherSuites string `river:"tls_cipher_suites,attr,optional"` - MinVersion string `river:"tls_min_version,attr,optional"` -} - -type Exports struct { -} diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index 89fdece2474a..6e430004cc37 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -11,12 +11,12 @@ import ( "strings" log "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/grafana/dskit/crypto/tls" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/config" weaveworksClient "github.com/weaveworks/common/http/client" "github.com/weaveworks/common/instrument" + "github.com/weaveworks/common/user" ) const ( @@ -31,13 +31,10 @@ var ( // Config is used to configure a MimirClient. 
type Config struct { - User string `yaml:"user"` - Key string `yaml:"key"` - Address string `yaml:"address"` - ID string `yaml:"id"` - TLS tls.ClientConfig - UseLegacyRoutes bool `yaml:"use_legacy_routes"` - AuthToken string `yaml:"auth_token"` + ID string + Address string + UseLegacyRoutes bool + HTTPClientConfig config.HTTPClientConfig } type Interface interface { @@ -48,14 +45,12 @@ type Interface interface { // MimirClient is a client to the Mimir API. type MimirClient struct { - user string - key string - id string - endpoint *url.URL - Client weaveworksClient.Requester - apiPath string - authToken string - logger log.Logger + id string + + endpoint *url.URL + client weaveworksClient.Requester + apiPath string + logger log.Logger } // New returns a new MimirClient. @@ -64,30 +59,9 @@ func New(logger log.Logger, cfg Config, timingHistogram *prometheus.HistogramVec if err != nil { return nil, err } - - level.Debug(logger).Log("msg", "New Mimir client created", "address", cfg.Address, "id", cfg.ID) - - client := &http.Client{} - - // Setup TLS client - tlsConfig, err := cfg.TLS.GetTLSConfig() + client, err := config.NewClientFromConfig(cfg.HTTPClientConfig, "GrafanaAgent", config.WithHTTP2Disabled()) if err != nil { - level.Error(logger).Log( - "msg", "error loading TLS files", - "tls-ca", cfg.TLS.CAPath, - "tls-cert", cfg.TLS.CertPath, - "tls-key", cfg.TLS.KeyPath, - "err", err, - ) - return nil, fmt.Errorf("Mimir client initialization unsuccessful") - } - - if tlsConfig != nil { - transport := &http.Transport{ - Proxy: http.ProxyFromEnvironment, - TLSClientConfig: tlsConfig, - } - client = &http.Client{Transport: transport} + return nil, err } path := rulerAPIPath @@ -99,14 +73,11 @@ func New(logger log.Logger, cfg Config, timingHistogram *prometheus.HistogramVec timedClient := weaveworksClient.NewTimedClient(client, collector) return &MimirClient{ - user: cfg.User, - key: cfg.Key, - id: cfg.ID, - endpoint: endpoint, - Client: timedClient, - apiPath: path, - 
authToken: cfg.AuthToken, - logger: logger, + id: cfg.ID, + endpoint: endpoint, + client: timedClient, + apiPath: path, + logger: logger, }, nil } @@ -116,24 +87,11 @@ func (r *MimirClient) doRequest(operation, path, method string, payload []byte) return nil, err } - if (r.user != "" || r.key != "") && r.authToken != "" { - err := errors.New("atmost one of basic auth or auth token should be configured") - return nil, err - } - - if r.user != "" { - req.SetBasicAuth(r.user, r.key) - } else if r.key != "" { - req.SetBasicAuth(r.id, r.key) + if r.id != "" { + req.Header.Add(user.OrgIDHeaderName, r.id) } - if r.authToken != "" { - req.Header.Add("Authorization", "Bearer "+r.authToken) - } - - req.Header.Add("X-Scope-OrgID", r.id) - - resp, err := r.Client.Do(req) + resp, err := r.client.Do(req) if err != nil { return nil, err } diff --git a/pkg/mimir/client/rules_test.go b/pkg/mimir/client/rules_test.go index e8ff4677999c..fb22b15a7f84 100644 --- a/pkg/mimir/client/rules_test.go +++ b/pkg/mimir/client/rules_test.go @@ -24,8 +24,6 @@ func TestMimirClient_X(t *testing.T) { client, err := New(log.NewNopLogger(), Config{ Address: ts.URL, - ID: "my-id", - Key: "my-key", }, prometheus.NewHistogramVec(prometheus.HistogramOpts{}, instrument.HistogramCollectorBuckets)) require.NoError(t, err) From ff233b50d08b72efbb6d011c2201e405f9e39c51 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 9 Dec 2022 17:04:12 -0600 Subject: [PATCH 28/40] Fix linter --- component/mimir/rules/diff.go | 14 ++++------- component/mimir/rules/diff_test.go | 1 - component/mimir/rules/events.go | 9 ++++---- component/mimir/rules/rules.go | 37 ++++++++++++++++++++++-------- pkg/mimir/client/client.go | 4 ++-- pkg/mimir/client/client_test.go | 1 - pkg/mimir/client/rules_test.go | 1 - 7 files changed, 39 insertions(+), 28 deletions(-) diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index 1a29233f221e..3d8f30e3897c 100644 --- a/component/mimir/rules/diff.go +++ 
b/component/mimir/rules/diff.go @@ -31,10 +31,7 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string seen[namespace] = true actualRuleGroups := actual[namespace] - subDiff, err := diffRuleNamespaceState(desiredRuleGroups, actualRuleGroups) - if err != nil { - return nil, err - } + subDiff := diffRuleNamespaceState(desiredRuleGroups, actualRuleGroups) if len(subDiff) == 0 { continue @@ -48,10 +45,7 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string continue } - subDiff, err := diffRuleNamespaceState(nil, actualRuleGroups) - if err != nil { - return nil, err - } + subDiff := diffRuleNamespaceState(nil, actualRuleGroups) diff[namespace] = subDiff } @@ -59,7 +53,7 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string return diff, nil } -func diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) ([]ruleGroupDiff, error) { +func diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) []ruleGroupDiff { var diff []ruleGroupDiff seenGroups := map[string]bool{} @@ -100,7 +94,7 @@ desiredGroups: }) } - return diff, nil + return diff } func equalRuleGroups(a, b mimirClient.RuleGroup) bool { diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/diff_test.go index 54da1a4bd8a0..517e0bb2a7da 100644 --- a/component/mimir/rules/diff_test.go +++ b/component/mimir/rules/diff_test.go @@ -158,7 +158,6 @@ func requireEqualRuleDiffs(t *testing.T, expected, actual map[string][]ruleGroup t.Logf("actual diff: %s", summarizeDiff(actualDiff)) t.Fail() } - } } } diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 20ad767577b5..51a9678e2c8a 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -139,6 +139,9 @@ func (c *Component) reconcileState(ctx context.Context) error { defer cancel() desiredState, err := c.loadStateFromK8s() + if err != nil { + return 
err + } diffs, err := diffRuleState(desiredState, c.currentState) if err != nil { @@ -238,16 +241,14 @@ func (c *Component) applyChanges(ctx context.Context, namespace string, diffs [] } // resync mimir state after applying changes - c.syncMimir(ctx) - - return nil + return c.syncMimir(ctx) } // mimirNamespaceForRuleCRD returns the namespace that the rule CRD should be // stored in mimir. This function, along with isManagedNamespace, is used to // determine if a rule CRD is managed by the agent. func mimirNamespaceForRuleCRD(prefix string, pr *promv1.PrometheusRule) string { - return fmt.Sprintf("agent/%s/%s/%s", pr.Namespace, pr.Name, pr.UID) + return fmt.Sprintf("%s/%s/%s/%s", prefix, pr.Namespace, pr.Name, pr.UID) } // isManagedMimirNamespace returns true if the namespace is managed by the agent. diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 4df1a5dcd361..63544a73b390 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -10,7 +10,6 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/agent/component" mimirClient "github.com/grafana/agent/pkg/mimir/client" - "github.com/pkg/errors" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/prometheus/client_golang/prometheus" "github.com/weaveworks/common/instrument" @@ -131,7 +130,10 @@ var _ component.HealthComponent = (*Component)(nil) func NewComponent(o component.Options, args Arguments) (*Component, error) { metrics := newMetrics() - metrics.Register(o.Registerer) + err := metrics.Register(o.Registerer) + if err != nil { + return nil, fmt.Errorf("registering metrics failed: %w", err) + } c := &Component{ log: o.Logger, @@ -142,16 +144,20 @@ func NewComponent(o component.Options, args Arguments) (*Component, error) { metrics: metrics, } - err := c.init() + err = c.init() if err != nil { - return nil, errors.Wrap(err, "initializing component") + return nil, fmt.Errorf("initializing 
component failed: %w", err) } return c, nil } func (c *Component) Run(ctx context.Context) error { - c.startup(ctx) + err := c.startup(ctx) + if err != nil { + level.Error(c.log).Log("msg", "starting up component failed", "err", err) + c.reportUnhealthy(err) + } for { select { @@ -161,13 +167,22 @@ func (c *Component) Run(ctx context.Context) error { c.args = update.args err := c.init() - update.err <- err if err != nil { level.Error(c.log).Log("msg", "updating configuration failed", "err", err) c.reportUnhealthy(err) + update.err <- err + continue } - c.startup(ctx) + err = c.startup(ctx) + if err != nil { + level.Error(c.log).Log("msg", "updating configuration failed", "err", err) + c.reportUnhealthy(err) + update.err <- err + continue + } + + update.err <- nil case <-ctx.Done(): c.shutdown() return nil @@ -180,14 +195,18 @@ func (c *Component) Run(ctx context.Context) error { } // startup launches the informers and starts the event loop. -func (c *Component) startup(ctx context.Context) { +func (c *Component) startup(ctx context.Context) error { c.queue = workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "mimir.rules.kubernetes") c.informerStopChan = make(chan struct{}) c.startNamespaceInformer() c.startRuleInformer() - c.syncMimir(ctx) + err := c.syncMimir(ctx) + if err != nil { + return err + } go c.eventLoop(ctx) + return nil } func (c *Component) shutdown() { diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index 6e430004cc37..b7b2ce63466d 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "context" + "errors" "fmt" "io" "net/http" @@ -11,7 +12,6 @@ import ( "strings" log "github.com/go-kit/log" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/config" weaveworksClient "github.com/weaveworks/common/http/client" @@ -98,7 +98,7 @@ func (r *MimirClient) doRequest(operation, path, method string, 
payload []byte) if err := checkResponse(resp); err != nil { _ = resp.Body.Close() - return nil, errors.Wrapf(err, "%s request to %s failed", req.Method, req.URL.String()) + return nil, fmt.Errorf("error %s %s: %w", method, path, err) } return resp, nil diff --git a/pkg/mimir/client/client_test.go b/pkg/mimir/client/client_test.go index 0777def46276..5ef8a373519b 100644 --- a/pkg/mimir/client/client_test.go +++ b/pkg/mimir/client/client_test.go @@ -91,5 +91,4 @@ func TestBuildURL(t *testing.T) { require.Equal(t, tt.resultURL, req.URL.String()) }) } - } diff --git a/pkg/mimir/client/rules_test.go b/pkg/mimir/client/rules_test.go index fb22b15a7f84..4bd8f6ddf369 100644 --- a/pkg/mimir/client/rules_test.go +++ b/pkg/mimir/client/rules_test.go @@ -72,5 +72,4 @@ func TestMimirClient_X(t *testing.T) { require.Equal(t, tc.expURLPath, req.URL.EscapedPath()) }) } - } From f7cddda2acb6ac77d62e54e648cd630eb2f44514 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 9 Dec 2022 17:06:56 -0600 Subject: [PATCH 29/40] Fix linter --- component/mimir/rules/diff.go | 4 ++-- component/mimir/rules/diff_test.go | 3 +-- component/mimir/rules/events.go | 6 +----- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index 3d8f30e3897c..a7d7ed53e6fa 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -22,7 +22,7 @@ type ruleGroupDiff struct { Desired mimirClient.RuleGroup } -func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string][]mimirClient.RuleGroup) (map[string][]ruleGroupDiff, error) { +func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string][]mimirClient.RuleGroup) map[string][]ruleGroupDiff { seen := map[string]bool{} diff := make(map[string][]ruleGroupDiff) @@ -50,7 +50,7 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string diff[namespace] = subDiff } - return diff, nil + return diff } func 
diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) []ruleGroupDiff { diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/diff_test.go index 517e0bb2a7da..3da9c0faf9e7 100644 --- a/component/mimir/rules/diff_test.go +++ b/component/mimir/rules/diff_test.go @@ -119,8 +119,7 @@ groups: for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - actual, err := diffRuleState(tc.desired, tc.actual) - require.NoError(t, err) + actual := diffRuleState(tc.desired, tc.actual) requireEqualRuleDiffs(t, tc.expected, actual) }) } diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 51a9678e2c8a..d5461bb9a140 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -143,11 +143,7 @@ func (c *Component) reconcileState(ctx context.Context) error { return err } - diffs, err := diffRuleState(desiredState, c.currentState) - if err != nil { - return err - } - + diffs := diffRuleState(desiredState, c.currentState) errs := multierror.New() for ns, diff := range diffs { err = c.applyChanges(ctx, ns, diff) From 2885fb599e675603b9d967d878ed07004dc98f3e Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 9 Dec 2022 17:28:03 -0600 Subject: [PATCH 30/40] Fix race condition in tests --- component/mimir/rules/events_test.go | 32 ++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go index 40857cd5ecef..72c73db4c158 100644 --- a/component/mimir/rules/events_test.go +++ b/component/mimir/rules/events_test.go @@ -3,6 +3,7 @@ package rules import ( "context" "os" + "sync" "testing" "time" @@ -22,7 +23,8 @@ import ( ) type fakeMimirClient struct { - rules map[string][]mimirClient.RuleGroup + rulesMut sync.RWMutex + rules map[string][]mimirClient.RuleGroup } var _ mimirClient.Interface = &fakeMimirClient{} @@ -34,12 +36,21 @@ func newFakeMimirClient() 
*fakeMimirClient { } func (m *fakeMimirClient) CreateRuleGroup(ctx context.Context, namespace string, rule mimirClient.RuleGroup) error { - m.DeleteRuleGroup(ctx, namespace, rule.Name) + m.rulesMut.Lock() + defer m.rulesMut.Unlock() + m.deleteLocked(namespace, rule.Name) m.rules[namespace] = append(m.rules[namespace], rule) return nil } func (m *fakeMimirClient) DeleteRuleGroup(ctx context.Context, namespace, group string) error { + m.rulesMut.Lock() + defer m.rulesMut.Unlock() + m.deleteLocked(namespace, group) + return nil +} + +func (m *fakeMimirClient) deleteLocked(namespace, group string) { for ns, v := range m.rules { for i, g := range v { if g.Name == group { @@ -49,14 +60,15 @@ func (m *fakeMimirClient) DeleteRuleGroup(ctx context.Context, namespace, group delete(m.rules, ns) } - return nil + return } } } - return nil } func (m *fakeMimirClient) ListRules(ctx context.Context, namespace string) (map[string][]mimirClient.RuleGroup, error) { + m.rulesMut.RLock() + defer m.rulesMut.RUnlock() output := make(map[string][]mimirClient.RuleGroup) for ns, v := range m.rules { if namespace != "" && namespace != ns { @@ -132,7 +144,9 @@ func TestEventLoop(t *testing.T) { // Wait for the rule to be added to mimir require.Eventually(t, func() bool { - return len(handler.currentState) == 1 + rules, err := handler.mimirClient.ListRules(ctx, "") + require.NoError(t, err) + return len(rules) == 1 }, time.Second, 10*time.Millisecond) handler.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) @@ -146,7 +160,9 @@ func TestEventLoop(t *testing.T) { // Wait for the rule to be updated in mimir require.Eventually(t, func() bool { - rules := handler.currentState[mimirNamespaceForRuleCRD("agent", rule)][0].Rules + allRules, err := handler.mimirClient.ListRules(ctx, "") + require.NoError(t, err) + rules := allRules[mimirNamespaceForRuleCRD("agent", rule)][0].Rules return len(rules) == 2 }, time.Second, 10*time.Millisecond) handler.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) 
@@ -157,6 +173,8 @@ func TestEventLoop(t *testing.T) { // Wait for the rule to be removed from mimir require.Eventually(t, func() bool { - return len(handler.currentState) == 0 + rules, err := handler.mimirClient.ListRules(ctx, "") + require.NoError(t, err) + return len(rules) == 0 }, time.Second, 10*time.Millisecond) } From 8669e6371e89dcdab910f9a23500c9c612a2d833 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 9 Dec 2022 17:30:13 -0600 Subject: [PATCH 31/40] Fix linter --- component/mimir/rules/events_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go index 72c73db4c158..062a850e3117 100644 --- a/component/mimir/rules/events_test.go +++ b/component/mimir/rules/events_test.go @@ -52,6 +52,9 @@ func (m *fakeMimirClient) DeleteRuleGroup(ctx context.Context, namespace, group func (m *fakeMimirClient) deleteLocked(namespace, group string) { for ns, v := range m.rules { + if namespace != "" && namespace != ns { + continue + } for i, g := range v { if g.Name == group { m.rules[ns] = append(m.rules[ns][:i], m.rules[ns][i+1:]...) 
From b602fc86e608381ee30d0c175784a3e92f55615f Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 4 Jan 2023 18:06:17 -0600 Subject: [PATCH 32/40] Add explanation of the different yaml packages used --- component/mimir/rules/diff.go | 2 +- component/mimir/rules/events.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index a7d7ed53e6fa..29a6fdb0930a 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -5,7 +5,7 @@ import ( mimirClient "github.com/grafana/agent/pkg/mimir/client" - "gopkg.in/yaml.v3" + "gopkg.in/yaml.v3" // Used for prometheus rulefmt compatibility instead of gopkg.in/yaml.v2 ) type ruleGroupDiffKind string diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index d5461bb9a140..29db76769807 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -6,7 +6,7 @@ import ( "regexp" "time" - "github.com/ghodss/yaml" + "github.com/ghodss/yaml" // Used for CRD compatibility instead of gopkg.in/yaml.v2 "github.com/go-kit/log/level" mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/grafana/dskit/multierror" From 5f8b25bfbbed7ed592a4c3ca452ec244aa4065a1 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Wed, 4 Jan 2023 18:12:47 -0600 Subject: [PATCH 33/40] Use hashicorp/go-multierror instead of dskit --- component/mimir/rules/events.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 29db76769807..4c8c9fedb1ca 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -9,7 +9,7 @@ import ( "github.com/ghodss/yaml" // Used for CRD compatibility instead of gopkg.in/yaml.v2 "github.com/go-kit/log/level" mimirClient "github.com/grafana/agent/pkg/mimir/client" - "github.com/grafana/dskit/multierror" + "github.com/hashicorp/go-multierror" promv1 
"github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" "k8s.io/client-go/tools/cache" @@ -144,16 +144,16 @@ func (c *Component) reconcileState(ctx context.Context) error { } diffs := diffRuleState(desiredState, c.currentState) - errs := multierror.New() + var result error for ns, diff := range diffs { err = c.applyChanges(ctx, ns, diff) if err != nil { - errs = append(errs, err) + result = multierror.Append(result, err) continue } } - return errs.Err() + return result } func (c *Component) loadStateFromK8s() (map[string][]mimirClient.RuleGroup, error) { @@ -192,7 +192,7 @@ func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) ([]mimirClien groups, errs := rulefmt.Parse(buf) if len(errs) > 0 { - return nil, multierror.New(errs...).Err() + return nil, multierror.Append(nil, errs...) } mimirGroups := make([]mimirClient.RuleGroup, len(groups.Groups)) From 1479d4faebe77b98dc8d3a2ad4444c497981f599 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Thu, 5 Jan 2023 13:31:48 -0600 Subject: [PATCH 34/40] Implement event handler interface separately - The component no longer implements the event handler interface --- component/mimir/rules/events.go | 22 ++++++++++++++++++---- component/mimir/rules/events_test.go | 21 +++++++++++---------- component/mimir/rules/rules.go | 4 ++-- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index 4c8c9fedb1ca..b1b3d0108898 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -7,12 +7,14 @@ import ( "time" "github.com/ghodss/yaml" // Used for CRD compatibility instead of gopkg.in/yaml.v2 + "github.com/go-kit/log" "github.com/go-kit/log/level" mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/hashicorp/go-multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" 
"github.com/prometheus/prometheus/model/rulefmt" "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" ) // This type must be hashable, so it is kept simple. The indexer will maintain a @@ -29,22 +31,34 @@ const ( eventTypeSyncMimir eventType = "sync-mimir" ) +type queuedEventHandler struct { + log log.Logger + queue workqueue.RateLimitingInterface +} + +func newQueuedEventHandler(log log.Logger, queue workqueue.RateLimitingInterface) *queuedEventHandler { + return &queuedEventHandler{ + log: log, + queue: queue, + } +} + // OnAdd implements the cache.ResourceEventHandler interface. -func (c *Component) OnAdd(obj interface{}) { +func (c *queuedEventHandler) OnAdd(obj interface{}) { c.publishEvent(obj) } // OnUpdate implements the cache.ResourceEventHandler interface. -func (c *Component) OnUpdate(oldObj, newObj interface{}) { +func (c *queuedEventHandler) OnUpdate(oldObj, newObj interface{}) { c.publishEvent(newObj) } // OnDelete implements the cache.ResourceEventHandler interface. 
-func (c *Component) OnDelete(obj interface{}) { +func (c *queuedEventHandler) OnDelete(obj interface{}) { c.publishEvent(obj) } -func (c *Component) publishEvent(obj interface{}) { +func (c *queuedEventHandler) publishEvent(obj interface{}) { key, err := cache.MetaNamespaceKeyFunc(obj) if err != nil { level.Error(c.log).Log("msg", "failed to get key for object", "err", err) diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go index 062a850e3117..fb61960b89b2 100644 --- a/component/mimir/rules/events_test.go +++ b/component/mimir/rules/events_test.go @@ -123,7 +123,7 @@ func TestEventLoop(t *testing.T) { }, } - handler := Component{ + component := Component{ log: log.NewLogfmtLogger(os.Stdout), queue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), namespaceLister: nsLister, @@ -134,24 +134,25 @@ func TestEventLoop(t *testing.T) { args: Arguments{MimirNameSpacePrefix: "agent"}, metrics: newMetrics(), } + eventHandler := newQueuedEventHandler(component.log, component.queue) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go handler.eventLoop(ctx) + go component.eventLoop(ctx) // Add a namespace and rule to kubernetes nsIndexer.Add(ns) ruleIndexer.Add(rule) - handler.OnAdd(rule) + eventHandler.OnAdd(rule) // Wait for the rule to be added to mimir require.Eventually(t, func() bool { - rules, err := handler.mimirClient.ListRules(ctx, "") + rules, err := component.mimirClient.ListRules(ctx, "") require.NoError(t, err) return len(rules) == 1 }, time.Second, 10*time.Millisecond) - handler.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) + component.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) // Update the rule in kubernetes rule.Spec.Groups[0].Rules = append(rule.Spec.Groups[0].Rules, v1.Rule{ @@ -159,24 +160,24 @@ func TestEventLoop(t *testing.T) { Expr: intstr.FromString("expr2"), }) ruleIndexer.Update(rule) - handler.OnUpdate(rule, rule) + eventHandler.OnUpdate(rule, 
rule) // Wait for the rule to be updated in mimir require.Eventually(t, func() bool { - allRules, err := handler.mimirClient.ListRules(ctx, "") + allRules, err := component.mimirClient.ListRules(ctx, "") require.NoError(t, err) rules := allRules[mimirNamespaceForRuleCRD("agent", rule)][0].Rules return len(rules) == 2 }, time.Second, 10*time.Millisecond) - handler.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) + component.queue.AddRateLimited(event{typ: eventTypeSyncMimir}) // Remove the rule from kubernetes ruleIndexer.Delete(rule) - handler.OnDelete(rule) + eventHandler.OnDelete(rule) // Wait for the rule to be removed from mimir require.Eventually(t, func() bool { - rules, err := handler.mimirClient.ListRules(ctx, "") + rules, err := component.mimirClient.ListRules(ctx, "") require.NoError(t, err) return len(rules) == 0 }, time.Second, 10*time.Millisecond) diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index 63544a73b390..c823c5d4154b 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -298,7 +298,7 @@ func (c *Component) startNamespaceInformer() { namespaces := factory.Core().V1().Namespaces() c.namespaceLister = namespaces.Lister() c.namespaceInformer = namespaces.Informer() - c.namespaceInformer.AddEventHandler(c) + c.namespaceInformer.AddEventHandler(newQueuedEventHandler(c.log, c.queue)) factory.Start(c.informerStopChan) factory.WaitForCacheSync(c.informerStopChan) @@ -316,7 +316,7 @@ func (c *Component) startRuleInformer() { promRules := factory.Monitoring().V1().PrometheusRules() c.ruleLister = promRules.Lister() c.ruleInformer = promRules.Informer() - c.ruleInformer.AddEventHandler(c) + c.ruleInformer.AddEventHandler(newQueuedEventHandler(c.log, c.queue)) factory.Start(c.informerStopChan) factory.WaitForCacheSync(c.informerStopChan) From 43db7ebc71fc2d47d55aef3c856af4cac15cd007 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Thu, 5 Jan 2023 13:38:19 -0600 Subject: [PATCH 35/40] 
Remove remote write config from mimir client - This is a GEM-only feature and rarely used. We can add it back in the future if needed. --- component/mimir/rules/diff.go | 13 +++++------ component/mimir/rules/diff_test.go | 33 ++++++++++++---------------- component/mimir/rules/events.go | 17 ++++---------- component/mimir/rules/events_test.go | 11 +++++----- component/mimir/rules/rules.go | 3 ++- pkg/mimir/client/client.go | 5 +++-- pkg/mimir/client/rules.go | 13 +++-------- 7 files changed, 38 insertions(+), 57 deletions(-) diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/diff.go index 29a6fdb0930a..aeaa01785636 100644 --- a/component/mimir/rules/diff.go +++ b/component/mimir/rules/diff.go @@ -3,8 +3,7 @@ package rules import ( "bytes" - mimirClient "github.com/grafana/agent/pkg/mimir/client" - + "github.com/prometheus/prometheus/model/rulefmt" "gopkg.in/yaml.v3" // Used for prometheus rulefmt compatibility instead of gopkg.in/yaml.v2 ) @@ -18,11 +17,11 @@ const ( type ruleGroupDiff struct { Kind ruleGroupDiffKind - Actual mimirClient.RuleGroup - Desired mimirClient.RuleGroup + Actual rulefmt.RuleGroup + Desired rulefmt.RuleGroup } -func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string][]mimirClient.RuleGroup) map[string][]ruleGroupDiff { +func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual map[string][]rulefmt.RuleGroup) map[string][]ruleGroupDiff { seen := map[string]bool{} diff := make(map[string][]ruleGroupDiff) @@ -53,7 +52,7 @@ func diffRuleState(desired map[string][]mimirClient.RuleGroup, actual map[string return diff } -func diffRuleNamespaceState(desired []mimirClient.RuleGroup, actual []mimirClient.RuleGroup) []ruleGroupDiff { +func diffRuleNamespaceState(desired []rulefmt.RuleGroup, actual []rulefmt.RuleGroup) []ruleGroupDiff { var diff []ruleGroupDiff seenGroups := map[string]bool{} @@ -97,7 +96,7 @@ desiredGroups: return diff } -func equalRuleGroups(a, b mimirClient.RuleGroup) bool { +func 
equalRuleGroups(a, b rulefmt.RuleGroup) bool { aBuf, err := yaml.Marshal(a) if err != nil { return false diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/diff_test.go index 3da9c0faf9e7..e52ae13288d7 100644 --- a/component/mimir/rules/diff_test.go +++ b/component/mimir/rules/diff_test.go @@ -4,22 +4,17 @@ import ( "fmt" "testing" - mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/prometheus/prometheus/model/rulefmt" "github.com/stretchr/testify/require" ) -func parseRuleGroups(t *testing.T, buf []byte) []mimirClient.RuleGroup { +func parseRuleGroups(t *testing.T, buf []byte) []rulefmt.RuleGroup { t.Helper() groups, errs := rulefmt.Parse(buf) require.Empty(t, errs) - var result []mimirClient.RuleGroup - for _, g := range groups.Groups { - result = append(result, mimirClient.RuleGroup{RuleGroup: g}) - } - return result + return groups.Groups } func TestDiffRuleState(t *testing.T) { @@ -45,24 +40,24 @@ groups: type testCase struct { name string - desired map[string][]mimirClient.RuleGroup - actual map[string][]mimirClient.RuleGroup + desired map[string][]rulefmt.RuleGroup + actual map[string][]rulefmt.RuleGroup expected map[string][]ruleGroupDiff } testCases := []testCase{ { name: "empty sets", - desired: map[string][]mimirClient.RuleGroup{}, - actual: map[string][]mimirClient.RuleGroup{}, + desired: map[string][]rulefmt.RuleGroup{}, + actual: map[string][]rulefmt.RuleGroup{}, expected: map[string][]ruleGroupDiff{}, }, { name: "add rule group", - desired: map[string][]mimirClient.RuleGroup{ + desired: map[string][]rulefmt.RuleGroup{ managedNamespace: ruleGroupsA, }, - actual: map[string][]mimirClient.RuleGroup{}, + actual: map[string][]rulefmt.RuleGroup{}, expected: map[string][]ruleGroupDiff{ managedNamespace: { { @@ -74,8 +69,8 @@ groups: }, { name: "remove rule group", - desired: map[string][]mimirClient.RuleGroup{}, - actual: map[string][]mimirClient.RuleGroup{ + desired: map[string][]rulefmt.RuleGroup{}, + actual: 
map[string][]rulefmt.RuleGroup{ managedNamespace: ruleGroupsA, }, expected: map[string][]ruleGroupDiff{ @@ -89,10 +84,10 @@ groups: }, { name: "update rule group", - desired: map[string][]mimirClient.RuleGroup{ + desired: map[string][]rulefmt.RuleGroup{ managedNamespace: ruleGroupsA, }, - actual: map[string][]mimirClient.RuleGroup{ + actual: map[string][]rulefmt.RuleGroup{ managedNamespace: ruleGroupsAModified, }, expected: map[string][]ruleGroupDiff{ @@ -107,10 +102,10 @@ groups: }, { name: "unchanged rule groups", - desired: map[string][]mimirClient.RuleGroup{ + desired: map[string][]rulefmt.RuleGroup{ managedNamespace: ruleGroupsA, }, - actual: map[string][]mimirClient.RuleGroup{ + actual: map[string][]rulefmt.RuleGroup{ managedNamespace: ruleGroupsA, }, expected: map[string][]ruleGroupDiff{}, diff --git a/component/mimir/rules/events.go b/component/mimir/rules/events.go index b1b3d0108898..a2361910814a 100644 --- a/component/mimir/rules/events.go +++ b/component/mimir/rules/events.go @@ -9,7 +9,6 @@ import ( "github.com/ghodss/yaml" // Used for CRD compatibility instead of gopkg.in/yaml.v2 "github.com/go-kit/log" "github.com/go-kit/log/level" - mimirClient "github.com/grafana/agent/pkg/mimir/client" "github.com/hashicorp/go-multierror" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/prometheus/model/rulefmt" @@ -170,13 +169,13 @@ func (c *Component) reconcileState(ctx context.Context) error { return result } -func (c *Component) loadStateFromK8s() (map[string][]mimirClient.RuleGroup, error) { +func (c *Component) loadStateFromK8s() (map[string][]rulefmt.RuleGroup, error) { matchedNamespaces, err := c.namespaceLister.List(c.namespaceSelector) if err != nil { return nil, fmt.Errorf("failed to list namespaces: %w", err) } - desiredState := map[string][]mimirClient.RuleGroup{} + desiredState := map[string][]rulefmt.RuleGroup{} for _, ns := range matchedNamespaces { crdState, err := 
c.ruleLister.PrometheusRules(ns.Name).List(c.ruleSelector) if err != nil { @@ -198,7 +197,7 @@ func (c *Component) loadStateFromK8s() (map[string][]mimirClient.RuleGroup, erro return desiredState, nil } -func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) ([]mimirClient.RuleGroup, error) { +func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) ([]rulefmt.RuleGroup, error) { buf, err := yaml.Marshal(crd) if err != nil { return nil, err @@ -209,15 +208,7 @@ func convertCRDRuleGroupToRuleGroup(crd promv1.PrometheusRuleSpec) ([]mimirClien return nil, multierror.Append(nil, errs...) } - mimirGroups := make([]mimirClient.RuleGroup, len(groups.Groups)) - for i, g := range groups.Groups { - mimirGroups[i] = mimirClient.RuleGroup{ - RuleGroup: g, - // TODO: allow setting remote write configs? - } - } - - return mimirGroups, nil + return groups.Groups, nil } func (c *Component) applyChanges(ctx context.Context, namespace string, diffs []ruleGroupDiff) error { diff --git a/component/mimir/rules/events_test.go b/component/mimir/rules/events_test.go index fb61960b89b2..a2d04213e77a 100644 --- a/component/mimir/rules/events_test.go +++ b/component/mimir/rules/events_test.go @@ -11,6 +11,7 @@ import ( mimirClient "github.com/grafana/agent/pkg/mimir/client" v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" + "github.com/prometheus/prometheus/model/rulefmt" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -24,18 +25,18 @@ import ( type fakeMimirClient struct { rulesMut sync.RWMutex - rules map[string][]mimirClient.RuleGroup + rules map[string][]rulefmt.RuleGroup } var _ mimirClient.Interface = &fakeMimirClient{} func newFakeMimirClient() *fakeMimirClient { return &fakeMimirClient{ - rules: make(map[string][]mimirClient.RuleGroup), + rules: 
make(map[string][]rulefmt.RuleGroup), } } -func (m *fakeMimirClient) CreateRuleGroup(ctx context.Context, namespace string, rule mimirClient.RuleGroup) error { +func (m *fakeMimirClient) CreateRuleGroup(ctx context.Context, namespace string, rule rulefmt.RuleGroup) error { m.rulesMut.Lock() defer m.rulesMut.Unlock() m.deleteLocked(namespace, rule.Name) @@ -69,10 +70,10 @@ func (m *fakeMimirClient) deleteLocked(namespace, group string) { } } -func (m *fakeMimirClient) ListRules(ctx context.Context, namespace string) (map[string][]mimirClient.RuleGroup, error) { +func (m *fakeMimirClient) ListRules(ctx context.Context, namespace string) (map[string][]rulefmt.RuleGroup, error) { m.rulesMut.RLock() defer m.rulesMut.RUnlock() - output := make(map[string][]mimirClient.RuleGroup) + output := make(map[string][]rulefmt.RuleGroup) for ns, v := range m.rules { if namespace != "" && namespace != ns { continue diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/rules.go index c823c5d4154b..15f809de0743 100644 --- a/component/mimir/rules/rules.go +++ b/component/mimir/rules/rules.go @@ -12,6 +12,7 @@ import ( mimirClient "github.com/grafana/agent/pkg/mimir/client" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/rulefmt" "github.com/weaveworks/common/instrument" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -60,7 +61,7 @@ type Component struct { namespaceSelector labels.Selector ruleSelector labels.Selector - currentState map[string][]mimirClient.RuleGroup + currentState map[string][]rulefmt.RuleGroup metrics *metrics healthMut sync.RWMutex diff --git a/pkg/mimir/client/client.go b/pkg/mimir/client/client.go index b7b2ce63466d..4292e37737c4 100644 --- a/pkg/mimir/client/client.go +++ b/pkg/mimir/client/client.go @@ -14,6 +14,7 @@ import ( log "github.com/go-kit/log" 
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/config" + "github.com/prometheus/prometheus/model/rulefmt" weaveworksClient "github.com/weaveworks/common/http/client" "github.com/weaveworks/common/instrument" "github.com/weaveworks/common/user" @@ -38,9 +39,9 @@ type Config struct { } type Interface interface { - CreateRuleGroup(ctx context.Context, namespace string, rg RuleGroup) error + CreateRuleGroup(ctx context.Context, namespace string, rg rulefmt.RuleGroup) error DeleteRuleGroup(ctx context.Context, namespace, groupName string) error - ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) + ListRules(ctx context.Context, namespace string) (map[string][]rulefmt.RuleGroup, error) } // MimirClient is a client to the Mimir API. diff --git a/pkg/mimir/client/rules.go b/pkg/mimir/client/rules.go index c82075b03876..54b591d958cb 100644 --- a/pkg/mimir/client/rules.go +++ b/pkg/mimir/client/rules.go @@ -9,20 +9,13 @@ import ( "gopkg.in/yaml.v3" ) -// RuleGroup is a list of sequentially evaluated recording and alerting rules. 
-type RuleGroup struct { - rulefmt.RuleGroup `yaml:",inline"` - // RWConfigs is used by the remote write forwarding ruler - RWConfigs []RemoteWriteConfig `yaml:"remote_write,omitempty"` -} - // RemoteWriteConfig is used to specify a remote write endpoint type RemoteWriteConfig struct { URL string `json:"url,omitempty"` } // CreateRuleGroup creates a new rule group -func (r *MimirClient) CreateRuleGroup(ctx context.Context, namespace string, rg RuleGroup) error { +func (r *MimirClient) CreateRuleGroup(ctx context.Context, namespace string, rg rulefmt.RuleGroup) error { payload, err := yaml.Marshal(&rg) if err != nil { return err @@ -60,7 +53,7 @@ func (r *MimirClient) DeleteRuleGroup(ctx context.Context, namespace, groupName } // ListRules retrieves a rule group -func (r *MimirClient) ListRules(ctx context.Context, namespace string) (map[string][]RuleGroup, error) { +func (r *MimirClient) ListRules(ctx context.Context, namespace string) (map[string][]rulefmt.RuleGroup, error) { path := r.apiPath op := r.apiPath if namespace != "" { @@ -80,7 +73,7 @@ func (r *MimirClient) ListRules(ctx context.Context, namespace string) (map[stri return nil, err } - ruleSet := map[string][]RuleGroup{} + ruleSet := map[string][]rulefmt.RuleGroup{} err = yaml.Unmarshal(body, &ruleSet) if err != nil { return nil, err From 8f3299b64f6774698bda0275eea62ccc4a757957 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Thu, 5 Jan 2023 15:10:56 -0600 Subject: [PATCH 36/40] Add CHANGELOG and docs --- CHANGELOG.md | 4 + .../components/mimir.rules.kubernetes.md | 202 ++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 docs/sources/flow/reference/components/mimir.rules.kubernetes.md diff --git a/CHANGELOG.md b/CHANGELOG.md index fba72a0e3548..377348d94a4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -107,8 +107,12 @@ v0.30.0 (2022-12-20) - `discovery.file` discovers files on the filesystem following glob patterns. 
(@mattdurham) + - `mimir.rules.kubernetes` discovers `PrometheusRule` kubernetes resources and + loads them into a Mimir instance. (@Logiraptor) + - Integrations: Introduce the `snowflake` integration. (@binaryfissiongames) + ### Enhancements - Integrations: Always use direct connection in mongodb_exporter integration. (@v-zhuravlev) diff --git a/docs/sources/flow/reference/components/mimir.rules.kubernetes.md b/docs/sources/flow/reference/components/mimir.rules.kubernetes.md new file mode 100644 index 000000000000..17cf4f5f1697 --- /dev/null +++ b/docs/sources/flow/reference/components/mimir.rules.kubernetes.md @@ -0,0 +1,202 @@ +--- +aliases: +- /docs/agent/latest/flow/reference/components/mimir.rules.kubernetes +title: mimir.rules.kubernetes +--- + +# mimir.rules.kubernetes + +`mimir.rules.kubernetes` discovers `PrometheusRule` kubernetes resources and +loads them into a Mimir instance. + +* Multiple `mimir.rules.kubernetes` components can be specified by giving them + different labels. +* [Kubernetes label selectors][] can be used to limit the `Namespace` and + `PrometheusRule` resources considered during reconciliation. +* Compatible with the Ruler APIs of Grafana Mimir, Grafana Cloud, and Grafana Enterprise Metrics +* Compatible with the `PrometheusRule` CRD from the [prometheus-operator][] + +[Kubernetes label selectors]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors +[prometheus-operator]: https://prometheus-operator.dev/ + +## Usage + +```river +mimir.rules.kubernetes "LABEL" { + address = MIMIR_RULER_URL +} +``` + +## Arguments + +`mimir.rules.kubernetes` supports the following arguments: + +Name | Type | Description | Default | Required +-------------------------|------------|---------------------------------------------------------|---------|--------- +`address` | `string` | URL of the Mimir ruler. 
| | yes +`tenant_id` | `string` | Mimir tenant ID | | no +`use_legacy_routes` | `bool` | Whether to use deprecated ruler API endpoints | false | no +`sync_interval` | `duration` | Amount of time between reconciliations with Mimir | "30s" | no +`mimir_namespace_prefix` | `string` | Prefix used to differentiate multiple agent deployments | "agent" | no + +If no `tenant_id` is provided, the component assumes that the Mimir instance at +`address` is running in single-tenant mode and no X-Scope-OrgID header is sent. + +The `sync_interval` argument determines how often Mimir's ruler API is accessed +to reload the current state of rules. Interaction with the kubernetes API works +differently. Updates are processed as events from the kubernetes API server +according to the informer pattern. + +The `mimir_namespace_prefix` argument can be used to separate the rules managed +by multiple agent deployments across your infrastructure. It should be set to a +unique value for each deployment. + +## Blocks + +The following blocks are supported inside the definition of +`mimir.rules.kubernetes`: + +Hierarchy | Block | Description | Required +-------------------------------------------|------------------------|----------------------------------------------------------|--------- +rule_namespace_selector | [label_selector][] | Label selector for `Namespace` resources | no +rule_namespace_selector > match_expression | [match_expression][] | Label match expression for `Namespace` resources | no +rule_selector | [label_selector][] | Label selector for `PrometheusRule` resources | no +rule_selector > match_expression | [match_expression][] | Label match expression for `PrometheusRule` resources | no +http_client_config | [http_client_config][] | HTTP client settings when connecting to the endpoint. | no +http_client_config > basic_auth | [basic_auth][] | Configure basic_auth for authenticating to the endpoint. 
| no +http_client_config > authorization | [authorization][] | Configure generic authorization to the endpoint. | no +http_client_config > oauth2 | [oauth2][] | Configure OAuth2 for authenticating to the endpoint. | no +http_client_config > oauth2 > tls_config | [tls_config][] | Configure TLS settings for connecting to the endpoint. | no +http_client_config > tls_config | [tls_config][] | Configure TLS settings for connecting to the endpoint. | no + + +The `>` symbol indicates deeper levels of nesting. For example, +`http_client_config > basic_auth` refers to a `basic_auth` block defined inside +an `http_client_config` block. + +[http_client_config]: #http_client_config-block +[basic_auth]: #basic_auth-block +[authorization]: #authorization-block +[oauth2]: #oauth2-block +[tls_config]: #tls_config-block +[label_selector]: #label_selector-block +[match_expression]: #match_expression-block + +### label_selector block + +The `label_selector` block describes a kubernetes label selector for rule or namespace discovery. + +The following arguments are supported: + +Name | Type | Description | Default | Required +---------------|---------------|---------------------------------------------------|-----------------------------|--------- +`match_labels` | `map(string)` | Label keys and values used to discover resources. | empty (match all resources) | yes + +### match_expression block + +The `match_expression` block describes a kubernetes label match expression for rule or namespace discovery. + +The following arguments are supported: + +Name | Type | Description | Default | Required +-----------|------------|----------------------------------------------------|---------|--------- +`key` | `string` | The label name to match against. | | yes +`operator` | `string` | The operator used when matching. (in,notin,exists) | | yes +`values` | `[]string` | The values used when matching. 
| | no + +### http_client_config block + +The `http_client_config` configures settings used to connect to the Mimir API. + +{{< docs/shared lookup="flow/reference/components/http-client-config-block.md" source="agent" >}} + +### basic_auth block + +{{< docs/shared lookup="flow/reference/components/basic-auth-block.md" source="agent" >}} + +### authorization block + +{{< docs/shared lookup="flow/reference/components/authorization-block.md" source="agent" >}} + +### oauth2 block + +{{< docs/shared lookup="flow/reference/components/oauth2-block.md" source="agent" >}} + +### tls_config block + +{{< docs/shared lookup="flow/reference/components/tls-config-block.md" source="agent" >}} + +## Exported fields + +`mimir.rules.kubernetes` does not export any fields. + +## Component health + +`mimir.rules.kubernetes` is reported as unhealthy if given an invalid configuration or an error occurs during reconciliation. + +## Debug information + +`mimir.rules.kubernetes` exposes resource-level debug information. + +The following are exposed per discovered `PrometheusRule` resource: +* The kubernetes namespace. +* The resource name. +* The resource uid. +* The number of rule groups. + +The following are exposed per discovered Mimir rule namespace resource: +* The namespace name. +* The number of rule groups. + +Only resources managed by the component are exposed - regardless of how many +actually exist. + +## Debug metrics + +* `mimir_rules_config_updates_total` (counter): Number of times the configuration has been updated. +* `mimir_rules_events_total` (counter): Number of events processed, partitioned by event type. +* `mimir_rules_events_failed_total` (counter): Number of events that failed to be processed, partitioned by event type. +* `mimir_rules_events_retried_total` (counter): Number of events that were retried, partitioned by event type. +* `mimir_rules_client_request_duration_seconds` (histogram): Duration of requests to the Mimir API. 
+ +## Example + +This example creates a `mimir.rules.kubernetes` component that loads discovered +rules to a local Mimir instance under the `team-a` tenant. Only namespaces and +rules with the `agent` label set to `yes` are included. + +```river +mimir.rules.kubernetes "local" { + address = "mimir:8080" + tenant_id = "team-a" + + rule_namespace_selector { + match_labels = { + agent = "yes", + } + } + + rule_selector { + match_labels = { + agent = "yes", + } + } +} +``` + +This example creates a `mimir.rules.kubernetes` component that loads discovered +rules to Grafana Cloud. + +``` +mimir.rules.kubernetes "default" { + address = "GRAFANA_CLOUD_METRICS_URL" + http_client_config { + basic_auth { + username = "GRAFANA_CLOUD_USER" + password = "GRAFANA_CLOUD_API_KEY" + // Alternatively, load the password from a file: + // password_file = "GRAFANA_CLOUD_API_KEY_PATH" + } + } +} +``` \ No newline at end of file From 5185d9ab91d74515887304da02187d5a1ea25df9 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Thu, 5 Jan 2023 15:11:12 -0600 Subject: [PATCH 37/40] Rename component directory to match naming --- component/all/all.go | 2 +- component/mimir/rules/{ => kubernetes}/debug.go | 0 component/mimir/rules/{ => kubernetes}/diff.go | 0 component/mimir/rules/{ => kubernetes}/diff_test.go | 0 component/mimir/rules/{ => kubernetes}/events.go | 0 component/mimir/rules/{ => kubernetes}/events_test.go | 0 component/mimir/rules/{ => kubernetes}/health.go | 0 component/mimir/rules/{ => kubernetes}/rules.go | 0 component/mimir/rules/{ => kubernetes}/rules_test.go | 0 component/mimir/rules/{ => kubernetes}/types.go | 6 +++--- 10 files changed, 4 insertions(+), 4 deletions(-) rename component/mimir/rules/{ => kubernetes}/debug.go (100%) rename component/mimir/rules/{ => kubernetes}/diff.go (100%) rename component/mimir/rules/{ => kubernetes}/diff_test.go (100%) rename component/mimir/rules/{ => kubernetes}/events.go (100%) rename component/mimir/rules/{ => 
kubernetes}/events_test.go (100%) rename component/mimir/rules/{ => kubernetes}/health.go (100%) rename component/mimir/rules/{ => kubernetes}/rules.go (100%) rename component/mimir/rules/{ => kubernetes}/rules_test.go (100%) rename component/mimir/rules/{ => kubernetes}/types.go (90%) diff --git a/component/all/all.go b/component/all/all.go index 877dfa768065..52e0a661e0d4 100644 --- a/component/all/all.go +++ b/component/all/all.go @@ -11,7 +11,7 @@ import ( _ "github.com/grafana/agent/component/loki/relabel" // Import loki.relabel _ "github.com/grafana/agent/component/loki/source/file" // Import loki.source.file _ "github.com/grafana/agent/component/loki/write" // Import loki.write - _ "github.com/grafana/agent/component/mimir/rules" // Import mimir.rules + _ "github.com/grafana/agent/component/mimir/rules/kubernetes" // Import mimir.rules.kubernetes _ "github.com/grafana/agent/component/otelcol/auth/basic" // Import otelcol.auth.basic _ "github.com/grafana/agent/component/otelcol/auth/bearer" // Import otelcol.auth.bearer _ "github.com/grafana/agent/component/otelcol/auth/headers" // Import otelcol.auth.headers diff --git a/component/mimir/rules/debug.go b/component/mimir/rules/kubernetes/debug.go similarity index 100% rename from component/mimir/rules/debug.go rename to component/mimir/rules/kubernetes/debug.go diff --git a/component/mimir/rules/diff.go b/component/mimir/rules/kubernetes/diff.go similarity index 100% rename from component/mimir/rules/diff.go rename to component/mimir/rules/kubernetes/diff.go diff --git a/component/mimir/rules/diff_test.go b/component/mimir/rules/kubernetes/diff_test.go similarity index 100% rename from component/mimir/rules/diff_test.go rename to component/mimir/rules/kubernetes/diff_test.go diff --git a/component/mimir/rules/events.go b/component/mimir/rules/kubernetes/events.go similarity index 100% rename from component/mimir/rules/events.go rename to component/mimir/rules/kubernetes/events.go diff --git 
a/component/mimir/rules/events_test.go b/component/mimir/rules/kubernetes/events_test.go similarity index 100% rename from component/mimir/rules/events_test.go rename to component/mimir/rules/kubernetes/events_test.go diff --git a/component/mimir/rules/health.go b/component/mimir/rules/kubernetes/health.go similarity index 100% rename from component/mimir/rules/health.go rename to component/mimir/rules/kubernetes/health.go diff --git a/component/mimir/rules/rules.go b/component/mimir/rules/kubernetes/rules.go similarity index 100% rename from component/mimir/rules/rules.go rename to component/mimir/rules/kubernetes/rules.go diff --git a/component/mimir/rules/rules_test.go b/component/mimir/rules/kubernetes/rules_test.go similarity index 100% rename from component/mimir/rules/rules_test.go rename to component/mimir/rules/kubernetes/rules_test.go diff --git a/component/mimir/rules/types.go b/component/mimir/rules/kubernetes/types.go similarity index 90% rename from component/mimir/rules/types.go rename to component/mimir/rules/kubernetes/types.go index 8ea8adae4d87..7c335c0fda7c 100644 --- a/component/mimir/rules/types.go +++ b/component/mimir/rules/kubernetes/types.go @@ -9,9 +9,9 @@ import ( type Arguments struct { Address string `river:"address,attr"` - TenantID string `river:"tenant_id,attr"` + TenantID string `river:"tenant_id,attr,optional"` UseLegacyRoutes bool `river:"use_legacy_routes,attr,optional"` - HTTPClientConfig config.HTTPClientConfig `river:"client,block,optional"` + HTTPClientConfig config.HTTPClientConfig `river:"http_client_config,block,optional"` SyncInterval time.Duration `river:"sync_interval,attr,optional"` MimirNameSpacePrefix string `river:"mimir_namespace_prefix,attr,optional"` @@ -50,5 +50,5 @@ type LabelSelector struct { type MatchExpression struct { Key string `river:"key,attr"` Operator string `river:"operator,attr"` - Values []string `river:"values,attr"` + Values []string `river:"values,attr,optional"` } From 
0bd63f942dd43bbf958815071b360cdbd885092a Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Thu, 5 Jan 2023 15:46:59 -0600 Subject: [PATCH 38/40] Define a type for the namespace-grouped rule groups --- component/mimir/rules/kubernetes/diff.go | 13 ++++++++----- component/mimir/rules/kubernetes/events.go | 4 ++-- component/mimir/rules/kubernetes/rules.go | 3 +-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/component/mimir/rules/kubernetes/diff.go b/component/mimir/rules/kubernetes/diff.go index aeaa01785636..34c74ed62e37 100644 --- a/component/mimir/rules/kubernetes/diff.go +++ b/component/mimir/rules/kubernetes/diff.go @@ -21,13 +21,16 @@ type ruleGroupDiff struct { Desired rulefmt.RuleGroup } -func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual map[string][]rulefmt.RuleGroup) map[string][]ruleGroupDiff { - seen := map[string]bool{} +type ruleGroupsByNamespace map[string][]rulefmt.RuleGroup +type ruleGroupDiffsByNamespace map[string][]ruleGroupDiff - diff := make(map[string][]ruleGroupDiff) +func diffRuleState(desired, actual ruleGroupsByNamespace) ruleGroupDiffsByNamespace { + seenNamespaces := map[string]bool{} + + diff := make(ruleGroupDiffsByNamespace) for namespace, desiredRuleGroups := range desired { - seen[namespace] = true + seenNamespaces[namespace] = true actualRuleGroups := actual[namespace] subDiff := diffRuleNamespaceState(desiredRuleGroups, actualRuleGroups) @@ -40,7 +43,7 @@ func diffRuleState(desired map[string][]rulefmt.RuleGroup, actual map[string][]r } for namespace, actualRuleGroups := range actual { - if seen[namespace] { + if seenNamespaces[namespace] { continue } diff --git a/component/mimir/rules/kubernetes/events.go b/component/mimir/rules/kubernetes/events.go index a2361910814a..e4f606f2ff7b 100644 --- a/component/mimir/rules/kubernetes/events.go +++ b/component/mimir/rules/kubernetes/events.go @@ -169,13 +169,13 @@ func (c *Component) reconcileState(ctx context.Context) error { return result } -func (c 
*Component) loadStateFromK8s() (map[string][]rulefmt.RuleGroup, error) { +func (c *Component) loadStateFromK8s() (ruleGroupsByNamespace, error) { matchedNamespaces, err := c.namespaceLister.List(c.namespaceSelector) if err != nil { return nil, fmt.Errorf("failed to list namespaces: %w", err) } - desiredState := map[string][]rulefmt.RuleGroup{} + desiredState := make(ruleGroupsByNamespace) for _, ns := range matchedNamespaces { crdState, err := c.ruleLister.PrometheusRules(ns.Name).List(c.ruleSelector) if err != nil { diff --git a/component/mimir/rules/kubernetes/rules.go b/component/mimir/rules/kubernetes/rules.go index 15f809de0743..a957277b1b1c 100644 --- a/component/mimir/rules/kubernetes/rules.go +++ b/component/mimir/rules/kubernetes/rules.go @@ -12,7 +12,6 @@ import ( mimirClient "github.com/grafana/agent/pkg/mimir/client" promListers "github.com/prometheus-operator/prometheus-operator/pkg/client/listers/monitoring/v1" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/prometheus/model/rulefmt" "github.com/weaveworks/common/instrument" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -61,7 +60,7 @@ type Component struct { namespaceSelector labels.Selector ruleSelector labels.Selector - currentState map[string][]rulefmt.RuleGroup + currentState ruleGroupsByNamespace metrics *metrics healthMut sync.RWMutex From 881c66a1ad133f642f0c8d301b9523655bc97842 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 6 Jan 2023 14:15:01 -0600 Subject: [PATCH 39/40] Apply suggestions from code review Co-authored-by: Robert Fratto Co-authored-by: Karen Germond <110922559+karengermond@users.noreply.github.com> --- CHANGELOG.md | 2 +- .../components/mimir.rules.kubernetes.md | 48 +++++++++++-------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 377348d94a4e..7e52849893a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -107,7 +107,7 @@ v0.30.0 (2022-12-20) - 
`discovery.file` discovers files on the filesystem following glob patterns. (@mattdurham) - - `mimir.rules.kubernetes` discovers `PrometheusRule` kubernetes resources and + - `mimir.rules.kubernetes` discovers `PrometheusRule` Kubernetes resources and loads them into a Mimir instance. (@Logiraptor) - Integrations: Introduce the `snowflake` integration. (@binaryfissiongames) diff --git a/docs/sources/flow/reference/components/mimir.rules.kubernetes.md b/docs/sources/flow/reference/components/mimir.rules.kubernetes.md index 17cf4f5f1697..97cae5b0ffd1 100644 --- a/docs/sources/flow/reference/components/mimir.rules.kubernetes.md +++ b/docs/sources/flow/reference/components/mimir.rules.kubernetes.md @@ -6,15 +6,15 @@ title: mimir.rules.kubernetes # mimir.rules.kubernetes -`mimir.rules.kubernetes` discovers `PrometheusRule` kubernetes resources and +`mimir.rules.kubernetes` discovers `PrometheusRule` Kubernetes resources and loads them into a Mimir instance. * Multiple `mimir.rules.kubernetes` components can be specified by giving them different labels. * [Kubernetes label selectors][] can be used to limit the `Namespace` and `PrometheusRule` resources considered during reconciliation. -* Compatible with the Ruler APIs of Grafana Mimir, Grafana Cloud, and Grafana Enterprise Metrics -* Compatible with the `PrometheusRule` CRD from the [prometheus-operator][] +* Compatible with the Ruler APIs of Grafana Mimir, Grafana Cloud, and Grafana Enterprise Metrics. +* Compatible with the `PrometheusRule` CRD from the [prometheus-operator][]. [Kubernetes label selectors]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors [prometheus-operator]: https://prometheus-operator.dev/ @@ -34,17 +34,17 @@ mimir.rules.kubernetes "LABEL" { Name | Type | Description | Default | Required -------------------------|------------|---------------------------------------------------------|---------|--------- `address` | `string` | URL of the Mimir ruler. 
| | yes -`tenant_id` | `string` | Mimir tenant ID | | no -`use_legacy_routes` | `bool` | Whether to use deprecated ruler API endpoints | false | no -`sync_interval` | `duration` | Amount of time between reconciliations with Mimir | "30s" | no -`mimir_namespace_prefix` | `string` | Prefix used to differentiate multiple agent deployments | "agent" | no +`tenant_id` | `string` | Mimir tenant ID. | | no +`use_legacy_routes` | `bool` | Whether to use deprecated ruler API endpoints. | false | no +`sync_interval` | `duration` | Amount of time between reconciliations with Mimir. | "30s" | no +`mimir_namespace_prefix` | `string` | Prefix used to differentiate multiple agent deployments. | "agent" | no If no `tenant_id` is provided, the component assumes that the Mimir instance at -`address` is running in single-tenant mode and no X-Scope-OrgID header is sent. +`address` is running in single-tenant mode and no `X-Scope-OrgID` header is sent. The `sync_interval` argument determines how often Mimir's ruler API is accessed -to reload the current state of rules. Interaction with the kubernetes API works -differently. Updates are processed as events from the kubernetes API server +to reload the current state of rules. Interaction with the Kubernetes API works +differently. Updates are processed as events from the Kubernetes API server according to the informer pattern. 
The `mimir_namespace_prefix` argument can be used to separate the rules managed @@ -58,10 +58,10 @@ The following blocks are supported inside the definition of Hierarchy | Block | Description | Required -------------------------------------------|------------------------|----------------------------------------------------------|--------- -rule_namespace_selector | [label_selector][] | Label selector for `Namespace` resources | no -rule_namespace_selector > match_expression | [match_expression][] | Label match expression for `Namespace` resources | no -rule_selector | [label_selector][] | Label selector for `PrometheusRule` resources | no -rule_selector > match_expression | [match_expression][] | Label match expression for `PrometheusRule` resources | no +rule_namespace_selector | [label_selector][] | Label selector for `Namespace` resources. | no +rule_namespace_selector > match_expression | [match_expression][] | Label match expression for `Namespace` resources. | no +rule_selector | [label_selector][] | Label selector for `PrometheusRule` resources. | no +rule_selector > match_expression | [match_expression][] | Label match expression for `PrometheusRule` resources. | no http_client_config | [http_client_config][] | HTTP client settings when connecting to the endpoint. | no http_client_config > basic_auth | [basic_auth][] | Configure basic_auth for authenticating to the endpoint. | no http_client_config > authorization | [authorization][] | Configure generic authorization to the endpoint. | no @@ -84,26 +84,34 @@ an `http_client_config` block. ### label_selector block -The `label_selector` block describes a kubernetes label selector for rule or namespace discovery. +The `label_selector` block describes a Kubernetes label selector for rule or namespace discovery. 
The following arguments are supported: Name | Type | Description | Default | Required ---------------|---------------|---------------------------------------------------|-----------------------------|--------- -`match_labels` | `map(string)` | Label keys and values used to discover resources. | empty (match all resources) | yes +`match_labels` | `map(string)` | Label keys and values used to discover resources. | `{}` | yes + +When the `match_labels` argument is empty, all resources will be matched. ### match_expression block -The `match_expression` block describes a kubernetes label match expression for rule or namespace discovery. +The `match_expression` block describes a Kubernetes label match expression for rule or namespace discovery. The following arguments are supported: Name | Type | Description | Default | Required -----------|------------|----------------------------------------------------|---------|--------- `key` | `string` | The label name to match against. | | yes -`operator` | `string` | The operator used when matching. (in,notin,exists) | | yes +`operator` | `string` | The operator to use when matching. | | yes `values` | `[]string` | The values used when matching. | | no +The `operator` argument should be one of the following strings: + +* `"in"` +* `"notin"` +* `"exists"` + ### http_client_config block The `http_client_config` configures settings used to connect to the Mimir API. @@ -139,7 +147,7 @@ The `http_client_config` configures settings used to connect to the Mimir API. `mimir.rules.kubernetes` exposes resource-level debug information. The following are exposed per discovered `PrometheusRule` resource: -* The kubernetes namespace. +* The Kubernetes namespace. * The resource name. * The resource uid. * The number of rule groups. @@ -187,7 +195,7 @@ mimir.rules.kubernetes "local" { This example creates a `mimir.rules.kubernetes` component that loads discovered rules to Grafana Cloud. 
-``` +```river mimir.rules.kubernetes "default" { address = "GRAFANA_CLOUD_METRICS_URL" http_client_config { From a0b5f4a8f1be8dfd870d8694e359d9a375dc68a7 Mon Sep 17 00:00:00 2001 From: Patrick Oyarzun Date: Fri, 6 Jan 2023 14:22:16 -0600 Subject: [PATCH 40/40] Replace metric descriptions with a table --- .../reference/components/mimir.rules.kubernetes.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/sources/flow/reference/components/mimir.rules.kubernetes.md b/docs/sources/flow/reference/components/mimir.rules.kubernetes.md index 97cae5b0ffd1..523e000910f0 100644 --- a/docs/sources/flow/reference/components/mimir.rules.kubernetes.md +++ b/docs/sources/flow/reference/components/mimir.rules.kubernetes.md @@ -161,11 +161,13 @@ actually exist. ## Debug metrics -* `mimir_rules_config_updates_total` (counter): Number of times the configuration has been updated. -* `mimir_rules_events_total` (counter): Number of events processed, partitioned by event type. -* `mimir_rules_events_failed_total` (counter): Number of events that failed to be processed, partitioned by event type. -* `mimir_rules_events_retried_total` (counter): Number of events that were retried, partitioned by event type. -* `mimir_rules_client_request_duration_seconds` (histogram): Duration of requests to the Mimir API. +Metric Name | Type | Description +----------------------------------------------|-------------|------------------------------------------------------------------------- +`mimir_rules_config_updates_total` | `counter` | Number of times the configuration has been updated. +`mimir_rules_events_total` | `counter` | Number of events processed, partitioned by event type. +`mimir_rules_events_failed_total` | `counter` | Number of events that failed to be processed, partitioned by event type. +`mimir_rules_events_retried_total` | `counter` | Number of events that were retried, partitioned by event type. 
+`mimir_rules_client_request_duration_seconds` | `histogram` | Duration of requests to the Mimir API. ## Example