From b643653a6360ea8d4e555d9a52e25073e53ca0f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gu=C3=B0mundur=20Bj=C3=B6rn=20Birkisson?= Date: Tue, 5 Sep 2023 16:33:14 +0200 Subject: [PATCH] Add denominator adjusted SLOs (#7) * Add denominator adjusted SLOs Add recording rules for storing how the current number of events compares to the average of the preceding 30 days. This allows an SLO to burn faster or slower depending on whether there are a lot or few events in the denominator. This will make sure that SLOs burn slower at night when there are few events. --- ...loth.slok.dev_prometheusservicelevels.yaml | 28 + internal/k8sprometheus/spec.go | 8 + internal/prometheus/model.go | 28 +- internal/prometheus/recording_rules.go | 106 ++++ pkg/kubernetes/api/sloth/v1/README.md | 48 ++ pkg/kubernetes/api/sloth/v1/types.go | 27 + .../api/sloth/v1/zz_generated.deepcopy.go | 31 + ...loth.slok.dev_prometheusservicelevels.yaml | 28 + test/integration/prometheus/generate_test.go | 5 + .../testdata/in-base-k8s-denom.yaml | 45 ++ .../testdata/out-base-k8s-denom.yaml.tpl | 589 ++++++++++++++++++ 11 files changed, 941 insertions(+), 2 deletions(-) create mode 100644 test/integration/prometheus/testdata/in-base-k8s-denom.yaml create mode 100644 test/integration/prometheus/testdata/out-base-k8s-denom.yaml.tpl diff --git a/deploy/kubernetes/helm/sloth/crds/sloth.slok.dev_prometheusservicelevels.yaml b/deploy/kubernetes/helm/sloth/crds/sloth.slok.dev_prometheusservicelevels.yaml index 7283e667..aae9e8b0 100644 --- a/deploy/kubernetes/helm/sloth/crds/sloth.slok.dev_prometheusservicelevels.yaml +++ b/deploy/kubernetes/helm/sloth/crds/sloth.slok.dev_prometheusservicelevels.yaml @@ -167,6 +167,34 @@ spec: description: SLI is the indicator (service level indicator) for this specific SLO. properties: + denominator_corrected: + description: DenominatorCorrected is the denominator corrected + events SLI type. 
+ properties: + errorQuery: + description: ErrorQuery is a Prometheus query that will + get the number/count of events that we consider that + are bad for the SLO (e.g "http 5xx", "latency > 250ms"...). + Requires the usage of `{{.window}}` template variable. + ErrorQuery and SuccessQuery are mutually exclusive. + type: string + successQuery: + description: SuccessQuery is a Prometheus query that + will get the number/count of events that we consider + that are good for the SLO (e.g "http not 5xx", "latency + < 250ms"...). Requires the usage of `{{.window}}` + template variable. ErrorQuery and SuccessQuery are + mutually exclusive. + type: string + totalQuery: + description: TotalQuery is a Prometheus query that will + get the total number/count of events for the SLO (e.g + "all http requests"...). Requires the usage of `{{.window}}` + template variable. + type: string + required: + - totalQuery + type: object events: description: Events is the events SLI type. properties: diff --git a/internal/k8sprometheus/spec.go b/internal/k8sprometheus/spec.go index d2226435..fbc4d1af 100644 --- a/internal/k8sprometheus/spec.go +++ b/internal/k8sprometheus/spec.go @@ -119,6 +119,14 @@ func mapSpecToModel(ctx context.Context, defaultWindowPeriod time.Duration, plug } } + if specSLO.SLI.DenominatorCorrected != nil { + slo.SLI.DenominatorCorrected = &prometheus.SLIDenominatorCorrectedEvents{ + ErrorQuery: specSLO.SLI.DenominatorCorrected.ErrorQuery, + SuccessQuery: specSLO.SLI.DenominatorCorrected.SuccessQuery, + TotalQuery: specSLO.SLI.DenominatorCorrected.TotalQuery, + } + } + if specSLO.SLI.Plugin != nil { plugin, err := pluginsRepo.GetSLIPlugin(ctx, specSLO.SLI.Plugin.ID) if err != nil { diff --git a/internal/prometheus/model.go b/internal/prometheus/model.go index eb9c1532..7982c52b 100644 --- a/internal/prometheus/model.go +++ b/internal/prometheus/model.go @@ -16,8 +16,9 @@ import ( // SLI reprensents an SLI with custom error and total expressions. 
type SLI struct { - Raw *SLIRaw - Events *SLIEvents + Raw *SLIRaw + Events *SLIEvents + DenominatorCorrected *SLIDenominatorCorrectedEvents } type SLIRaw struct { @@ -29,6 +30,12 @@ type SLIEvents struct { TotalQuery string `validate:"required,prom_expr,template_vars"` } +type SLIDenominatorCorrectedEvents struct { + ErrorQuery *string `validate:"omitempty,prom_expr,template_vars"` + SuccessQuery *string `validate:"omitempty,prom_expr,template_vars"` + TotalQuery string `validate:"required,prom_expr,template_vars"` +} + // AlertMeta is the metadata of an alert settings. type AlertMeta struct { Disable bool @@ -90,6 +97,7 @@ var modelSpecValidate = func() *validator.Validate { v.RegisterStructValidation(validateOneSLI, SLI{}) v.RegisterStructValidation(validateSLOGroup, SLOGroup{}) v.RegisterStructValidation(validateSLIEvents, SLIEvents{}) + v.RegisterStructValidation(validateDenominatorCorrected, SLIDenominatorCorrectedEvents{}) return v }() @@ -258,6 +266,22 @@ func validateOneSLI(sl validator.StructLevel) { } } +func validateDenominatorCorrected(sl validator.StructLevel) { + denominatorCorrected, ok := sl.Current().Interface().(SLIDenominatorCorrectedEvents) + if !ok { + sl.ReportError(denominatorCorrected, "", "SLIDenominatorCorrectedEvents", "not_denominator_corrected", "") + return + } + + if denominatorCorrected.ErrorQuery != nil && denominatorCorrected.SuccessQuery != nil { + sl.ReportError(denominatorCorrected, "", "", "query_repeated", "") + } + + if denominatorCorrected.ErrorQuery == nil && denominatorCorrected.SuccessQuery == nil { + sl.ReportError(denominatorCorrected, "", "", "no_query_supplied", "") + } +} + // validateSLOGroup validates SLO IDs are not repeated. 
func validateSLOGroup(sl validator.StructLevel) { sloGroup, ok := sl.Current().Interface().(SLOGroup) diff --git a/internal/prometheus/recording_rules.go b/internal/prometheus/recording_rules.go index 45411457..723fab6f 100644 --- a/internal/prometheus/recording_rules.go +++ b/internal/prometheus/recording_rules.go @@ -70,6 +70,8 @@ func factorySLIRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBA // Raw based SLI. case slo.SLI.Raw != nil: return rawSLIRecordGenerator(slo, window, alerts) + case slo.SLI.DenominatorCorrected != nil: + return denominatorCorrectedSLIRecordGenerator(slo, window, alerts) } return nil, fmt.Errorf("invalid SLI type") @@ -141,6 +143,64 @@ func eventsSLIRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBAl }, nil } +func denominatorCorrectedSLIRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBAlertGroup) (*rulefmt.Rule, error) { + var sliExprTpl string + + if slo.SLI.DenominatorCorrected.ErrorQuery != nil { + const sliExprTplFmt = `( +slo:numerator_correction:ratio{{.window}}{{.filter}} +* on() +%s +) +/ +(%s) +` + sliExprTpl = fmt.Sprintf(sliExprTplFmt, *slo.SLI.DenominatorCorrected.ErrorQuery, slo.SLI.DenominatorCorrected.TotalQuery) + } else if slo.SLI.DenominatorCorrected.SuccessQuery != nil { + const sliExprTplFmt = `slo:numerator_correction:ratio{{.window}}{{.filter}} +* on() (1 - +( +%s +) +/ +(%s) +) +` + sliExprTpl = fmt.Sprintf(sliExprTplFmt, *slo.SLI.DenominatorCorrected.SuccessQuery, slo.SLI.DenominatorCorrected.TotalQuery) + } else { + return nil, fmt.Errorf("missing error or success query") + } + + // Render with our templated data. 
+ tpl, err := template.New("sliExpr").Option("missingkey=error").Parse(sliExprTpl) + if err != nil { + return nil, fmt.Errorf("could not create SLI expression template data: %w", err) + } + + strWindow := timeDurationToPromStr(window) + var b bytes.Buffer + err = tpl.Execute(&b, map[string]string{ + tplKeyWindow: strWindow, + "filter": labelsToPromFilter(slo.GetSLOIDPromLabels()), + "windowKey": sloWindowLabelName, + }) + if err != nil { + return nil, fmt.Errorf("could not render SLI expression template: %w", err) + } + + return &rulefmt.Rule{ + Record: slo.GetSLIErrorMetric(window), + Expr: b.String(), + Labels: mergeLabels( + slo.GetSLOIDPromLabels(), + map[string]string{ + sloWindowLabelName: strWindow, + }, + slo.Labels, + ), + }, nil +} + // optimizedSLIRecordGenerator gets a SLI recording rule from other SLI recording rules. This optimization // will make Prometheus consume less CPU and memory, however the result will be less accurate. Used wisely // is a good tradeoff. For example on calculating informative metrics like total period window (30d). @@ -302,9 +362,55 @@ func (m metadataRecordingRulesGenerator) GenerateMetadataRecordingRules(ctx cont }, } + if slo.SLI.DenominatorCorrected != nil { + windows := getAlertGroupWindows(alerts) + windows = append(windows, slo.TimeWindow) // Add the total time window as a handy helper. 
+ for _, window := range windows { + rule, err := createNumeratorCorrection(slo, labels, window) + if err != nil { + return nil, fmt.Errorf("could not create numerator rule: %v", err) + } + rules = append(rules, *rule) + } + } + return rules, nil } +func createNumeratorCorrection(slo SLO, labels map[string]string, window time.Duration) (*rulefmt.Rule, error) { + windowString := timeDurationToPromStr(window) + metricSLONumeratorCorrection := fmt.Sprintf("slo:numerator_correction:ratio%s", windowString) + totalquery := slo.SLI.DenominatorCorrected.TotalQuery + + tpl, err := template.New("sliExpr").Option("missingkey=error").Parse(totalquery) + if err != nil { + return nil, fmt.Errorf("could not create %s expression template data: %w", metricSLONumeratorCorrection, err) + } + + var numeratorBuffer bytes.Buffer + err = tpl.Execute(&numeratorBuffer, map[string]string{ + tplKeyWindow: windowString, + }) + if err != nil { + return nil, fmt.Errorf("could not create numerator for %s: %w", metricSLONumeratorCorrection, err) + } + + denominatorWindow := timeDurationToPromStr(time.Hour * 24 * 30) + var denominatorBuffer bytes.Buffer + err = tpl.Execute(&denominatorBuffer, map[string]string{ + tplKeyWindow: denominatorWindow, + }) + if err != nil { + return nil, fmt.Errorf("could not create denominator for %s: %w", metricSLONumeratorCorrection, err) + } + + return &rulefmt.Rule{ + Record: metricSLONumeratorCorrection, + Expr: fmt.Sprintf(`(%s)/(%s)`, numeratorBuffer.String(), denominatorBuffer.String()), + Labels: labels, + }, nil +} + var burnRateRecordingExprTpl = template.Must(template.New("burnRateExpr").Option("missingkey=error").Parse(`{{ .SLIErrorMetric }}{{ .MetricFilter }} / on({{ .SLOIDName }}, {{ .SLOLabelName }}, {{ .SLOServiceName }}) group_left {{ .ErrorBudgetRatioMetric }}{{ .MetricFilter }} diff --git a/pkg/kubernetes/api/sloth/v1/README.md b/pkg/kubernetes/api/sloth/v1/README.md index 46ef875c..117e8c06 100755 --- a/pkg/kubernetes/api/sloth/v1/README.md +++ 
b/pkg/kubernetes/api/sloth/v1/README.md @@ -35,6 +35,9 @@ import "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" - [type SLI](<#type-sli>) - [func (in *SLI) DeepCopy() *SLI](<#func-sli-deepcopy>) - [func (in *SLI) DeepCopyInto(out *SLI)](<#func-sli-deepcopyinto>) +- [type SLIDenominatorCorrected](<#type-slidenominatorcorrected>) + - [func (in *SLIDenominatorCorrected) DeepCopy() *SLIDenominatorCorrected](<#func-slidenominatorcorrected-deepcopy>) + - [func (in *SLIDenominatorCorrected) DeepCopyInto(out *SLIDenominatorCorrected)](<#func-slidenominatorcorrected-deepcopyinto>) - [type SLIEvents](<#type-slievents>) - [func (in *SLIEvents) DeepCopy() *SLIEvents](<#func-slievents-deepcopy>) - [func (in *SLIEvents) DeepCopyInto(out *SLIEvents)](<#func-slievents-deepcopyinto>) @@ -339,6 +342,10 @@ type SLI struct { // +optional Events *SLIEvents `json:"events,omitempty"` + // DenominatorCorrected is the denominator corrected events SLI type. + // +optional + DenominatorCorrected *SLIDenominatorCorrected `json:"denominator_corrected,omitempty"` + // Plugin is the pluggable SLI type. // +optional Plugin *SLIPlugin `json:"plugin,omitempty"` @@ -361,6 +368,47 @@ func (in *SLI) DeepCopyInto(out *SLI) DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non\-nil. +## type SLIDenominatorCorrected + +SLIDenominatorCorrected is an SLI that is calculated as the division of bad events and total events, or 1 \- \(good / total\) events giving a ratio SLI. This SLI is corrected based on the total number of events for the last 30d, meaning that low\-event hours will have less impact on burn\-rate than high\-event hours. In other words, ratios with low denominators will have less impact. + +```go +type SLIDenominatorCorrected struct { + // ErrorQuery is a Prometheus query that will get the number/count of events + // that we consider that are bad for the SLO (e.g "http 5xx", "latency > 250ms"...). 
+ // Requires the usage of `{{.window}}` template variable. ErrorQuery and + // SuccessQuery are mutually exclusive. + ErrorQuery *string `json:"errorQuery,omitempty"` + + // SuccessQuery is a Prometheus query that will get the number/count of events + // that we consider that are good for the SLO (e.g "http not 5xx", "latency < 250ms"...). + // Requires the usage of `{{.window}}` template variable. ErrorQuery and + // SuccessQuery are mutually exclusive. + SuccessQuery *string `json:"successQuery,omitempty"` + + // TotalQuery is a Prometheus query that will get the total number/count of events + // for the SLO (e.g "all http requests"...). + // Requires the usage of `{{.window}}` template variable. + TotalQuery string `json:"totalQuery"` +} +``` + +### func \(\*SLIDenominatorCorrected\) DeepCopy + +```go +func (in *SLIDenominatorCorrected) DeepCopy() *SLIDenominatorCorrected +``` + +DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SLIDenominatorCorrected. + +### func \(\*SLIDenominatorCorrected\) DeepCopyInto + +```go +func (in *SLIDenominatorCorrected) DeepCopyInto(out *SLIDenominatorCorrected) +``` + +DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non\-nil. + ## type SLIEvents SLIEvents is an SLI that is calculated as the division of bad events and total events, giving a ratio SLI. Normally this is the most common ratio type. diff --git a/pkg/kubernetes/api/sloth/v1/types.go b/pkg/kubernetes/api/sloth/v1/types.go index 00e27c73..ab3199c8 100644 --- a/pkg/kubernetes/api/sloth/v1/types.go +++ b/pkg/kubernetes/api/sloth/v1/types.go @@ -94,6 +94,10 @@ type SLI struct { // +optional Events *SLIEvents `json:"events,omitempty"` + // DenominatorCorrected is the denominator corrected events SLI type. + // +optional + DenominatorCorrected *SLIDenominatorCorrected `json:"denominator_corrected,omitempty"` + // Plugin is the pluggable SLI type. 
// +optional Plugin *SLIPlugin `json:"plugin,omitempty"` @@ -120,6 +124,29 @@ type SLIEvents struct { TotalQuery string `json:"totalQuery"` } +// SLIDenominatorCorrected is an SLI that is calculated as the division of bad events and total events, or +// 1 - (good / total) events giving a ratio SLI. This SLI is corrected based on the total number of events +// for the last 30d, meaning that low-event hours will have less impact on burn-rate than high-event hours. +// In other words, ratios with low denominators will have less impact. +type SLIDenominatorCorrected struct { + // ErrorQuery is a Prometheus query that will get the number/count of events + // that we consider that are bad for the SLO (e.g "http 5xx", "latency > 250ms"...). + // Requires the usage of `{{.window}}` template variable. ErrorQuery and + // SuccessQuery are mutually exclusive. + ErrorQuery *string `json:"errorQuery,omitempty"` + + // SuccessQuery is a Prometheus query that will get the number/count of events + // that we consider that are good for the SLO (e.g "http not 5xx", "latency < 250ms"...). + // Requires the usage of `{{.window}}` template variable. ErrorQuery and + // SuccessQuery are mutually exclusive. + SuccessQuery *string `json:"successQuery,omitempty"` + + // TotalQuery is a Prometheus query that will get the total number/count of events + // for the SLO (e.g "all http requests"...). + // Requires the usage of `{{.window}}` template variable. + TotalQuery string `json:"totalQuery"` +} + // SLIPlugin will use the SLI returned by the SLI plugin selected along with the options. type SLIPlugin struct { // Name is the name of the plugin that needs to load. 
diff --git a/pkg/kubernetes/api/sloth/v1/zz_generated.deepcopy.go b/pkg/kubernetes/api/sloth/v1/zz_generated.deepcopy.go index ea5cdfb9..8925c56c 100644 --- a/pkg/kubernetes/api/sloth/v1/zz_generated.deepcopy.go +++ b/pkg/kubernetes/api/sloth/v1/zz_generated.deepcopy.go @@ -195,6 +195,11 @@ func (in *SLI) DeepCopyInto(out *SLI) { *out = new(SLIEvents) **out = **in } + if in.DenominatorCorrected != nil { + in, out := &in.DenominatorCorrected, &out.DenominatorCorrected + *out = new(SLIDenominatorCorrected) + (*in).DeepCopyInto(*out) + } if in.Plugin != nil { in, out := &in.Plugin, &out.Plugin *out = new(SLIPlugin) @@ -213,6 +218,32 @@ func (in *SLI) DeepCopy() *SLI { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SLIDenominatorCorrected) DeepCopyInto(out *SLIDenominatorCorrected) { + *out = *in + if in.ErrorQuery != nil { + in, out := &in.ErrorQuery, &out.ErrorQuery + *out = new(string) + **out = **in + } + if in.SuccessQuery != nil { + in, out := &in.SuccessQuery, &out.SuccessQuery + *out = new(string) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SLIDenominatorCorrected. +func (in *SLIDenominatorCorrected) DeepCopy() *SLIDenominatorCorrected { + if in == nil { + return nil + } + out := new(SLIDenominatorCorrected) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SLIEvents) DeepCopyInto(out *SLIEvents) { *out = *in diff --git a/pkg/kubernetes/gen/crd/sloth.slok.dev_prometheusservicelevels.yaml b/pkg/kubernetes/gen/crd/sloth.slok.dev_prometheusservicelevels.yaml index 7283e667..aae9e8b0 100644 --- a/pkg/kubernetes/gen/crd/sloth.slok.dev_prometheusservicelevels.yaml +++ b/pkg/kubernetes/gen/crd/sloth.slok.dev_prometheusservicelevels.yaml @@ -167,6 +167,34 @@ spec: description: SLI is the indicator (service level indicator) for this specific SLO. properties: + denominator_corrected: + description: DenominatorCorrected is the denominator corrected + events SLI type. + properties: + errorQuery: + description: ErrorQuery is a Prometheus query that will + get the number/count of events that we consider that + are bad for the SLO (e.g "http 5xx", "latency > 250ms"...). + Requires the usage of `{{.window}}` template variable. + ErrorQuery and SuccessQuery are mutually exclusive. + type: string + successQuery: + description: SuccessQuery is a Prometheus query that + will get the number/count of events that we consider + that are good for the SLO (e.g "http not 5xx", "latency + < 250ms"...). Requires the usage of `{{.window}}` + template variable. ErrorQuery and SuccessQuery are + mutually exclusive. + type: string + totalQuery: + description: TotalQuery is a Prometheus query that will + get the total number/count of events for the SLO (e.g + "all http requests"...). Requires the usage of `{{.window}}` + template variable. + type: string + required: + - totalQuery + type: object events: description: Events is the events SLI type. 
properties: diff --git a/test/integration/prometheus/generate_test.go b/test/integration/prometheus/generate_test.go index 0b2f28eb..1c0501d1 100644 --- a/test/integration/prometheus/generate_test.go +++ b/test/integration/prometheus/generate_test.go @@ -60,6 +60,11 @@ func TestPrometheusGenerate(t *testing.T) { expOut: expectLoader.mustLoadExp("./testdata/out-base-k8s.yaml.tpl"), }, + "Generate should generate the correct rules for all the corrected SLOs (Kubernetes).": { + genCmdArgs: "--input ./testdata/in-base-k8s-denom.yaml", + expOut: expectLoader.mustLoadExp("./testdata/out-base-k8s-denom.yaml.tpl"), + }, + "Generate without alerts should generate the correct recording rules for all the SLOs.": { genCmdArgs: "--input ./testdata/in-base.yaml --disable-alerts", expOut: expectLoader.mustLoadExp("./testdata/out-base-no-alerts.yaml.tpl"), diff --git a/test/integration/prometheus/testdata/in-base-k8s-denom.yaml b/test/integration/prometheus/testdata/in-base-k8s-denom.yaml new file mode 100644 index 00000000..328cac4f --- /dev/null +++ b/test/integration/prometheus/testdata/in-base-k8s-denom.yaml @@ -0,0 +1,45 @@ +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + name: svc + namespace: test-ns +spec: + service: "svc01" + labels: + global01k1: global01v1 + slos: + - name: "slo1" + objective: 99.9 + description: "This is SLO 01." + labels: + global02k1: global02v1 + sli: + denominator_corrected: + errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) + totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) + alerting: + name: myServiceAlert + labels: + alert01k1: "alert01v1" + annotations: + alert02k1: "alert02k2" + pageAlert: + labels: + alert03k1: "alert03v1" + ticketAlert: + labels: + alert04k1: "alert04v1" + - name: "slo02" + objective: 95 + description: "This is SLO 02." 
+ labels: + global03k1: global03v1 + sli: + denominator_corrected: + successQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}])) + totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) + alerting: + pageAlert: + disable: true + ticketAlert: + disable: true diff --git a/test/integration/prometheus/testdata/out-base-k8s-denom.yaml.tpl b/test/integration/prometheus/testdata/out-base-k8s-denom.yaml.tpl new file mode 100644 index 00000000..e4dae925 --- /dev/null +++ b/test/integration/prometheus/testdata/out-base-k8s-denom.yaml.tpl @@ -0,0 +1,589 @@ + +--- +# Code generated by Sloth ({{ .version }}): https://github.com/slok/sloth. +# DO NOT EDIT. + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: SLO + app.kubernetes.io/managed-by: sloth + name: svc + namespace: test-ns +spec: + groups: + - name: sloth-slo-sli-recordings-svc01-slo1 + rules: + - expr: | + ( + slo:numerator_correction:ratio5m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[5m])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[5m]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + ( + slo:numerator_correction:ratio30m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[30m])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[30m]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + ( + 
slo:numerator_correction:ratio1h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[1h])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[1h]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + ( + slo:numerator_correction:ratio2h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[2h])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[2h]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + ( + slo:numerator_correction:ratio6h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[6h])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[6h]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + ( + slo:numerator_correction:ratio1d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[1d])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[1d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + ( + slo:numerator_correction:ratio3d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + * on() + 
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[3d])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[3d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(sum(slo:sli_error:ratio_rate5m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"})[30d:]) + / + count_over_time(sum(slo:sli_error:ratio_rate5m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"})[30d:]) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-svc01-slo1 + rules: + - expr: vector(0.9990000000000001) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:objective:ratio + - expr: vector(1-0.9990000000000001) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:error_budget:ratio + - expr: vector(30) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + / on(sloth_id, sloth_slo, sloth_service) group_left + 
slo:error_budget:ratio{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="svc01-slo1", sloth_service="svc01", + sloth_slo="slo1"} + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_mode: cli-gen-k8s + sloth_objective: "99.9" + sloth_service: svc01 + sloth_slo: slo1 + sloth_spec: sloth.slok.dev/v1 + sloth_version: {{ .version }} + record: sloth_slo_info + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[5m])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio5m + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[30m])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio30m + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[1h])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio1h + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[2h])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: 
svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio2h + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[6h])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio6h + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[1d])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio1d + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[3d])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio3d + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[30d])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global02k1: global02v1 + sloth_id: svc01-slo1 + sloth_service: svc01 + sloth_slo: slo1 + record: slo:numerator_correction:ratio30d + - name: sloth-slo-alerts-svc01-slo1 + rules: + - alert: myServiceAlert + annotations: + alert02k1: alert02k2 + summary: '{{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget + burn rate is over expected.' + title: (page) {{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget + burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (14.4 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (6 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (6 * 0.0009999999999999432)) without (sloth_window) + ) + labels: + alert01k1: alert01v1 + alert03k1: alert03v1 + sloth_severity: page + - alert: myServiceAlert + annotations: + alert02k1: alert02k2 + summary: '{{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error budget + burn rate is over expected.' + title: (ticket) {{"{{$labels.sloth_service}}"}} {{"{{$labels.sloth_slo}}"}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (3 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (3 * 0.0009999999999999432)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (1 * 0.0009999999999999432)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="svc01-slo1", sloth_service="svc01", sloth_slo="slo1"} > (1 * 0.0009999999999999432)) without (sloth_window) + ) + labels: + alert01k1: alert01v1 + alert04k1: alert04v1 + sloth_severity: ticket + - name: sloth-slo-sli-recordings-svc01-slo02 + rules: + - expr: | + slo:numerator_correction:ratio5m{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[5m])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[5m]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + slo:numerator_correction:ratio30m{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[30m])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[30m]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + slo:numerator_correction:ratio1h{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[1h])) + ) + / + 
(sum(rate(http_request_duration_seconds_count{job="myservice"}[1h]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + slo:numerator_correction:ratio2h{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[2h])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[2h]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + slo:numerator_correction:ratio6h{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[6h])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[6h]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + slo:numerator_correction:ratio1d{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[1d])) + ) + / + (sum(rate(http_request_duration_seconds_count{job="myservice"}[1d]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + slo:numerator_correction:ratio3d{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + * on() (1 - + ( + sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[3d])) + ) + / + 
(sum(rate(http_request_duration_seconds_count{job="myservice"}[3d]))) + ) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(sum(slo:sli_error:ratio_rate5m{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"})[30d:]) + / + count_over_time(sum(slo:sli_error:ratio_rate5m{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"})[30d:]) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-svc01-slo02 + rules: + - expr: vector(0.95) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:objective:ratio + - expr: vector(1-0.95) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:error_budget:ratio + - expr: vector(30) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="svc01-slo02", sloth_service="svc01", sloth_slo="slo02"} + labels: + global01k1: 
global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="svc01-slo02", sloth_service="svc01", + sloth_slo="slo02"} + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_mode: cli-gen-k8s + sloth_objective: "95" + sloth_service: svc01 + sloth_slo: slo02 + sloth_spec: sloth.slok.dev/v1 + sloth_version: {{ .version }} + record: sloth_slo_info + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[5m])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio5m + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[30m])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio30m + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[1h])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio1h + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[2h])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio2h + - 
expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[6h])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio6h + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[1d])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio1d + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[3d])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio3d + - expr: (sum(rate(http_request_duration_seconds_count{job="myservice"}[30d])))/(sum(rate(http_request_duration_seconds_count{job="myservice"}[30d]))) + labels: + global01k1: global01v1 + global03k1: global03v1 + sloth_id: svc01-slo02 + sloth_service: svc01 + sloth_slo: slo02 + record: slo:numerator_correction:ratio30d