Skip to content

Commit

Permalink
Add denominator adjusted SLOs (#7)
Browse files Browse the repository at this point in the history
* Add denominator adjusted SLOs

Add recording rules for storing how the current number of events
compares to the average of the preceding week. This allows an SLO to
burn faster or slower depending on whether there are a lot or few events
in the denominator.

This will make sure that SLOs burn slower at night when there are few
events.
  • Loading branch information
Guðmundur Björn Birkisson authored Sep 5, 2023
1 parent f813004 commit b643653
Show file tree
Hide file tree
Showing 11 changed files with 941 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,34 @@ spec:
description: SLI is the indicator (service level indicator)
for this specific SLO.
properties:
denominator_corrected:
description: DenominatorCorrected is the denominator corrected
events SLI type.
properties:
errorQuery:
description: ErrorQuery is a Prometheus query that will
get the number/count of events that we consider that
are bad for the SLO (e.g "http 5xx", "latency > 250ms"...).
Requires the usage of `{{.window}}` template variable.
ErrorQuery and SuccessQuery are mutually exclusive.
type: string
successQuery:
description: SuccessQuery is a Prometheus query that
will get the number/count of events that we consider
that are good for the SLO (e.g "http not 5xx", "latency
< 250ms"...). Requires the usage of `{{.window}}`
template variable. ErrorQuery and SuccessQuery are
mutually exclusive.
type: string
totalQuery:
description: TotalQuery is a Prometheus query that will
get the total number/count of events for the SLO (e.g
"all http requests"...). Requires the usage of `{{.window}}`
template variable.
type: string
required:
- totalQuery
type: object
events:
description: Events is the events SLI type.
properties:
Expand Down
8 changes: 8 additions & 0 deletions internal/k8sprometheus/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,14 @@ func mapSpecToModel(ctx context.Context, defaultWindowPeriod time.Duration, plug
}
}

if specSLO.SLI.DenominatorCorrected != nil {
slo.SLI.DenominatorCorrected = &prometheus.SLIDenominatorCorrectedEvents{
ErrorQuery: specSLO.SLI.DenominatorCorrected.ErrorQuery,
SuccessQuery: specSLO.SLI.DenominatorCorrected.SuccessQuery,
TotalQuery: specSLO.SLI.DenominatorCorrected.TotalQuery,
}
}

if specSLO.SLI.Plugin != nil {
plugin, err := pluginsRepo.GetSLIPlugin(ctx, specSLO.SLI.Plugin.ID)
if err != nil {
Expand Down
28 changes: 26 additions & 2 deletions internal/prometheus/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ import (

// SLI represents an SLI with custom error and total expressions.
type SLI struct {
Raw *SLIRaw
Events *SLIEvents
Raw *SLIRaw
Events *SLIEvents
DenominatorCorrected *SLIDenominatorCorrectedEvents
}

type SLIRaw struct {
Expand All @@ -29,6 +30,12 @@ type SLIEvents struct {
TotalQuery string `validate:"required,prom_expr,template_vars"`
}

// SLIDenominatorCorrectedEvents is an events based SLI whose generated
// recording rule is scaled by the slo:numerator_correction:ratio metric.
//
// Struct-level validation requires exactly one of ErrorQuery and SuccessQuery
// to be set: setting both, or neither, is reported as a validation error.
type SLIDenominatorCorrectedEvents struct {
// ErrorQuery is the query for the bad events. Optional on its own, but
// mutually exclusive with SuccessQuery.
ErrorQuery *string `validate:"omitempty,prom_expr,template_vars"`
// SuccessQuery is the query for the good events. Optional on its own, but
// mutually exclusive with ErrorQuery.
SuccessQuery *string `validate:"omitempty,prom_expr,template_vars"`
// TotalQuery is the query for all the events. It is always required.
TotalQuery string `validate:"required,prom_expr,template_vars"`
}

// AlertMeta is the metadata of an alert settings.
type AlertMeta struct {
Disable bool
Expand Down Expand Up @@ -90,6 +97,7 @@ var modelSpecValidate = func() *validator.Validate {
v.RegisterStructValidation(validateOneSLI, SLI{})
v.RegisterStructValidation(validateSLOGroup, SLOGroup{})
v.RegisterStructValidation(validateSLIEvents, SLIEvents{})
v.RegisterStructValidation(validateDenominatorCorrected, SLIDenominatorCorrectedEvents{})
return v
}()

Expand Down Expand Up @@ -258,6 +266,22 @@ func validateOneSLI(sl validator.StructLevel) {
}
}

// validateDenominatorCorrected is the struct-level validation for
// SLIDenominatorCorrectedEvents.
//
// It reports "query_repeated" when both the error and the success queries are
// set, and "no_query_supplied" when neither of them is set. If the validated
// value is not a SLIDenominatorCorrectedEvents it reports
// "not_denominator_corrected" and stops.
func validateDenominatorCorrected(sl validator.StructLevel) {
	sli, isDenominatorCorrected := sl.Current().Interface().(SLIDenominatorCorrectedEvents)
	if !isDenominatorCorrected {
		sl.ReportError(sli, "", "SLIDenominatorCorrectedEvents", "not_denominator_corrected", "")
		return
	}

	hasErrorQuery := sli.ErrorQuery != nil
	hasSuccessQuery := sli.SuccessQuery != nil

	// The two failure modes are mutually exclusive, so at most one is reported.
	switch {
	case hasErrorQuery && hasSuccessQuery:
		sl.ReportError(sli, "", "", "query_repeated", "")
	case !hasErrorQuery && !hasSuccessQuery:
		sl.ReportError(sli, "", "", "no_query_supplied", "")
	}
}

// validateSLOGroup validates SLO IDs are not repeated.
func validateSLOGroup(sl validator.StructLevel) {
sloGroup, ok := sl.Current().Interface().(SLOGroup)
Expand Down
106 changes: 106 additions & 0 deletions internal/prometheus/recording_rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ func factorySLIRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBA
// Raw based SLI.
case slo.SLI.Raw != nil:
return rawSLIRecordGenerator(slo, window, alerts)
case slo.SLI.DenominatorCorrected != nil:
return denominatorCorrectedSLIRecordGenerator(slo, window, alerts)
}

return nil, fmt.Errorf("invalid SLI type")
Expand Down Expand Up @@ -141,6 +143,64 @@ func eventsSLIRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBAl
}, nil
}

// denominatorCorrectedSLIRecordGenerator builds the SLI error recording rule
// for a denominator corrected SLI on the given window.
//
// The expression multiplies the error ratio by the
// slo:numerator_correction:ratio<window> metric filtered by the SLO ID labels:
//   - With an error query: (correction * error) / total.
//   - With a success query: correction * (1 - success / total).
//
// The user queries are placed into the expression first and the result is then
// rendered as a template with the window and filter values. An error is
// returned when neither query is set, or when the template can't be parsed or
// rendered. The alerts argument is not used by this generator.
func denominatorCorrectedSLIRecordGenerator(slo SLO, window time.Duration, alerts alert.MWMBAlertGroup) (*rulefmt.Rule, error) {
	const errorExprFmt = `(
slo:numerator_correction:ratio{{.window}}{{.filter}}
* on()
%s
)
/
(%s)
`
	const successExprFmt = `slo:numerator_correction:ratio{{.window}}{{.filter}}
* on() (1 -
(
%s
)
/
(%s)
)
`

	sli := slo.SLI.DenominatorCorrected

	// The error query takes precedence over the success query.
	var exprTpl string
	switch {
	case sli.ErrorQuery != nil:
		exprTpl = fmt.Sprintf(errorExprFmt, *sli.ErrorQuery, sli.TotalQuery)
	case sli.SuccessQuery != nil:
		exprTpl = fmt.Sprintf(successExprFmt, *sli.SuccessQuery, sli.TotalQuery)
	default:
		return nil, fmt.Errorf("missing error or success query")
	}

	// Render with our templated data.
	parsed, err := template.New("sliExpr").Option("missingkey=error").Parse(exprTpl)
	if err != nil {
		return nil, fmt.Errorf("could not create SLI expression template data: %w", err)
	}

	promWindow := timeDurationToPromStr(window)
	tplData := map[string]string{
		tplKeyWindow: promWindow,
		"filter":     labelsToPromFilter(slo.GetSLOIDPromLabels()),
		"windowKey":  sloWindowLabelName,
	}

	var rendered bytes.Buffer
	if err := parsed.Execute(&rendered, tplData); err != nil {
		return nil, fmt.Errorf("could not render SLI expression template: %w", err)
	}

	rule := &rulefmt.Rule{
		Record: slo.GetSLIErrorMetric(window),
		Expr:   rendered.String(),
		Labels: mergeLabels(
			slo.GetSLOIDPromLabels(),
			map[string]string{sloWindowLabelName: promWindow},
			slo.Labels,
		),
	}

	return rule, nil
}

// optimizedSLIRecordGenerator gets a SLI recording rule from other SLI recording rules. This optimization
// will make Prometheus consume less CPU and memory, however the result will be less accurate. Used wisely
// is a good tradeoff. For example on calculating informative metrics like total period window (30d).
Expand Down Expand Up @@ -302,9 +362,55 @@ func (m metadataRecordingRulesGenerator) GenerateMetadataRecordingRules(ctx cont
},
}

if slo.SLI.DenominatorCorrected != nil {
windows := getAlertGroupWindows(alerts)
windows = append(windows, slo.TimeWindow) // Add the total time window as a handy helper.
for _, window := range windows {
rule, err := createNumeratorCorrection(slo, labels, window)
if err != nil {
return nil, fmt.Errorf("could not create numerator rule: %v", err)
}
rules = append(rules, *rule)
}
}

return rules, nil
}

// createNumeratorCorrection builds the recording rule named
// slo:numerator_correction:ratio<window> for a denominator corrected SLO.
//
// The rule expression is the SLI total query rendered for the given window
// divided by the same total query rendered for a 30 day window. The received
// labels are set on the rule as they are.
//
// An error is returned when the total query can't be parsed as a template or
// can't be rendered for any of the two windows.
func createNumeratorCorrection(slo SLO, labels map[string]string, window time.Duration) (*rulefmt.Rule, error) {
	promWindow := timeDurationToPromStr(window)
	recordName := "slo:numerator_correction:ratio" + promWindow

	totalTpl, err := template.New("sliExpr").Option("missingkey=error").Parse(slo.SLI.DenominatorCorrected.TotalQuery)
	if err != nil {
		return nil, fmt.Errorf("could not create %s expression template data: %w", recordName, err)
	}

	// renderTotal renders the total query using the received Prometheus window.
	renderTotal := func(w string) (string, error) {
		var out bytes.Buffer
		if err := totalTpl.Execute(&out, map[string]string{tplKeyWindow: w}); err != nil {
			return "", err
		}
		return out.String(), nil
	}

	numerator, err := renderTotal(promWindow)
	if err != nil {
		return nil, fmt.Errorf("could not create numerator for %s: %w", recordName, err)
	}

	// The denominator always uses a fixed 30 day window.
	denominator, err := renderTotal(timeDurationToPromStr(time.Hour * 24 * 30))
	if err != nil {
		return nil, fmt.Errorf("could not create denominator for %s: %w", recordName, err)
	}

	return &rulefmt.Rule{
		Record: recordName,
		Expr:   fmt.Sprintf(`(%s)/(%s)`, numerator, denominator),
		Labels: labels,
	}, nil
}

var burnRateRecordingExprTpl = template.Must(template.New("burnRateExpr").Option("missingkey=error").Parse(`{{ .SLIErrorMetric }}{{ .MetricFilter }}
/ on({{ .SLOIDName }}, {{ .SLOLabelName }}, {{ .SLOServiceName }}) group_left
{{ .ErrorBudgetRatioMetric }}{{ .MetricFilter }}
Expand Down
48 changes: 48 additions & 0 deletions pkg/kubernetes/api/sloth/v1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ import "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1"
- [type SLI](<#type-sli>)
- [func (in *SLI) DeepCopy() *SLI](<#func-sli-deepcopy>)
- [func (in *SLI) DeepCopyInto(out *SLI)](<#func-sli-deepcopyinto>)
- [type SLIDenominatorCorrected](<#type-slidenominatorcorrected>)
- [func (in *SLIDenominatorCorrected) DeepCopy() *SLIDenominatorCorrected](<#func-slidenominatorcorrected-deepcopy>)
- [func (in *SLIDenominatorCorrected) DeepCopyInto(out *SLIDenominatorCorrected)](<#func-slidenominatorcorrected-deepcopyinto>)
- [type SLIEvents](<#type-slievents>)
- [func (in *SLIEvents) DeepCopy() *SLIEvents](<#func-slievents-deepcopy>)
- [func (in *SLIEvents) DeepCopyInto(out *SLIEvents)](<#func-slievents-deepcopyinto>)
Expand Down Expand Up @@ -339,6 +342,10 @@ type SLI struct {
// +optional
Events *SLIEvents `json:"events,omitempty"`

// DenominatorCorrected is the denominator corrected events SLI type.
// +optional
DenominatorCorrected *SLIDenominatorCorrected `json:"denominator_corrected,omitempty"`

// Plugin is the pluggable SLI type.
// +optional
Plugin *SLIPlugin `json:"plugin,omitempty"`
Expand All @@ -361,6 +368,47 @@ func (in *SLI) DeepCopyInto(out *SLI)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non\-nil.

## type SLIDenominatorCorrected

SLIDenominatorCorrected is an SLI that is calculated as the division of bad events and total events, or 1 \- \(good / total\) events giving a ratio SLI. This SLI is corrected based on the total number of events for the last 30d, meaning that low\-event hours will have less impact on burn\-rate than high\-event hours. In other words, ratios with low denominators will have less impact.

```go
type SLIDenominatorCorrected struct {
// ErrorQuery is a Prometheus query that will get the number/count of events
// that we consider that are bad for the SLO (e.g "http 5xx", "latency > 250ms"...).
// Requires the usage of `{{.window}}` template variable. ErrorQuery and
// SuccessQuery are mutually exclusive.
ErrorQuery *string `json:"errorQuery,omitempty"`

// SuccessQuery is a Prometheus query that will get the number/count of events
// that we consider that are good for the SLO (e.g "http not 5xx", "latency < 250ms"...).
// Requires the usage of `{{.window}}` template variable. ErrorQuery and
// SuccessQuery are mutually exclusive.
SuccessQuery *string `json:"successQuery,omitempty"`

// TotalQuery is a Prometheus query that will get the total number/count of events
// for the SLO (e.g "all http requests"...).
// Requires the usage of `{{.window}}` template variable.
TotalQuery string `json:"totalQuery"`
}
```

### func \(\*SLIDenominatorCorrected\) DeepCopy

```go
func (in *SLIDenominatorCorrected) DeepCopy() *SLIDenominatorCorrected
```

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SLIDenominatorCorrected.

### func \(\*SLIDenominatorCorrected\) DeepCopyInto

```go
func (in *SLIDenominatorCorrected) DeepCopyInto(out *SLIDenominatorCorrected)
```

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non\-nil.

## type SLIEvents

SLIEvents is an SLI that is calculated as the division of bad events and total events, giving a ratio SLI. Normally this is the most common ratio type.
Expand Down
27 changes: 27 additions & 0 deletions pkg/kubernetes/api/sloth/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ type SLI struct {
// +optional
Events *SLIEvents `json:"events,omitempty"`

// DenominatorCorrected is the denominator corrected events SLI type.
// +optional
DenominatorCorrected *SLIDenominatorCorrected `json:"denominator_corrected,omitempty"`

// Plugin is the pluggable SLI type.
// +optional
Plugin *SLIPlugin `json:"plugin,omitempty"`
Expand All @@ -120,6 +124,29 @@ type SLIEvents struct {
TotalQuery string `json:"totalQuery"`
}

// SLIDenominatorCorrected is an SLI that is calculated as the division of bad events and total events, or
// 1 - (good / total) events, giving a ratio SLI. This SLI is corrected based on the total number of events
// for the last 30d, meaning that low-event hours will have less impact on burn-rate than high-event hours.
// In other words, ratios with low denominators will have less impact.
type SLIDenominatorCorrected struct {
// ErrorQuery is a Prometheus query that will get the number/count of events
// that we consider bad for the SLO (e.g "http 5xx", "latency > 250ms"...).
// Requires the usage of `{{.window}}` template variable. ErrorQuery and
// SuccessQuery are mutually exclusive.
ErrorQuery *string `json:"errorQuery,omitempty"`

// SuccessQuery is a Prometheus query that will get the number/count of events
// that we consider good for the SLO (e.g "http not 5xx", "latency < 250ms"...).
// Requires the usage of `{{.window}}` template variable. ErrorQuery and
// SuccessQuery are mutually exclusive.
SuccessQuery *string `json:"successQuery,omitempty"`

// TotalQuery is a Prometheus query that will get the total number/count of events
// for the SLO (e.g "all http requests"...).
// Requires the usage of `{{.window}}` template variable.
TotalQuery string `json:"totalQuery"`
}

// SLIPlugin will use the SLI returned by the SLI plugin selected along with the options.
type SLIPlugin struct {
// Name is the name of the plugin that needs to load.
Expand Down
31 changes: 31 additions & 0 deletions pkg/kubernetes/api/sloth/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b643653

Please sign in to comment.