Skip to content

Commit

Permalink
refactor recording rules and alerts code
Browse files Browse the repository at this point in the history
Signed-off-by: avlitman <[email protected]>
  • Loading branch information
avlitman committed Feb 5, 2024
1 parent 6e46f20 commit 07cca3e
Show file tree
Hide file tree
Showing 25 changed files with 811 additions and 349 deletions.
19 changes: 14 additions & 5 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
# SSP Operator metrics
This document aims to help users who are not familiar with the metrics exposed by the SSP Operator.
All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflect exactly what is being exposed.

## SSP Operator Metrics List
### kubevirt_ssp_common_templates_restored_increase
The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge.

### kubevirt_ssp_common_templates_restored_total
The total number of common templates restored by the operator back to their original state. Type: Counter.

### kubevirt_ssp_operator_reconcile_succeeded
Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge.

### kubevirt_ssp_operator_reconcile_succeeded_aggregated
The total number of ssp-operator pods reconciling with no errors. Type: Gauge.

### kubevirt_ssp_operator_up
The total number of running ssp-operator pods. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_increase
The increase in the number of rejected template validators, over the last hour. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_total
The total number of rejected template validators. Type: Counter.

### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.

### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
[ALPHA] VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.

## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.

All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones please regenerate
this document.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/blang/semver/v4 v4.0.0
github.com/fsnotify/fsnotify v1.7.0
github.com/go-logr/logr v1.4.1
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409
github.com/machadovilaca/operator-observability v0.0.12
github.com/onsi/ginkgo/v2 v2.15.0
github.com/onsi/gomega v1.31.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a h1:7YL/LNARjQWuXihwJ4b/nVzddGvoFRI7JqxAKISyJkg=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409 h1:w+MkYRwdxddjNwR7BbNMWP24wVli/G6zna86wfbhiAk=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo=
github.com/machadovilaca/operator-observability v0.0.12 h1:rd9iFmvWJiYS8LdW6siAiz8kLigcNLa1+dmCVb7dFxs=
github.com/machadovilaca/operator-observability v0.0.12/go.mod h1:NGkaR3HEYLScVQf6kQAyxWOSN1ltHcsEvHU/8iIJ8cE=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
Expand Down
8 changes: 7 additions & 1 deletion internal/operands/metrics/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/internal/operands"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Define RBAC rules needed by this operand:
Expand Down Expand Up @@ -96,7 +97,12 @@ func reconcileMonitoringRbacRoleBinding(request *common.Request) (common.Reconci
}

func reconcilePrometheusRule(request *common.Request) (common.ReconcileResult, error) {
prometheusRule, err := newPrometheusRule(request.Namespace)
err := rules.SetupRules()
if err != nil {
return common.ReconcileResult{}, err
}

prometheusRule, err := rules.BuildPrometheusRule(request.Namespace)
if err != nil {
return common.ReconcileResult{}, err
}
Expand Down
5 changes: 3 additions & 2 deletions internal/operands/metrics/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

ssp "kubevirt.io/ssp-operator/api/v1beta2"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

var log = logf.Log.WithName("metrics_operand")
Expand Down Expand Up @@ -67,7 +68,7 @@ var _ = Describe("Metrics operand", func() {
_, err := operand.Reconcile(&request)
Expect(err).ToNot(HaveOccurred())

prometheusRule, err := newPrometheusRule(namespace)
prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

ExpectResourceExists(prometheusRule, request)
Expand All @@ -82,7 +83,7 @@ var _ = Describe("Metrics operand", func() {
os.Setenv(runbookURLTemplateEnv, template)
}

prometheusRule, err := newPrometheusRule(namespace)
prometheusRule, err := rules.BuildPrometheusRule(namespace)

if strings.Count(template, "%s") != 1 || strings.Count(template, "%") != 1 {
Expect(err).To(HaveOccurred())
Expand Down
47 changes: 0 additions & 47 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
package metrics

import (
"errors"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

const (
Expand Down Expand Up @@ -96,44 +90,3 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
},
}
}

// newPrometheusRule assembles the PrometheusRule object for the given
// namespace. The single "cnv.rules" group contains the recording rules
// followed by the alert rules, the latter built with the runbook URL template.
//
// An error is returned when the runbook URL template cannot be obtained.
func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
	urlTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	// Recording rules first, alert rules appended after them.
	groupRules := append(rules.RecordRules(), rules.AlertRules(urlTemplate)...)

	meta := metav1.ObjectMeta{
		Name:      PrometheusRuleName,
		Namespace: namespace,
		Labels: map[string]string{
			"prometheus":       "k8s",
			"role":             "alert-rules",
			"kubevirt.io":      "prometheus-rules",
			PrometheusLabelKey: PrometheusLabelValue,
		},
	}

	spec := promv1.PrometheusRuleSpec{
		Groups: []promv1.RuleGroup{
			{
				Name:  "cnv.rules",
				Rules: groupRules,
			},
		},
	}

	return &promv1.PrometheusRule{ObjectMeta: meta, Spec: spec}, nil
}

func getRunbookURLTemplate() (string, error) {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 {
return "", errors.New("runbook URL template must have exactly 1 %s substring")
}

return runbookURLTemplate, nil
}
15 changes: 11 additions & 4 deletions pkg/monitoring/metrics/ssp-operator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,21 @@ import (
runtimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func SetupMetrics() {
func SetupMetrics() error {
operatormetrics.Register = runtimemetrics.Registry.Register

if err := operatormetrics.RegisterMetrics(
err := operatormetrics.RegisterMetrics(
operatorMetrics,
rbdMetrics,
templateMetrics,
); err != nil {
panic(err)
)
if err != nil {
return err
}
return nil
}

// ListMetrics returns the metrics reported by operatormetrics.ListMetrics.
func ListMetrics() []operatormetrics.Metric {
	registered := operatormetrics.ListMetrics()
	return registered
}
15 changes: 11 additions & 4 deletions pkg/monitoring/metrics/template-validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@ import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
)

func SetupMetrics() {
if err := operatormetrics.RegisterMetrics(
func SetupMetrics() error {
err := operatormetrics.RegisterMetrics(
templateMetrics,
); err != nil {
panic(err)
)
if err != nil {
return err
}
return nil
}

// ListMetrics returns the metrics reported by operatormetrics.ListMetrics.
func ListMetrics() []operatormetrics.Metric {
	registered := operatormetrics.ListMetrics()
	return registered
}
89 changes: 89 additions & 0 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// Label keys set on every alert rule defined in this file.
const (
// severityAlertLabelKey is the label key carrying the alert severity
// ("critical" or "warning" for the alerts in this file).
severityAlertLabelKey = "severity"
// healthImpactAlertLabelKey is the label key carrying the alert's operator
// health impact ("critical", "warning" or "none" for the alerts in this file).
healthImpactAlertLabelKey = "operator_health_impact"
)

// operatorAlerts returns the SSP operator alert rules. Every rule carries a
// severity label and an operator health impact label; all but
// VirtualMachineCRCErrors also set a "for" duration.
func operatorAlerts() []promv1.Rule {
	// labels builds the label set shared by all alerts in this file.
	labels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
		}
	}

	// summary builds an annotation set holding only a summary text.
	summary := func(text string) map[string]string {
		return map[string]string{"summary": text}
	}

	// pendingFor returns a fresh pointer for each rule's "for" duration.
	pendingFor := func(d promv1.Duration) *promv1.Duration {
		return ptr.To(d)
	}

	sspDown := promv1.Rule{
		Alert:       "SSPDown",
		Expr:        intstr.FromString("kubevirt_ssp_operator_up == 0"),
		For:         pendingFor("5m"),
		Annotations: summary("All SSP operator pods are down."),
		Labels:      labels("critical", "critical"),
	}

	templateValidatorDown := promv1.Rule{
		Alert:       "SSPTemplateValidatorDown",
		Expr:        intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
		For:         pendingFor("5m"),
		Annotations: summary("All Template Validator pods are down."),
		Labels:      labels("critical", "critical"),
	}

	failingToReconcile := promv1.Rule{
		Alert:       "SSPFailingToReconcile",
		Expr:        intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
		For:         pendingFor("5m"),
		Annotations: summary("The ssp-operator pod is up but failing to reconcile."),
		Labels:      labels("critical", "critical"),
	}

	highRateRejectedVms := promv1.Rule{
		Alert:       "SSPHighRateRejectedVms",
		Expr:        intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
		For:         pendingFor("5m"),
		Annotations: summary("High rate of rejected Vms."),
		Labels:      labels("warning", "warning"),
	}

	templatesModificationReverted := promv1.Rule{
		Alert:       "SSPCommonTemplatesModificationReverted",
		Expr:        intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
		For:         pendingFor("0m"),
		Annotations: summary("Common Templates manual modifications were reverted by the operator."),
		Labels:      labels("warning", "none"),
	}

	// This rule has no "for" duration and carries a description in addition to
	// the summary.
	vmCRCErrors := promv1.Rule{
		Alert: "VirtualMachineCRCErrors",
		Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
		Annotations: map[string]string{
			"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages.",
			"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
		},
		Labels: labels("warning", "none"),
	}

	return []promv1.Rule{
		sspDown,
		templateValidatorDown,
		failingToReconcile,
		highRateRejectedVms,
		templatesModificationReverted,
		vmCRCErrors,
	}
}
51 changes: 51 additions & 0 deletions pkg/monitoring/rules/alerts/prometheus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package alerts

import (
"errors"
"fmt"
"os"
"strings"

"github.com/machadovilaca/operator-observability/pkg/operatorrules"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)

// Keys and values that Register adds to every alert, plus the configuration
// of the runbook URL template.
const (
// prometheusRunbookAnnotationKey is the annotation key under which the
// formatted runbook URL is stored on each alert.
prometheusRunbookAnnotationKey = "runbook_url"
// partOfAlertLabelKey / partOfAlertLabelValue are the "part of" label set on
// each alert.
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
// componentAlertLabelKey / componentAlertLabelValue are the component label
// set on each alert.
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = "ssp-operator"
// defaultRunbookURLTemplate is used when the environment variable named by
// runbookURLTemplateEnv is not set; its "%s" is filled with the alert name.
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
// runbookURLTemplateEnv is the name of the environment variable that
// overrides the runbook URL template.
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
)

// Register collects all alert groups, adds the common part-of and component
// labels plus a runbook_url annotation (the runbook URL template formatted
// with the alert name) to every alert, and registers the groups through
// operatorrules.RegisterAlerts.
//
// It returns an error when the runbook URL template is invalid or when
// operatorrules.RegisterAlerts fails.
func Register() error {
	alerts := [][]promv1.Rule{
		operatorAlerts(),
	}

	runbookURLTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return err
	}

	for _, alertGroup := range alerts {
		// Iterate by index: ranging by value copies each Rule, so a map
		// allocated below for a nil Labels/Annotations field would be lost.
		for i := range alertGroup {
			alert := &alertGroup[i]

			// Writing to a nil map panics; tolerate alerts that omit a field.
			if alert.Labels == nil {
				alert.Labels = make(map[string]string, 2)
			}
			alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
			alert.Labels[componentAlertLabelKey] = componentAlertLabelValue

			if alert.Annotations == nil {
				alert.Annotations = make(map[string]string, 1)
			}
			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
		}
	}

	return operatorrules.RegisterAlerts(alerts...)
}

// getRunbookURLTemplate returns the runbook URL template: the value of the
// environment variable named by runbookURLTemplateEnv when it is set, and
// defaultRunbookURLTemplate otherwise.
//
// The template is later passed to fmt.Sprintf with a single string argument,
// so it must contain exactly one "%s" and no other "%" character. Anything
// else (for example "%%s" or "%s/%d") would produce a malformed URL and is
// reported as an error instead of a panic, so that Register can return it.
func getRunbookURLTemplate() (string, error) {
	runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
	if !exists {
		runbookURLTemplate = defaultRunbookURLTemplate
	}

	if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 {
		return "", errors.New("runbook URL template must have exactly 1 %s substring")
	}

	return runbookURLTemplate, nil
}
Loading

0 comments on commit 07cca3e

Please sign in to comment.