Skip to content

Commit

Permalink
refactor recording rules and alerts code
Browse files Browse the repository at this point in the history
Following the work started in kubevirt/kubevirt#10044, and according to
the kubevirt/community#219 proposal, this PR refactors the monitoring
recording rules and alerts code.

Signed-off-by: avlitman <[email protected]>
  • Loading branch information
avlitman committed Feb 13, 2024
1 parent 6e78638 commit 67be6bf
Show file tree
Hide file tree
Showing 28 changed files with 853 additions and 383 deletions.
19 changes: 14 additions & 5 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
# SSP Operator metrics
This document aims to help users that are not familiar with metrics exposed by the SSP Operator.
All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflect exactly what is being exposed.

## SSP Operator Metrics List
### kubevirt_ssp_common_templates_restored_increase
The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge.

### kubevirt_ssp_common_templates_restored_total
The total number of common templates restored by the operator back to their original state. Type: Counter.

### kubevirt_ssp_operator_reconcile_succeeded
Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge.

### kubevirt_ssp_operator_reconcile_succeeded_aggregated
The total number of ssp-operator pods reconciling with no errors. Type: Gauge.

### kubevirt_ssp_operator_up
The total number of running ssp-operator pods. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_increase
The increase in the number of rejected template validators, over the last hour. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_total
The total number of rejected template validators. Type: Counter.

### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.

### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
[ALPHA] VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.

## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.

All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones, please regenerate
this document.
7 changes: 6 additions & 1 deletion internal/operands/metrics/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/internal/operands"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Define RBAC rules needed by this operand:
Expand Down Expand Up @@ -96,7 +97,11 @@ func reconcileMonitoringRbacRoleBinding(request *common.Request) (common.Reconci
}

func reconcilePrometheusRule(request *common.Request) (common.ReconcileResult, error) {
prometheusRule, err := newPrometheusRule(request.Namespace)
if err := rules.SetupRules(); err != nil {
return common.ReconcileResult{}, err
}

prometheusRule, err := rules.BuildPrometheusRule(request.Namespace)
if err != nil {
return common.ReconcileResult{}, err
}
Expand Down
11 changes: 9 additions & 2 deletions internal/operands/metrics/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

ssp "kubevirt.io/ssp-operator/api/v1beta2"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

var log = logf.Log.WithName("metrics_operand")
Expand All @@ -34,6 +35,9 @@ var _ = Describe("Metrics operand", func() {
)

BeforeEach(func() {
err := rules.SetupRules()
Expect(err).To(Succeed())

client := fake.NewClientBuilder().WithScheme(common.Scheme).Build()
request = common.Request{
Request: reconcile.Request{
Expand Down Expand Up @@ -67,7 +71,7 @@ var _ = Describe("Metrics operand", func() {
_, err := operand.Reconcile(&request)
Expect(err).ToNot(HaveOccurred())

prometheusRule, err := newPrometheusRule(namespace)
prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

ExpectResourceExists(prometheusRule, request)
Expand All @@ -82,7 +86,7 @@ var _ = Describe("Metrics operand", func() {
os.Setenv(runbookURLTemplateEnv, template)
}

prometheusRule, err := newPrometheusRule(namespace)
err := rules.SetupRules()

if strings.Count(template, "%s") != 1 || strings.Count(template, "%") != 1 {
Expect(err).To(HaveOccurred())
Expand All @@ -91,6 +95,9 @@ var _ = Describe("Metrics operand", func() {

Expect(err).ToNot(HaveOccurred())

prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

for _, group := range prometheusRule.Spec.Groups {
for _, rule := range group.Rules {
if rule.Alert != "" {
Expand Down
48 changes: 1 addition & 47 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
package metrics

import (
"errors"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -13,7 +9,6 @@ import (
)

const (
PrometheusRuleName = "prometheus-k8s-rules-cnv"
MonitorNamespace = "openshift-monitoring"
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
Expand Down Expand Up @@ -69,7 +64,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
return &promv1.ServiceMonitor{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
Name: PrometheusRuleName,
Name: rules.RuleName,
Labels: ServiceMonitorLabels(),
},
Spec: promv1.ServiceMonitorSpec{
Expand All @@ -96,44 +91,3 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
},
}
}

// newPrometheusRule assembles the PrometheusRule object for the given
// namespace. It holds a single rule group, "cnv.rules", containing the
// recording rules followed by the alert rules. The alert rules are built with
// the runbook URL template returned by getRunbookURLTemplate; if that template
// is invalid, the error is returned and no object is built.
func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
	urlTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	meta := metav1.ObjectMeta{
		Name:      PrometheusRuleName,
		Namespace: namespace,
		Labels: map[string]string{
			"prometheus":       "k8s",
			"role":             "alert-rules",
			"kubevirt.io":      "prometheus-rules",
			PrometheusLabelKey: PrometheusLabelValue,
		},
	}

	// Recording rules come first, alert rules are appended after them.
	allRules := append(rules.RecordRules(), rules.AlertRules(urlTemplate)...)

	group := promv1.RuleGroup{
		Name:  "cnv.rules",
		Rules: allRules,
	}

	prometheusRule := &promv1.PrometheusRule{
		ObjectMeta: meta,
		Spec: promv1.PrometheusRuleSpec{
			Groups: []promv1.RuleGroup{group},
		},
	}
	return prometheusRule, nil
}

func getRunbookURLTemplate() (string, error) {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 {
return "", errors.New("runbook URL template must have exactly 1 %s substring")
}

return runbookURLTemplate, nil
}
6 changes: 5 additions & 1 deletion internal/template-validator/validator/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,11 @@ func (app *App) Run() {
registerReadinessProbe()

// setup monitoring
validatorMetrics.SetupMetrics()
err = validatorMetrics.SetupMetrics()
if err != nil {
logger.Log.Error(err, "Error setting up metrics")
panic(err)
}

logger.Log.Info("TLS certs directory", "directory", app.TLSInfo.CertsDirectory)

Expand Down
11 changes: 7 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,18 @@ func (s *prometheusServer) getPrometheusTLSConfig(ctx context.Context, certWatch
}
}

func newPrometheusServer(metricsAddr string, cache cache.Cache) *prometheusServer {
sspMetrics.SetupMetrics()
func newPrometheusServer(metricsAddr string, cache cache.Cache) (*prometheusServer, error) {
err := sspMetrics.SetupMetrics()
if err != nil {
return nil, err
}

return &prometheusServer{
certPath: path.Join(sdkTLSDir, sdkTLSCrt),
keyPath: path.Join(sdkTLSDir, sdkTLSKey),
cache: cache,
serverAddress: metricsAddr,
}
}, nil
}

func main() {
Expand Down Expand Up @@ -249,7 +252,7 @@ func main() {
}
}

metricsServer := newPrometheusServer(metricsAddr, mgr.GetCache())
metricsServer, err := newPrometheusServer(metricsAddr, mgr.GetCache())
if err != nil {
setupLog.Error(err, "unable create Prometheus server")
os.Exit(1)
Expand Down
13 changes: 8 additions & 5 deletions pkg/monitoring/metrics/ssp-operator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ import (
runtimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func SetupMetrics() {
func SetupMetrics() error {
operatormetrics.Register = runtimemetrics.Registry.Register

if err := operatormetrics.RegisterMetrics(
return operatormetrics.RegisterMetrics(
operatorMetrics,
rbdMetrics,
templateMetrics,
); err != nil {
panic(err)
}
)
}

// ListMetrics returns the list of registered Prometheus metrics, as reported
// by operatormetrics.ListMetrics.
func ListMetrics() []operatormetrics.Metric {
return operatormetrics.ListMetrics()
}
13 changes: 8 additions & 5 deletions pkg/monitoring/metrics/template-validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
)

func SetupMetrics() {
if err := operatormetrics.RegisterMetrics(
func SetupMetrics() error {
return operatormetrics.RegisterMetrics(
templateMetrics,
); err != nil {
panic(err)
}
)
}

// ListMetrics returns the list of registered Prometheus metrics, as reported
// by operatormetrics.ListMetrics.
func ListMetrics() []operatormetrics.Metric {
return operatormetrics.ListMetrics()
}
89 changes: 89 additions & 0 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// Label keys attached to every alert rule defined in this file.
const (
// severityAlertLabelKey is the label key carrying the alert severity
// ("critical" or "warning" in the alerts below).
severityAlertLabelKey = "severity"
// healthImpactAlertLabelKey is the label key carrying the alert's impact on
// operator health ("critical", "warning" or "none" in the alerts below).
healthImpactAlertLabelKey = "operator_health_impact"
)

// operatorAlerts returns the alert rules defined for the SSP operator and the
// template validator: pod availability, reconcile failures, a high rate of
// template validator rejections, reverted common template modifications, and
// VMs using RBD block volumes without the rxbounce option.
//
// Every rule carries a "summary" annotation plus the severity and
// operator-health-impact labels.
func operatorAlerts() []promv1.Rule {
	// labels builds the two-label set shared by all alerts.
	labels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
		}
	}

	// pendingAlert builds an alert that has a "for" duration and only a
	// summary annotation. A fresh Duration pointer is allocated per rule.
	pendingAlert := func(name, expr, pending, summary string, lbls map[string]string) promv1.Rule {
		return promv1.Rule{
			Alert:       name,
			Expr:        intstr.FromString(expr),
			For:         ptr.To(promv1.Duration(pending)),
			Annotations: map[string]string{"summary": summary},
			Labels:      lbls,
		}
	}

	sspDown := pendingAlert(
		"SSPDown",
		"kubevirt_ssp_operator_up == 0",
		"5m",
		"All SSP operator pods are down.",
		labels("critical", "critical"),
	)

	validatorDown := pendingAlert(
		"SSPTemplateValidatorDown",
		"kubevirt_ssp_template_validator_up == 0",
		"5m",
		"All Template Validator pods are down.",
		labels("critical", "critical"),
	)

	failingToReconcile := pendingAlert(
		"SSPFailingToReconcile",
		"(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)",
		"5m",
		"The ssp-operator pod is up but failing to reconcile.",
		labels("critical", "critical"),
	)

	highRateRejectedVms := pendingAlert(
		"SSPHighRateRejectedVms",
		"kubevirt_ssp_template_validator_rejected_increase > 5",
		"5m",
		"High rate of rejected Vms.",
		labels("warning", "warning"),
	)

	templatesReverted := pendingAlert(
		"SSPCommonTemplatesModificationReverted",
		"kubevirt_ssp_common_templates_restored_increase > 0",
		"0m",
		"Common Templates manual modifications were reverted by the operator.",
		labels("warning", "none"),
	)

	// This alert has no "for" duration and carries an extra description
	// annotation, so it is spelled out in full.
	storageClassWarning := promv1.Rule{
		Alert: "VMStorageClassWarning",
		Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
		Annotations: map[string]string{
			"summary":     "{{ $value }} Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns.",
			"description": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', VMs may cause reports of bad crc/signature errors due to certain I/O patterns. Cluster performance can be severely degraded if the number of re-transmissions due to crc errors causes network saturation.",
		},
		Labels: labels("warning", "none"),
	}

	return []promv1.Rule{
		sspDown,
		validatorDown,
		failingToReconcile,
		highRateRejectedVms,
		templatesReverted,
		storageClassWarning,
	}
}
Loading

0 comments on commit 67be6bf

Please sign in to comment.