From aa9a6fb2f155f6bcffe965051352804adf685950 Mon Sep 17 00:00:00 2001 From: Andrej Krejcir Date: Thu, 18 Jan 2024 16:11:08 +0100 Subject: [PATCH 1/4] feat: Move metrics rules and alerts to separate package Moved the definitions of alerts and rules to pkg/monitoring/rules package. Signed-off-by: Andrej Krejcir --- internal/operands/metrics/resources.go | 176 +----------------- pkg/monitoring/rules/rules.go | 174 +++++++++++++++++ tests/monitoring_test.go | 5 +- tools/metricsdocs/metricsdocs.go | 8 +- .../metrics_collector.go | 6 +- 5 files changed, 188 insertions(+), 181 deletions(-) create mode 100644 pkg/monitoring/rules/rules.go diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index 4b22d62cb..6fb8c1555 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -2,15 +2,14 @@ package metrics import ( "errors" - "fmt" "os" "strings" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" rbac "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/utils/ptr" + + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) const ( @@ -18,12 +17,6 @@ const ( MonitorNamespace = "openshift-monitoring" defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" - severityAlertLabelKey = "severity" - healthImpactAlertLabelKey = "operator_health_impact" - partOfAlertLabelKey = "kubernetes_operator_part_of" - partOfAlertLabelValue = "kubevirt" - componentAlertLabelKey = "kubernetes_operator_component" - componentAlertLabelValue = "ssp-operator" PrometheusLabelKey = "prometheus.ssp.kubevirt.io" PrometheusLabelValue = "true" PrometheusClusterRoleName = "prometheus-k8s-ssp" @@ -31,167 +24,6 @@ const ( MetricsPortName = "metrics" ) -const ( - CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" - TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" -) - -// RecordRulesDesc represent SSP Operator Prometheus Record Rules -type RecordRulesDesc struct { - Name string - Expr intstr.IntOrString - Description string - Type string -} - -// RecordRulesDescList lists all SSP Operator Prometheus Record Rules -var RecordRulesDescList = []RecordRulesDesc{ - { - Name: "kubevirt_ssp_operator_up", - Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), - Description: "The total number of running ssp-operator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_up", - Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), - Description: "The total number of running virt-template-validator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", - Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), - Description: "The total number of ssp-operator pods reconciling with no errors", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_rejected_increase", - Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of rejected template validators, over the last hour", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_common_templates_restored_increase", - Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", - Type: "Gauge", - }, -} - -func getAlertRules() ([]promv1.Rule, error) { - runbookURLTemplate, err := getRunbookURLTemplate() - if err != nil { - return nil, err - } - - return []promv1.Rule{ - { - Expr: intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"), - Record: "cnv:vmi_status_running:count", - }, - { - Alert: "SSPDown", - Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All SSP operator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPTemplateValidatorDown", - Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All Template Validator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPTemplateValidatorDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPFailingToReconcile", - Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "The ssp-operator pod is up but failing to reconcile", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPFailingToReconcile"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPHighRateRejectedVms", - Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "High rate of rejected Vms", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPHighRateRejectedVms"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "warning", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPCommonTemplatesModificationReverted", - Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), - For: ptr.To[promv1.Duration]("0m"), - Annotations: map[string]string{ - "summary": "Common Templates manual modifications were reverted by the operator", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPCommonTemplatesModificationReverted"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "VirtualMachineCRCErrors", - Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), - Annotations: map[string]string{ - "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages", - "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - }, nil -} - -func getRecordRules() []promv1.Rule { - var recordRules []promv1.Rule - - for _, rrd := range RecordRulesDescList { - recordRules = append(recordRules, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) - } - - return recordRules -} - func newMonitoringClusterRole() *rbac.ClusterRole { return &rbac.ClusterRole{ ObjectMeta: metav1.ObjectMeta{ @@ -266,7 +98,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor { } func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { - alertRules, err := getAlertRules() + runbookURLTemplate, err := getRunbookURLTemplate() if err != nil { return nil, err } @@ -286,7 +118,7 @@ func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { Groups: []promv1.RuleGroup{ { Name: "cnv.rules", - Rules: append(alertRules, getRecordRules()...), + Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...), }, }, }, diff --git a/pkg/monitoring/rules/rules.go b/pkg/monitoring/rules/rules.go new file mode 100644 index 000000000..8622931b7 --- /dev/null +++ b/pkg/monitoring/rules/rules.go @@ -0,0 +1,174 @@ +package rules + +import ( + "fmt" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" +) + +const ( + severityAlertLabelKey = "severity" + healthImpactAlertLabelKey = "operator_health_impact" + partOfAlertLabelKey = "kubernetes_operator_part_of" + partOfAlertLabelValue = "kubevirt" + componentAlertLabelKey = "kubernetes_operator_component" + componentAlertLabelValue = "ssp-operator" +) + +const ( + CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" + TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" +) + +// RecordRulesDesc represent SSP Operator Prometheus Record Rules +type RecordRulesDesc struct { + Name string + Expr intstr.IntOrString + Description string + Type string +} + +// RecordRulesDescList lists all SSP Operator Prometheus Record Rules +var RecordRulesDescList = []RecordRulesDesc{ + { + Name: "kubevirt_ssp_operator_up", + Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), + Description: "The total number of running ssp-operator pods", + Type: "Gauge", + }, + { + Name: "kubevirt_ssp_template_validator_up", + Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), + Description: "The total number of running virt-template-validator pods", + Type: "Gauge", + }, + { + Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", + Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), + Description: "The total number of ssp-operator pods reconciling with no errors", + Type: "Gauge", + }, + { + Name: "kubevirt_ssp_template_validator_rejected_increase", + Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), + Description: "The increase in the number of rejected template validators, over the last hour", + Type: "Gauge", + }, + { + Name: "kubevirt_ssp_common_templates_restored_increase", + Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), + Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", + Type: "Gauge", + }, +} + +func RecordRules() []promv1.Rule { + var recordRules []promv1.Rule + + for _, rrd := range RecordRulesDescList { + recordRules = append(recordRules, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) + } + + return recordRules +} + +func AlertRules(runbookURLTemplate string) []promv1.Rule { + return []promv1.Rule{ + { + Expr: intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"), + Record: "cnv:vmi_status_running:count", + }, + { + Alert: "SSPDown", + Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "All SSP operator pods are down.", + "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPDown"), + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + partOfAlertLabelKey: partOfAlertLabelValue, + componentAlertLabelKey: componentAlertLabelValue, + }, + }, + { + Alert: "SSPTemplateValidatorDown", + Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "All Template Validator pods are down.", + "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPTemplateValidatorDown"), + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + partOfAlertLabelKey: partOfAlertLabelValue, + componentAlertLabelKey: componentAlertLabelValue, + }, + }, + { + Alert: "SSPFailingToReconcile", + Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "The ssp-operator pod is up but failing to reconcile", + "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPFailingToReconcile"), + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + partOfAlertLabelKey: partOfAlertLabelValue, + componentAlertLabelKey: componentAlertLabelValue, + }, + }, + { + Alert: "SSPHighRateRejectedVms", + Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "High rate of rejected Vms", + "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPHighRateRejectedVms"), + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "warning", + partOfAlertLabelKey: partOfAlertLabelValue, + componentAlertLabelKey: componentAlertLabelValue, + }, + }, + { + Alert: "SSPCommonTemplatesModificationReverted", + Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), + For: ptr.To[promv1.Duration]("0m"), + Annotations: map[string]string{ + "summary": "Common Templates manual modifications were reverted by the operator", + "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPCommonTemplatesModificationReverted"), + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + partOfAlertLabelKey: partOfAlertLabelValue, + componentAlertLabelKey: componentAlertLabelValue, + }, + }, + { + Alert: "VirtualMachineCRCErrors", + Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), + Annotations: map[string]string{ + "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages", + "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", + "runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"), + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + partOfAlertLabelKey: partOfAlertLabelValue, + componentAlertLabelKey: componentAlertLabelValue, + }, + }, + } +} diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index ad834e169..538f08ff9 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -31,6 +31,7 @@ import ( ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/operands/metrics" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" "kubevirt.io/ssp-operator/tests/env" ) @@ -49,7 +50,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8363] Should fire SSPCommonTemplatesModificationReverted", func() { // we have to wait for prometheus to pick up the series before we increase it. - waitForSeriesToBeDetected(metrics.CommonTemplatesRestoredIncreaseQuery) + waitForSeriesToBeDetected(rules.CommonTemplatesRestoredIncreaseQuery) expectTemplateUpdateToIncreaseTotalRestoredTemplatesCount(testTemplate) waitForAlertToActivate("SSPCommonTemplatesModificationReverted") }) @@ -114,7 +115,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8377] Should fire SSPHighRateRejectedVms", func() { - waitForSeriesToBeDetected(metrics.TemplateValidatorRejectedIncreaseQuery) + waitForSeriesToBeDetected(rules.TemplateValidatorRejectedIncreaseQuery) Expect(apiClient.Create(ctx, template)).ToNot(HaveOccurred(), "Failed to create template: %s", template.Name) for range [6]int{} { time.Sleep(time.Second * 5) diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 76a3bd4fa..cf3735afc 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -7,9 +7,9 @@ import ( "github.com/machadovilaca/operator-observability/pkg/operatormetrics" - "kubevirt.io/ssp-operator/internal/operands/metrics" sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" validatorMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/template-validator" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) const ( @@ -30,7 +30,7 @@ const ( ) func main() { - metricsList := recordRulesDescToMetricList(metrics.RecordRulesDescList) + metricsList := recordRulesDescToMetricList(rules.RecordRulesDescList) sspMetrics.SetupMetrics() validatorMetrics.SetupMetrics() @@ -59,7 +59,7 @@ type metric struct { mtype string } -func recordRulesDescToMetricList(mdl []metrics.RecordRulesDesc) metricList { +func recordRulesDescToMetricList(mdl []rules.RecordRulesDesc) metricList { res := make([]metric, len(mdl)) for i, md := range mdl { res[i] = metricDescriptionToMetric(md) @@ -68,7 +68,7 @@ func recordRulesDescToMetricList(mdl []metrics.RecordRulesDesc) metricList { return res } -func metricDescriptionToMetric(rrd metrics.RecordRulesDesc) metric { +func metricDescriptionToMetric(rrd rules.RecordRulesDesc) metric { return metric{ name: rrd.Name, description: rrd.Description, diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go index 522246750..b4e3c38e3 100644 --- a/tools/prom-metrics-collector/metrics_collector.go +++ b/tools/prom-metrics-collector/metrics_collector.go @@ -2,9 +2,9 @@ package main import ( parser "github.com/kubevirt/monitoring/pkg/metrics/parser" - "kubevirt.io/ssp-operator/internal/operands/metrics" - dto "github.com/prometheus/client_model/go" + + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) // This should be used only for very rare cases where the naming conventions that are explained in the best practices: @@ -14,7 +14,7 @@ var excludedMetrics = map[string]struct{}{} func readMetrics() []*dto.MetricFamily { var metricFamilies []*dto.MetricFamily - sspMetrics := metrics.RecordRulesDescList + sspMetrics := rules.RecordRulesDescList for _, metric := range sspMetrics { if _, isExcludedMetric := excludedMetrics[metric.Name]; !isExcludedMetric { From 92d87809fae5763796b7122a71057578fb8177e5 Mon Sep 17 00:00:00 2001 From: Andrej Krejcir Date: Fri, 19 Jan 2024 12:46:28 +0100 Subject: [PATCH 2/4] feat: run unit tests for metrics rules Added metrics unit tests file and a Makefile target to run the tests. Signed-off-by: Andrej Krejcir --- Makefile | 21 ++++++++++++- hack/metrics-rules-test.sh | 32 ++++++++++++++++++++ pkg/monitoring/rules/rules-tests.yaml | 26 ++++++++++++++++ tools/test-rules-writer/test_rules_writer.go | 31 +++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) create mode 100755 hack/metrics-rules-test.sh create mode 100644 pkg/monitoring/rules/rules-tests.yaml create mode 100644 tools/test-rules-writer/test_rules_writer.go diff --git a/Makefile b/Makefile index 2f3719a9d..0f0d440cc 100644 --- a/Makefile +++ b/Makefile @@ -84,7 +84,7 @@ endif all: manager .PHONY: unittest -unittest: generate lint fmt vet manifests +unittest: generate lint fmt vet manifests metrics-rules-test go test -v -coverprofile cover.out $(SRC_PATHS_TESTS) cd api && go test -v ./... @@ -325,3 +325,22 @@ lint: .PHONY: lint-metrics lint-metrics: ./hack/prom_metric_linter.sh --operator-name="kubevirt" --sub-operator-name="ssp" + +PROMTOOL ?= $(LOCALBIN)/promtool +PROMTOOL_VERSION ?= 2.44.0 + +.PHONY: promtool +promtool: $(PROMTOOL) +$(PROMTOOL): $(LOCALBIN) + test -s $(PROMTOOL) || curl -sSfL "https://github.com/prometheus/prometheus/releases/download/v$(PROMTOOL_VERSION)/prometheus-$(PROMTOOL_VERSION).linux-amd64.tar.gz" | \ + tar xvzf - --directory=$(LOCALBIN) "prometheus-$(PROMTOOL_VERSION).linux-amd64"/promtool --strip-components=1 + +METRIC_RULES_WRITER ?= $(LOCALBIN)/metrics-rules-writer + +.PHONY: build-metric-rules-writer +build-metric-rules-writer: $(LOCALBIN) + go build -o $(METRIC_RULES_WRITER) tools/test-rules-writer/test_rules_writer.go + +.PHONY: metrics-rules-test +metrics-rules-test: build-metric-rules-writer promtool + ./hack/metrics-rules-test.sh $(METRIC_RULES_WRITER) "./pkg/monitoring/rules/rules-tests.yaml" diff --git a/hack/metrics-rules-test.sh b/hack/metrics-rules-test.sh new file mode 100755 index 000000000..04a59aa86 --- /dev/null +++ b/hack/metrics-rules-test.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +readonly PROMTOOL="$(dirname "$0")/../bin/promtool" + +function cleanup() { + local cleanup_dir="${1:?}" + rm -rf "${cleanup_dir}" +} + +function main() { + local prom_spec_dumper="${1:?}" + local tests_file="${2:?}" + local temp_dir + + temp_dir="$(mktemp --tmpdir --directory metrics_test_dir.XXXXX)" + trap "cleanup ${temp_dir}" RETURN EXIT INT + + local rules_file="${temp_dir}/rules.json" + local tests_copy="${temp_dir}/rules-test.yaml" + + "${prom_spec_dumper}" > "${rules_file}" + cp "${tests_file}" "${tests_copy}" + + echo "INFO: Rules file content:" + cat "${rules_file}" + echo + + ${PROMTOOL} check rules "${rules_file}" + ${PROMTOOL} test rules "${tests_copy}" +} + +main "$@" diff --git a/pkg/monitoring/rules/rules-tests.yaml b/pkg/monitoring/rules/rules-tests.yaml new file mode 100644 index 000000000..cb60e2f53 --- /dev/null +++ b/pkg/monitoring/rules/rules-tests.yaml @@ -0,0 +1,26 @@ +# Unit tests for the prometheus rules +rule_files: + - rules.json + +tests: + - interval: "1m" + input_series: + - series: 'up{pod="ssp-operator-12345"}' + values: '0x5 1' + + alert_rule_test: + - eval_time: "5m" + alertname: "SSPDown" + exp_alerts: + - exp_annotations: + summary: "All SSP operator pods are down." + runbook_url: "test-runbook:SSPDown" + exp_labels: + severity: "critical" + operator_health_impact: "critical" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + - eval_time: "6m" + alertname: "SSPDown" + exp_alerts: [] diff --git a/tools/test-rules-writer/test_rules_writer.go b/tools/test-rules-writer/test_rules_writer.go new file mode 100644 index 000000000..f571a00df --- /dev/null +++ b/tools/test-rules-writer/test_rules_writer.go @@ -0,0 +1,31 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "kubevirt.io/ssp-operator/pkg/monitoring/rules" +) + +func main() { + const runbookTemplate = "test-runbook:%s" + + allRules := append(rules.RecordRules(), rules.AlertRules(runbookTemplate)...) + + spec := promv1.PrometheusRuleSpec{ + Groups: []promv1.RuleGroup{{ + Name: "test.rules", + Rules: allRules, + }}, + } + + encoder := json.NewEncoder(os.Stdout) + encoder.SetIndent("", " ") + if err := encoder.Encode(spec); err != nil { + fmt.Fprintf(os.Stderr, "Error encoding prometheus spec: %v", err) + os.Exit(1) + } +} From fbf89bd28fa31530783d0072f34a5fcf951eeb6a Mon Sep 17 00:00:00 2001 From: Andrej Krejcir Date: Fri, 19 Jan 2024 17:02:57 +0100 Subject: [PATCH 3/4] feat: add more tests for metrics alerts Added unit tests for all SSP alerts. Signed-off-by: Andrej Krejcir --- pkg/monitoring/rules/rules-tests.yaml | 166 ++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/pkg/monitoring/rules/rules-tests.yaml b/pkg/monitoring/rules/rules-tests.yaml index cb60e2f53..a9e8aea1f 100644 --- a/pkg/monitoring/rules/rules-tests.yaml +++ b/pkg/monitoring/rules/rules-tests.yaml @@ -3,6 +3,7 @@ rule_files: - rules.json tests: + # SSPDown alert tests - interval: "1m" input_series: - series: 'up{pod="ssp-operator-12345"}' @@ -24,3 +25,168 @@ tests: - eval_time: "6m" alertname: "SSPDown" exp_alerts: [] + + # SSPTemplateValidatorDown alert tests + - interval: "1m" + input_series: + - series: 'up{pod="virt-template-validator-12345"}' + values: '0x5 1' + + alert_rule_test: + - eval_time: "5m" + alertname: "SSPTemplateValidatorDown" + exp_alerts: + - exp_annotations: + summary: "All Template Validator pods are down." + runbook_url: "test-runbook:SSPTemplateValidatorDown" + exp_labels: + severity: "critical" + operator_health_impact: "critical" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + - eval_time: "6m" + alertname: "SSPTemplateValidatorDown" + exp_alerts: [] + + # SSPFailingToReconcile alert tests + - interval: "1m" + input_series: + - series: 'up{pod="ssp-operator-12345"}' + values: '0x5 1' + - series: 'kubevirt_ssp_operator_reconcile_succeeded{pod="ssp-operator-12345"}' + values: '0x11 1' + + alert_rule_test: + # SSP pod is down -> should not trigger SSPFailingToReconcile alert + - eval_time: "5m" + alertname: "SSPFailingToReconcile" + exp_alerts: [] + + # SSP pod is up, but failed to reconcile + - eval_time: "11m" + alertname: "SSPFailingToReconcile" + exp_alerts: + - exp_annotations: + summary: "The ssp-operator pod is up but failing to reconcile" + runbook_url: "test-runbook:SSPFailingToReconcile" + exp_labels: + severity: "critical" + operator_health_impact: "critical" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + # SSP pod is up, and reconciliation succeeded + - eval_time: "12m" + alertname: "SSPFailingToReconcile" + exp_alerts: [] + + # SSPHighRateRejectedVms alert tests + - interval: "1m" + input_series: + - series: 'kubevirt_ssp_template_validator_rejected_total{pod="virt-template-validator-12345"}' + values: '0+1x10 10x120' + + alert_rule_test: + - eval_time: "10m" + alertname: "SSPHighRateRejectedVms" + exp_alerts: [] + + - eval_time: "11m" + alertname: "SSPHighRateRejectedVms" + exp_alerts: + - exp_annotations: + summary: "High rate of rejected Vms" + runbook_url: "test-runbook:SSPHighRateRejectedVms" + exp_labels: + severity: "warning" + operator_health_impact: "warning" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + # The alert is triggering for the whole hour, until the window + # does not contain the first few values + - eval_time: "64m" + alertname: "SSPHighRateRejectedVms" + exp_alerts: + - exp_annotations: + summary: "High rate of rejected Vms" + runbook_url: "test-runbook:SSPHighRateRejectedVms" + exp_labels: + severity: "warning" + operator_health_impact: "warning" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + - eval_time: "65m" + alertname: "SSPHighRateRejectedVms" + exp_alerts: [] + + # SSPCommonTemplatesModificationReverted alert tests + - interval: "1m" + input_series: + - series: 'kubevirt_ssp_common_templates_restored_total{pod="ssp-operator-12345"}' + values: '0 0 1 0' + + alert_rule_test: + - eval_time: "1m" + alertname: "SSPCommonTemplatesModificationReverted" + exp_alerts: [] + + - eval_time: "2m" + alertname: "SSPCommonTemplatesModificationReverted" + exp_alerts: + - exp_annotations: + summary: "Common Templates manual modifications were reverted by the operator" + runbook_url: "test-runbook:SSPCommonTemplatesModificationReverted" + exp_labels: + severity: "warning" + operator_health_impact: "none" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + # The alert is triggering for the whole hour, until the window + # does not contain the first few values + - eval_time: "61m" + alertname: "SSPCommonTemplatesModificationReverted" + exp_alerts: + - exp_annotations: + summary: "Common Templates manual modifications were reverted by the operator" + runbook_url: "test-runbook:SSPCommonTemplatesModificationReverted" + exp_labels: + severity: "warning" + operator_health_impact: "none" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + - eval_time: "62m" + alertname: "SSPCommonTemplatesModificationReverted" + exp_alerts: [] + + # VirtualMachineCRCErrors alert tests + - interval: "1m" + input_series: + - series: 'kubevirt_ssp_vm_rbd_block_volume_without_rxbounce' + values: '0 0 1 0' + + alert_rule_test: + - eval_time: "1m" + alertname: "VirtualMachineCRCErrors" + exp_alerts: [] + + - eval_time: "2m" + alertname: "VirtualMachineCRCErrors" + exp_alerts: + - exp_annotations: + description: "1 Virtual Machines are in risk of causing CRC errors and major service outages" + summary: "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set." + runbook_url: "test-runbook:VirtualMachineCRCErrors" + exp_labels: + severity: "warning" + operator_health_impact: "none" + kubernetes_operator_part_of: "kubevirt" + kubernetes_operator_component: "ssp-operator" + + - eval_time: "3m" + alertname: "VirtualMachineCRCErrors" + exp_alerts: [] From 36106a0aef2f2ac1b63a1ee7de09ff8e5a70f7d3 Mon Sep 17 00:00:00 2001 From: Andrej Krejcir Date: Tue, 23 Jan 2024 16:39:46 +0100 Subject: [PATCH 4/4] chore: metrics: Make a global variable private Made "recordRulesDescList" variable private. Signed-off-by: Andrej Krejcir --- pkg/monitoring/rules/rules.go | 20 ++++++++++++------- tools/metricsdocs/metricsdocs.go | 2 +- .../metrics_collector.go | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pkg/monitoring/rules/rules.go b/pkg/monitoring/rules/rules.go index 8622931b7..a08b6b9fd 100644 --- a/pkg/monitoring/rules/rules.go +++ b/pkg/monitoring/rules/rules.go @@ -30,8 +30,8 @@ type RecordRulesDesc struct { Type string } -// RecordRulesDescList lists all SSP Operator Prometheus Record Rules -var RecordRulesDescList = []RecordRulesDesc{ +// recordRulesDescList lists all SSP Operator Prometheus Record Rules +var recordRulesDescList = []RecordRulesDesc{ { Name: "kubevirt_ssp_operator_up", Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), @@ -65,13 +65,19 @@ var RecordRulesDescList = []RecordRulesDesc{ } func RecordRules() []promv1.Rule { - var recordRules []promv1.Rule - - for _, rrd := range RecordRulesDescList { - recordRules = append(recordRules, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) + result := make([]promv1.Rule, 0, len(recordRulesDescList)) + for _, rrd := range recordRulesDescList { + result = append(result, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) } + return result +} - return recordRules +func RecordRulesWithDescriptions() []RecordRulesDesc { + result := make([]RecordRulesDesc, 0, len(recordRulesDescList)) + for _, rrd := range recordRulesDescList { + result = append(result, rrd) + } + return result } func AlertRules(runbookURLTemplate string) []promv1.Rule { diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index cf3735afc..323b72368 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -30,7 +30,7 @@ const ( ) func main() { - metricsList := recordRulesDescToMetricList(rules.RecordRulesDescList) + metricsList := recordRulesDescToMetricList(rules.RecordRulesWithDescriptions()) sspMetrics.SetupMetrics() validatorMetrics.SetupMetrics() diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go index b4e3c38e3..a0c852f58 100644 --- a/tools/prom-metrics-collector/metrics_collector.go +++ b/tools/prom-metrics-collector/metrics_collector.go @@ -14,7 +14,7 @@ var excludedMetrics = map[string]struct{}{} func readMetrics() []*dto.MetricFamily { var metricFamilies []*dto.MetricFamily - sspMetrics := rules.RecordRulesDescList + sspMetrics := rules.RecordRulesWithDescriptions() for _, metric := range sspMetrics { if _, isExcludedMetric := excludedMetrics[metric.Name]; !isExcludedMetric {