Skip to content

Commit

Permalink
Merge pull request #848 from akrejcir/metrics-unit-tests
Browse files Browse the repository at this point in the history
feat: Metrics unit tests
  • Loading branch information
kubevirt-bot authored Jan 24, 2024
2 parents 37d5a36 + 36106a0 commit 1fbd840
Show file tree
Hide file tree
Showing 9 changed files with 469 additions and 182 deletions.
21 changes: 20 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ endif
all: manager

.PHONY: unittest
unittest: generate lint fmt vet manifests
unittest: generate lint fmt vet manifests metrics-rules-test
go test -v -coverprofile cover.out $(SRC_PATHS_TESTS)
cd api && go test -v ./...

Expand Down Expand Up @@ -325,3 +325,22 @@ lint:
# Runs hack/prom_metric_linter.sh with the operator name "kubevirt" and the
# sub-operator name "ssp".
# NOTE(review): judging by the script name this lints Prometheus metric names;
# confirm against the script itself.
.PHONY: lint-metrics
lint-metrics:
./hack/prom_metric_linter.sh --operator-name="kubevirt" --sub-operator-name="ssp"

# Location and version of the promtool binary. Both use ?= so they can be
# overridden from the environment or the make command line.
PROMTOOL ?= $(LOCALBIN)/promtool
PROMTOOL_VERSION ?= 2.44.0

# Downloads the Prometheus release tarball for PROMTOOL_VERSION from GitHub and
# extracts only the promtool binary into $(LOCALBIN), stripping the top-level
# archive directory. The download is skipped when $(PROMTOOL) already exists
# and is non-empty (test -s).
# NOTE(review): the URL hard-codes the linux-amd64 archive, so this target will
# fetch an unusable binary on other OS/architecture combinations.
.PHONY: promtool
promtool: $(PROMTOOL)
$(PROMTOOL): $(LOCALBIN)
test -s $(PROMTOOL) || curl -sSfL "https://github.com/prometheus/prometheus/releases/download/v$(PROMTOOL_VERSION)/prometheus-$(PROMTOOL_VERSION).linux-amd64.tar.gz" | \
tar xvzf - --directory=$(LOCALBIN) "prometheus-$(PROMTOOL_VERSION).linux-amd64"/promtool --strip-components=1

# Output path of the rules-writer helper binary (overridable via ?=).
METRIC_RULES_WRITER ?= $(LOCALBIN)/metrics-rules-writer

# Compiles tools/test-rules-writer/test_rules_writer.go into
# $(METRIC_RULES_WRITER). The target is phony, so the binary is rebuilt on
# every invocation.
.PHONY: build-metric-rules-writer
build-metric-rules-writer: $(LOCALBIN)
go build -o $(METRIC_RULES_WRITER) tools/test-rules-writer/test_rules_writer.go

# Builds the rules-writer binary and makes sure promtool is available, then
# runs hack/metrics-rules-test.sh with the writer binary and the rule test
# file ./pkg/monitoring/rules/rules-tests.yaml as arguments.
.PHONY: metrics-rules-test
metrics-rules-test: build-metric-rules-writer promtool
./hack/metrics-rules-test.sh $(METRIC_RULES_WRITER) "./pkg/monitoring/rules/rules-tests.yaml"
32 changes: 32 additions & 0 deletions hack/metrics-rules-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Path to the promtool binary, resolved relative to this script's own
# directory: <script dir>/../bin/promtool.
readonly PROMTOOL="$(dirname "$0")/../bin/promtool"

# Recursively deletes the directory passed as the first argument.
# The shell aborts with an error if that argument is missing or empty, which
# prevents an accidental "rm -rf" on an empty path.
cleanup() {
    local target_dir="${1:?}"
    rm -rf "${target_dir}"
}

# Generates the Prometheus rules file with the given writer binary and runs
# "promtool check rules" and "promtool test rules" against it.
#
# Arguments:
#   $1 - executable that prints the rules spec to stdout
#   $2 - promtool rule unit-test file; it is copied next to the generated
#        rules file in a temporary directory before the tests run
#
# Returns the exit status of the first step that fails, or the status of
# "promtool test rules" when every earlier step succeeded. The script does not
# use "set -e", so each step is checked explicitly; otherwise a failing
# "promtool check rules" would be hidden by a passing "promtool test rules".
function main() {
    local prom_spec_dumper="${1:?}"
    local tests_file="${2:?}"
    local temp_dir
    local quoted_temp_dir

    temp_dir="$(mktemp --tmpdir --directory metrics_test_dir.XXXXX)" || return

    # The trap command string is re-parsed when it fires, so the path is
    # shell-quoted to survive spaces or other special characters in TMPDIR.
    printf -v quoted_temp_dir '%q' "${temp_dir}"
    trap "cleanup ${quoted_temp_dir}" RETURN EXIT INT

    local rules_file="${temp_dir}/rules.json"
    local tests_copy="${temp_dir}/rules-test.yaml"

    "${prom_spec_dumper}" > "${rules_file}" || return
    cp "${tests_file}" "${tests_copy}" || return

    echo "INFO: Rules file content:"
    cat "${rules_file}"
    echo

    # A bare "return" propagates the exit status of the command that failed.
    "${PROMTOOL}" check rules "${rules_file}" || return
    "${PROMTOOL}" test rules "${tests_copy}"
}

main "$@"
176 changes: 4 additions & 172 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,196 +2,28 @@ package metrics

import (
"errors"
"fmt"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"

"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Names, label keys and label values used by the metrics operand.
const (
// Name of the PrometheusRule object and the monitoring namespace.
PrometheusRuleName = "prometheus-k8s-rules-cnv"
MonitorNamespace = "openshift-monitoring"
// Runbook URL template with a single %s placeholder, and the name of the
// environment variable that carries a template.
// NOTE(review): presumably the default is used when the variable is unset;
// confirm in getRunbookURLTemplate, which is not visible here.
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
// Label keys (and the fixed part-of/component values) set on every alert
// returned by getAlertRules.
severityAlertLabelKey = "severity"
healthImpactAlertLabelKey = "operator_health_impact"
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = "ssp-operator"
// Prometheus-related label, RBAC and port names.
PrometheusLabelKey = "prometheus.ssp.kubevirt.io"
PrometheusLabelValue = "true"
PrometheusClusterRoleName = "prometheus-k8s-ssp"
PrometheusServiceAccountName = "prometheus-k8s"
MetricsPortName = "metrics"
)

// PromQL expressions that sum the one-hour increase of a counter across the
// pods matching the given name pattern. RecordRulesDescList appends
// " OR on() vector(0)" to each of them to build its recording rules.
const (
CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"
TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"
)

// RecordRulesDesc describes a single SSP Operator Prometheus recording rule.
type RecordRulesDesc struct {
// Name is the name of the recorded series; getRecordRules uses it as the
// rule's Record field.
Name string
// Expr is the expression evaluated by the rule.
Expr intstr.IntOrString
// Description is a human-readable explanation of the recorded value.
Description string
// Type is the metric type as text; every entry in RecordRulesDescList
// uses "Gauge".
Type string
}

// RecordRulesDescList lists all SSP Operator Prometheus recording rules.
// getRecordRules turns each entry into a promv1.Rule using its Name and Expr.
// Most expressions end in "OR on() vector(0)".
// NOTE(review): presumably so the rule still yields 0 when the left-hand
// query returns no samples; confirm against PromQL semantics.
var RecordRulesDescList = []RecordRulesDesc{
{
Name: "kubevirt_ssp_operator_up",
Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
Description: "The total number of running ssp-operator pods",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_template_validator_up",
Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
Description: "The total number of running virt-template-validator pods",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
Description: "The total number of ssp-operator pods reconciling with no errors",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_template_validator_rejected_increase",
Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of rejected template validators, over the last hour",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_common_templates_restored_increase",
Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
Type: "Gauge",
},
}

// getAlertRules returns the running-VMI count recording rule followed by the
// SSP alert rules. Each alert's runbook_url annotation is produced by
// formatting the template from getRunbookURLTemplate with the alert name.
// An error from getRunbookURLTemplate is returned unchanged with a nil slice.
func getAlertRules() ([]promv1.Rule, error) {
	urlTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	// runbookURL fills the alert name into the runbook URL template.
	runbookURL := func(alertName string) string {
		return fmt.Sprintf(urlTemplate, alertName)
	}

	// alertLabels builds the label set shared by all alerts; only the
	// severity and the operator health impact differ between them.
	alertLabels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
			partOfAlertLabelKey:       partOfAlertLabelValue,
			componentAlertLabelKey:    componentAlertLabelValue,
		}
	}

	alertRules := []promv1.Rule{
		{
			Expr:   intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"),
			Record: "cnv:vmi_status_running:count",
		},
		{
			Alert: "SSPDown",
			Expr:  intstr.FromString("kubevirt_ssp_operator_up == 0"),
			For:   ptr.To[promv1.Duration]("5m"),
			Annotations: map[string]string{
				"summary":     "All SSP operator pods are down.",
				"runbook_url": runbookURL("SSPDown"),
			},
			Labels: alertLabels("critical", "critical"),
		},
		{
			Alert: "SSPTemplateValidatorDown",
			Expr:  intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
			For:   ptr.To[promv1.Duration]("5m"),
			Annotations: map[string]string{
				"summary":     "All Template Validator pods are down.",
				"runbook_url": runbookURL("SSPTemplateValidatorDown"),
			},
			Labels: alertLabels("critical", "critical"),
		},
		{
			Alert: "SSPFailingToReconcile",
			Expr:  intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
			For:   ptr.To[promv1.Duration]("5m"),
			Annotations: map[string]string{
				"summary":     "The ssp-operator pod is up but failing to reconcile",
				"runbook_url": runbookURL("SSPFailingToReconcile"),
			},
			Labels: alertLabels("critical", "critical"),
		},
		{
			Alert: "SSPHighRateRejectedVms",
			Expr:  intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
			For:   ptr.To[promv1.Duration]("5m"),
			Annotations: map[string]string{
				"summary":     "High rate of rejected Vms",
				"runbook_url": runbookURL("SSPHighRateRejectedVms"),
			},
			Labels: alertLabels("warning", "warning"),
		},
		{
			Alert: "SSPCommonTemplatesModificationReverted",
			Expr:  intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
			For:   ptr.To[promv1.Duration]("0m"),
			Annotations: map[string]string{
				"summary":     "Common Templates manual modifications were reverted by the operator",
				"runbook_url": runbookURL("SSPCommonTemplatesModificationReverted"),
			},
			Labels: alertLabels("warning", "none"),
		},
		{
			// This alert has no For duration set.
			Alert: "VirtualMachineCRCErrors",
			Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
			Annotations: map[string]string{
				"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
				"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
				"runbook_url": runbookURL("VirtualMachineCRCErrors"),
			},
			Labels: alertLabels("warning", "none"),
		},
	}

	return alertRules, nil
}

// getRecordRules converts every entry of RecordRulesDescList into a
// promv1.Rule, copying the entry's Name into Record and its Expr into Expr.
// The result is nil when the list is empty.
func getRecordRules() []promv1.Rule {
	var converted []promv1.Rule

	for i := range RecordRulesDescList {
		desc := RecordRulesDescList[i]
		converted = append(converted, promv1.Rule{
			Record: desc.Name,
			Expr:   desc.Expr,
		})
	}

	return converted
}

func newMonitoringClusterRole() *rbac.ClusterRole {
return &rbac.ClusterRole{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -266,7 +98,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
}

func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
alertRules, err := getAlertRules()
runbookURLTemplate, err := getRunbookURLTemplate()
if err != nil {
return nil, err
}
Expand All @@ -286,7 +118,7 @@ func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
Groups: []promv1.RuleGroup{
{
Name: "cnv.rules",
Rules: append(alertRules, getRecordRules()...),
Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...),
},
},
},
Expand Down
Loading

0 comments on commit 1fbd840

Please sign in to comment.