Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Metrics unit tests #848

Merged
merged 4 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ endif
all: manager

.PHONY: unittest
unittest: generate lint fmt vet manifests
unittest: generate lint fmt vet manifests metrics-rules-test
go test -v -coverprofile cover.out $(SRC_PATHS_TESTS)
cd api && go test -v ./...

Expand Down Expand Up @@ -325,3 +325,22 @@ lint:
# lint-metrics runs hack/prom_metric_linter.sh with the operator name
# "kubevirt" and the sub-operator name "ssp".
.PHONY: lint-metrics
lint-metrics:
./hack/prom_metric_linter.sh --operator-name="kubevirt" --sub-operator-name="ssp"

# Location and version of the promtool binary. Both can be overridden from the
# command line or the environment because they are assigned with ?=.
PROMTOOL ?= $(LOCALBIN)/promtool
PROMTOOL_VERSION ?= 2.44.0

# promtool extracts the promtool binary from the Prometheus release tarball
# into $(LOCALBIN). The download is skipped when $(PROMTOOL) already exists and
# is non-empty (test -s).
# NOTE: the release URL is hard-coded to the linux-amd64 archive.
.PHONY: promtool
promtool: $(PROMTOOL)
$(PROMTOOL): $(LOCALBIN)
test -s $(PROMTOOL) || curl -sSfL "https://github.com/prometheus/prometheus/releases/download/v$(PROMTOOL_VERSION)/prometheus-$(PROMTOOL_VERSION).linux-amd64.tar.gz" | \
tar xvzf - --directory=$(LOCALBIN) "prometheus-$(PROMTOOL_VERSION).linux-amd64"/promtool --strip-components=1

# Output path of the rules writer helper binary (overridable via ?=).
METRIC_RULES_WRITER ?= $(LOCALBIN)/metrics-rules-writer

# build-metric-rules-writer compiles tools/test-rules-writer/test_rules_writer.go
# into $(METRIC_RULES_WRITER). It is phony, so the binary is rebuilt on every
# invocation.
.PHONY: build-metric-rules-writer
build-metric-rules-writer: $(LOCALBIN)
go build -o $(METRIC_RULES_WRITER) tools/test-rules-writer/test_rules_writer.go

# metrics-rules-test builds the rules writer, makes sure promtool is available,
# then runs hack/metrics-rules-test.sh with the writer binary and the rules
# test file ./pkg/monitoring/rules/rules-tests.yaml as its two arguments.
.PHONY: metrics-rules-test
metrics-rules-test: build-metric-rules-writer promtool
./hack/metrics-rules-test.sh $(METRIC_RULES_WRITER) "./pkg/monitoring/rules/rules-tests.yaml"
32 changes: 32 additions & 0 deletions hack/metrics-rules-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Path of the promtool binary, resolved relative to this script's own
# directory: <script dir>/../bin/promtool.
readonly PROMTOOL="$(dirname "$0")/../bin/promtool"

# cleanup recursively removes the directory passed as the first argument.
# The ":?" expansion makes the shell report an error when the argument is
# missing or empty, so "rm -rf" never runs with an empty path.
function cleanup() {
    rm -rf "${1:?}"
}

# main dumps the operator's Prometheus rules into a temporary directory and
# validates them with promtool.
#
# Arguments:
#   $1 - executable that writes the rules spec to stdout
#   $2 - path of the promtool rules test file
#
# The test file is copied next to the generated rules.json before promtool runs
# (presumably because it refers to the rules file by relative path - confirm
# against the test file).
#
# Returns the exit status of the first failing step, or of
# "promtool test rules" when every earlier step succeeded. The temporary
# directory is removed when main returns, on script exit and on SIGINT.
function main() {
    local prom_spec_dumper="${1:?}"
    local tests_file="${2:?}"
    local temp_dir

    temp_dir="$(mktemp --tmpdir --directory metrics_test_dir.XXXXX)" || return

    # temp_dir is local, so its value is expanded into the trap command now;
    # printf %q keeps the path a single shell word even if TMPDIR contains
    # whitespace or other special characters.
    trap "cleanup $(printf '%q' "${temp_dir}")" RETURN EXIT INT

    local rules_file="${temp_dir}/rules.json"
    local tests_copy="${temp_dir}/rules-test.yaml"

    # Stop at the first failing step instead of letting a later command mask it.
    "${prom_spec_dumper}" > "${rules_file}" || return
    cp "${tests_file}" "${tests_copy}" || return

    echo "INFO: Rules file content:"
    cat "${rules_file}"
    echo

    "${PROMTOOL}" check rules "${rules_file}" || return
    "${PROMTOOL}" test rules "${tests_copy}"
}

# Entry point: forward all command-line arguments to main. The script's exit
# status is the status returned by main.
main "$@"
176 changes: 4 additions & 172 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,196 +2,28 @@ package metrics

import (
"errors"
"fmt"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"

"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Names, label keys and label values used when building the operator's
// monitoring resources.
const (
PrometheusRuleName = "prometheus-k8s-rules-cnv"
MonitorNamespace = "openshift-monitoring"
// Runbook link settings.
// NOTE(review): runbookURLTemplateEnv looks like the name of an environment
// variable that overrides defaultRunbookURLTemplate — confirm in
// getRunbookURLTemplate, which is not visible here.
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
// Label keys and fixed label values that getAlertRules sets on every alert.
severityAlertLabelKey = "severity"
healthImpactAlertLabelKey = "operator_health_impact"
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = "ssp-operator"
PrometheusLabelKey = "prometheus.ssp.kubevirt.io"
PrometheusLabelValue = "true"
PrometheusClusterRoleName = "prometheus-k8s-ssp"
PrometheusServiceAccountName = "prometheus-k8s"
MetricsPortName = "metrics"
)

// PromQL queries that sum the one-hour increase of the restored-templates and
// validator-rejected counters. RecordRulesDescList appends
// " OR on() vector(0)" to each of them to build the *_increase recording rules.
const (
CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"
TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"
)

// RecordRulesDesc describes a single SSP Operator Prometheus recording rule.
type RecordRulesDesc struct {
// Name is the name of the recorded series; getRecordRules uses it as the
// rule's Record field.
Name string
// Expr is the expression evaluated to produce the series.
Expr intstr.IntOrString
// Description is a human-readable explanation of the recorded value.
Description string
// Type is the metric type as a string, e.g. "Gauge".
Type string
}

// RecordRulesDescList lists all SSP Operator Prometheus recording rules.
//
// getRecordRules turns each entry into a promv1.Rule using only Name and Expr;
// Description and Type are not copied into the generated rule.
//
// Several expressions end in "OR on() vector(0)" so that the recorded series
// has the value 0 instead of being absent when the left-hand side returns no
// samples.
var RecordRulesDescList = []RecordRulesDesc{
{
Name: "kubevirt_ssp_operator_up",
Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
Description: "The total number of running ssp-operator pods",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_template_validator_up",
Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
Description: "The total number of running virt-template-validator pods",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
Description: "The total number of ssp-operator pods reconciling with no errors",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_template_validator_rejected_increase",
Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of rejected template validators, over the last hour",
Type: "Gauge",
},
{
Name: "kubevirt_ssp_common_templates_restored_increase",
Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
Type: "Gauge",
},
}

// getAlertRules builds the SSP alerting rules, preceded by the
// cnv:vmi_status_running:count recording rule.
//
// Every alert carries a "runbook_url" annotation produced by formatting the
// template returned by getRunbookURLTemplate with the alert's name. When the
// template cannot be obtained, that error is returned together with a nil
// slice.
func getAlertRules() ([]promv1.Rule, error) {
	runbookURLTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	// runbookURL renders the runbook link for one alert.
	runbookURL := func(alertName string) string {
		return fmt.Sprintf(runbookURLTemplate, alertName)
	}

	// alertLabels returns a fresh label map for one alert; only the severity
	// and health-impact values differ between alerts.
	alertLabels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
			partOfAlertLabelKey:       partOfAlertLabelValue,
			componentAlertLabelKey:    componentAlertLabelValue,
		}
	}

	vmiRunningCount := promv1.Rule{
		Record: "cnv:vmi_status_running:count",
		Expr:   intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"),
	}

	sspDown := promv1.Rule{
		Alert: "SSPDown",
		Expr:  intstr.FromString("kubevirt_ssp_operator_up == 0"),
		For:   ptr.To[promv1.Duration]("5m"),
		Annotations: map[string]string{
			"summary":     "All SSP operator pods are down.",
			"runbook_url": runbookURL("SSPDown"),
		},
		Labels: alertLabels("critical", "critical"),
	}

	templateValidatorDown := promv1.Rule{
		Alert: "SSPTemplateValidatorDown",
		Expr:  intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
		For:   ptr.To[promv1.Duration]("5m"),
		Annotations: map[string]string{
			"summary":     "All Template Validator pods are down.",
			"runbook_url": runbookURL("SSPTemplateValidatorDown"),
		},
		Labels: alertLabels("critical", "critical"),
	}

	failingToReconcile := promv1.Rule{
		Alert: "SSPFailingToReconcile",
		Expr:  intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
		For:   ptr.To[promv1.Duration]("5m"),
		Annotations: map[string]string{
			"summary":     "The ssp-operator pod is up but failing to reconcile",
			"runbook_url": runbookURL("SSPFailingToReconcile"),
		},
		Labels: alertLabels("critical", "critical"),
	}

	highRateRejectedVms := promv1.Rule{
		Alert: "SSPHighRateRejectedVms",
		Expr:  intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
		For:   ptr.To[promv1.Duration]("5m"),
		Annotations: map[string]string{
			"summary":     "High rate of rejected Vms",
			"runbook_url": runbookURL("SSPHighRateRejectedVms"),
		},
		Labels: alertLabels("warning", "warning"),
	}

	templatesModificationReverted := promv1.Rule{
		Alert: "SSPCommonTemplatesModificationReverted",
		Expr:  intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
		For:   ptr.To[promv1.Duration]("0m"),
		Annotations: map[string]string{
			"summary":     "Common Templates manual modifications were reverted by the operator",
			"runbook_url": runbookURL("SSPCommonTemplatesModificationReverted"),
		},
		Labels: alertLabels("warning", "none"),
	}

	// This alert has no "For" duration, unlike the others.
	vmCRCErrors := promv1.Rule{
		Alert: "VirtualMachineCRCErrors",
		Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
		Annotations: map[string]string{
			"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
			"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
			"runbook_url": runbookURL("VirtualMachineCRCErrors"),
		},
		Labels: alertLabels("warning", "none"),
	}

	return []promv1.Rule{
		vmiRunningCount,
		sspDown,
		templateValidatorDown,
		failingToReconcile,
		highRateRejectedVms,
		templatesModificationReverted,
		vmCRCErrors,
	}, nil
}

// getRecordRules converts every entry of RecordRulesDescList into a Prometheus
// recording rule, carrying over only the rule name and its expression. The
// result is nil when the list is empty.
func getRecordRules() []promv1.Rule {
	var out []promv1.Rule

	for i := range RecordRulesDescList {
		desc := RecordRulesDescList[i]
		out = append(out, promv1.Rule{
			Record: desc.Name,
			Expr:   desc.Expr,
		})
	}

	return out
}

func newMonitoringClusterRole() *rbac.ClusterRole {
return &rbac.ClusterRole{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -266,7 +98,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
}

func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
alertRules, err := getAlertRules()
runbookURLTemplate, err := getRunbookURLTemplate()
if err != nil {
return nil, err
}
Expand All @@ -286,7 +118,7 @@ func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
Groups: []promv1.RuleGroup{
{
Name: "cnv.rules",
Rules: append(alertRules, getRecordRules()...),
Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...),
},
},
},
Expand Down
Loading
Loading