From 2eecc8f5d7efb9ce802cb76aea4fa1173aaf9d62 Mon Sep 17 00:00:00 2001 From: Leon <82407168+sed-i@users.noreply.github.com> Date: Wed, 20 Nov 2024 05:51:45 -0500 Subject: [PATCH] Fix template variable usage (#199) * Update host_health.rules Signed-off-by: Leon <82407168+sed-i@users.noreply.github.com> * Move individual alerts under group * Rename ext to match contents --------- Signed-off-by: Leon <82407168+sed-i@users.noreply.github.com> --- src/prometheus_alert_rules/high_cpu_iowait.rule | 8 -------- src/prometheus_alert_rules/high_cpu_iowait.rules | 11 +++++++++++ src/prometheus_alert_rules/host_health.rules | 4 ++-- src/prometheus_alert_rules/oomkill.rule | 8 -------- src/prometheus_alert_rules/oomkill.rules | 11 +++++++++++ 5 files changed, 24 insertions(+), 18 deletions(-) delete mode 100644 src/prometheus_alert_rules/high_cpu_iowait.rule create mode 100644 src/prometheus_alert_rules/high_cpu_iowait.rules delete mode 100644 src/prometheus_alert_rules/oomkill.rule create mode 100644 src/prometheus_alert_rules/oomkill.rules diff --git a/src/prometheus_alert_rules/high_cpu_iowait.rule b/src/prometheus_alert_rules/high_cpu_iowait.rule deleted file mode 100644 index eed0cd7..0000000 --- a/src/prometheus_alert_rules/high_cpu_iowait.rule +++ /dev/null @@ -1,8 +0,0 @@ -alert: HostCpuHighIowait -expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10 -for: 0m -labels: - severity: warning -annotations: - summary: Host CPU high iowait (instance {{ $labels.instance }}) - description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/src/prometheus_alert_rules/high_cpu_iowait.rules b/src/prometheus_alert_rules/high_cpu_iowait.rules new file mode 100644 index 0000000..5b2dceb --- /dev/null +++ b/src/prometheus_alert_rules/high_cpu_iowait.rules @@ -0,0 +1,11 @@ +groups: +- name: HostCpu + rules: + - alert: HostCpuHighIowait + expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10 + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/src/prometheus_alert_rules/host_health.rules b/src/prometheus_alert_rules/host_health.rules index 803e2e5..f4a3814 100644 --- a/src/prometheus_alert_rules/host_health.rules +++ b/src/prometheus_alert_rules/host_health.rules @@ -7,7 +7,7 @@ groups: labels: severity: critical annotations: - summary: Host '{{labels.instance}}' is down. + summary: Host '{{ $labels.instance }}' is down. description: >- Host '{{ $labels.instance }}' is down. VALUE = {{ $value }} @@ -18,7 +18,7 @@ groups: labels: severity: critical annotations: - summary: Metrics not received from host '{{labels.instance}}'. + summary: Metrics not received from host '{{ $labels.instance }}'. description: >- The metrics endpoint for host '{{ $labels.instance }}' is unreachable. VALUE = {{ $value }} diff --git a/src/prometheus_alert_rules/oomkill.rule b/src/prometheus_alert_rules/oomkill.rule deleted file mode 100644 index e7fce3b..0000000 --- a/src/prometheus_alert_rules/oomkill.rule +++ /dev/null @@ -1,8 +0,0 @@ -alert: HostOomKillDetected -expr: increase(node_vmstat_oom_kill[1h]) > 0 -for: 0m -labels: - severity: warning -annotations: - summary: Host OOM kill detected (instance {{ $labels.instance }}) - description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/src/prometheus_alert_rules/oomkill.rules b/src/prometheus_alert_rules/oomkill.rules new file mode 100644 index 0000000..21c1200 --- /dev/null +++ b/src/prometheus_alert_rules/oomkill.rules @@ -0,0 +1,11 @@ +groups: +- name: HostMemory + rules: + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1h]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"