From 2790c840a820b7dce799da81e52c4f6c01baaad4 Mon Sep 17 00:00:00 2001 From: Leon <82407168+sed-i@users.noreply.github.com> Date: Tue, 29 Oct 2024 19:18:59 -0400 Subject: [PATCH 1/3] Update host_health.rules Signed-off-by: Leon <82407168+sed-i@users.noreply.github.com> --- src/prometheus_alert_rules/host_health.rules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/prometheus_alert_rules/host_health.rules b/src/prometheus_alert_rules/host_health.rules index 803e2e5..f4a3814 100644 --- a/src/prometheus_alert_rules/host_health.rules +++ b/src/prometheus_alert_rules/host_health.rules @@ -7,7 +7,7 @@ groups: labels: severity: critical annotations: - summary: Host '{{labels.instance}}' is down. + summary: Host '{{ $labels.instance }}' is down. description: >- Host '{{ $labels.instance }}' is down. VALUE = {{ $value }} @@ -18,7 +18,7 @@ groups: labels: severity: critical annotations: - summary: Metrics not received from host '{{labels.instance}}'. + summary: Metrics not received from host '{{ $labels.instance }}'. description: >- The metrics endpoint for host '{{ $labels.instance }}' is unreachable. VALUE = {{ $value }} From b642c34a2b956efa49a6af13d470a89058509708 Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Tue, 29 Oct 2024 19:26:29 -0400 Subject: [PATCH 2/3] Move individual alerts under group --- .../high_cpu_iowait.rule | 19 +++++++++++-------- src/prometheus_alert_rules/oomkill.rule | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/prometheus_alert_rules/high_cpu_iowait.rule b/src/prometheus_alert_rules/high_cpu_iowait.rule index eed0cd7..5b2dceb 100644 --- a/src/prometheus_alert_rules/high_cpu_iowait.rule +++ b/src/prometheus_alert_rules/high_cpu_iowait.rule @@ -1,8 +1,11 @@ -alert: HostCpuHighIowait -expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10 -for: 0m -labels: - severity: warning -annotations: - summary: Host CPU high iowait (instance {{ $labels.instance }}) - description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" +groups: +- name: HostCpu + rules: + - alert: HostCpuHighIowait + expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10 + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/src/prometheus_alert_rules/oomkill.rule b/src/prometheus_alert_rules/oomkill.rule index e7fce3b..21c1200 100644 --- a/src/prometheus_alert_rules/oomkill.rule +++ b/src/prometheus_alert_rules/oomkill.rule @@ -1,8 +1,11 @@ -alert: HostOomKillDetected -expr: increase(node_vmstat_oom_kill[1h]) > 0 -for: 0m -labels: - severity: warning -annotations: - summary: Host OOM kill detected (instance {{ $labels.instance }}) - description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" +groups: +- name: HostMemory + rules: + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1h]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From ca33d8e9b326d18164f6536338453a260752b2ca Mon Sep 17 00:00:00 2001 From: sed-i <82407168+sed-i@users.noreply.github.com> Date: Tue, 29 Oct 2024 19:27:10 -0400 Subject: [PATCH 3/3] Rename ext to match contents --- .../{high_cpu_iowait.rule => high_cpu_iowait.rules} | 0 src/prometheus_alert_rules/{oomkill.rule => oomkill.rules} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/prometheus_alert_rules/{high_cpu_iowait.rule => high_cpu_iowait.rules} (100%) rename src/prometheus_alert_rules/{oomkill.rule => oomkill.rules} (100%) diff --git a/src/prometheus_alert_rules/high_cpu_iowait.rule b/src/prometheus_alert_rules/high_cpu_iowait.rules similarity index 100% rename from src/prometheus_alert_rules/high_cpu_iowait.rule rename to src/prometheus_alert_rules/high_cpu_iowait.rules diff --git a/src/prometheus_alert_rules/oomkill.rule b/src/prometheus_alert_rules/oomkill.rules similarity index 100% rename from src/prometheus_alert_rules/oomkill.rule rename to src/prometheus_alert_rules/oomkill.rules