# Patch summary (free text before the first "diff --git" header is ignored by patch tools):
#   * high_cpu_iowait.rule and oomkill.rule (bare single-alert files) are deleted and re-added
#     as high_cpu_iowait.rules / oomkill.rules, with the same alert wrapped in a
#     `groups: -> rules:` rule-group structure.
#   * host_health.rules: the two `summary` annotations change `{{labels.instance}}` to
#     `{{ $labels.instance }}`, matching the form already used in their `description` fields.
# NOTE(review): this patch was recovered from a whitespace-collapsed copy. The leading
# indentation of the YAML payload lines (2-space, flush sequences) was reconstructed from the
# hunk headers and line counts -- confirm against the target files before applying.
diff --git a/src/prometheus_alert_rules/high_cpu_iowait.rule b/src/prometheus_alert_rules/high_cpu_iowait.rule
deleted file mode 100644
index eed0cd7..0000000
--- a/src/prometheus_alert_rules/high_cpu_iowait.rule
+++ /dev/null
@@ -1,8 +0,0 @@
-alert: HostCpuHighIowait
-expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
-for: 0m
-labels:
-  severity: warning
-annotations:
-  summary: Host CPU high iowait (instance {{ $labels.instance }})
-  description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/src/prometheus_alert_rules/high_cpu_iowait.rules b/src/prometheus_alert_rules/high_cpu_iowait.rules
new file mode 100644
index 0000000..5b2dceb
--- /dev/null
+++ b/src/prometheus_alert_rules/high_cpu_iowait.rules
@@ -0,0 +1,11 @@
+groups:
+- name: HostCpu
+  rules:
+  - alert: HostCpuHighIowait
+    expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host CPU high iowait (instance {{ $labels.instance }})
+      description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/src/prometheus_alert_rules/host_health.rules b/src/prometheus_alert_rules/host_health.rules
index 803e2e5..f4a3814 100644
--- a/src/prometheus_alert_rules/host_health.rules
+++ b/src/prometheus_alert_rules/host_health.rules
@@ -7,7 +7,7 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Host '{{labels.instance}}' is down.
+      summary: Host '{{ $labels.instance }}' is down.
       description: >-
         Host '{{ $labels.instance }}' is down.
         VALUE = {{ $value }}
@@ -18,7 +18,7 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Metrics not received from host '{{labels.instance}}'.
+      summary: Metrics not received from host '{{ $labels.instance }}'.
       description: >-
         The metrics endpoint for host '{{ $labels.instance }}' is unreachable.
         VALUE = {{ $value }}
diff --git a/src/prometheus_alert_rules/oomkill.rule b/src/prometheus_alert_rules/oomkill.rule
deleted file mode 100644
index e7fce3b..0000000
--- a/src/prometheus_alert_rules/oomkill.rule
+++ /dev/null
@@ -1,8 +0,0 @@
-alert: HostOomKillDetected
-expr: increase(node_vmstat_oom_kill[1h]) > 0
-for: 0m
-labels:
-  severity: warning
-annotations:
-  summary: Host OOM kill detected (instance {{ $labels.instance }})
-  description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/src/prometheus_alert_rules/oomkill.rules b/src/prometheus_alert_rules/oomkill.rules
new file mode 100644
index 0000000..21c1200
--- /dev/null
+++ b/src/prometheus_alert_rules/oomkill.rules
@@ -0,0 +1,11 @@
+groups:
+- name: HostMemory
+  rules:
+  - alert: HostOomKillDetected
+    expr: increase(node_vmstat_oom_kill[1h]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host OOM kill detected (instance {{ $labels.instance }})
+      description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"