From 9364fe07a699931e040091887c46679bfc4f2e8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?=
Date: Wed, 3 Jul 2024 12:45:37 -0600
Subject: [PATCH] feat: implement some default alerts

Add two default Prometheus alert rules:

* SlurmHighCPUUsage (warning): the ratio slurm_cpu_load / slurm_cpus_total
  stays above 90% for 5 minutes.
* SlurmTooManyFailedDbdMessages (critical): slurm_dbd_agent_queue_size
  peaks above 50 within a 10 second window.

---
 src/alert_rules/prometheus/high_cpu_usage.rule       | 12 ++++++++++++
 src/alert_rules/prometheus/unreachable_slurmdbd.rule | 12 ++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 src/alert_rules/prometheus/high_cpu_usage.rule
 create mode 100644 src/alert_rules/prometheus/unreachable_slurmdbd.rule

diff --git a/src/alert_rules/prometheus/high_cpu_usage.rule b/src/alert_rules/prometheus/high_cpu_usage.rule
new file mode 100644
index 0000000..3c45530
--- /dev/null
+++ b/src/alert_rules/prometheus/high_cpu_usage.rule
@@ -0,0 +1,12 @@
+alert: SlurmHighCPUUsage
+expr: (slurm_cpu_load{%%juju_topology%%} / slurm_cpus_total{%%juju_topology%%}) * 100 > 90
+for: 5m
+labels:
+  severity: warning
+annotations:
+  summary: CPU usage for the cluster managed by the Slurm controller {{ $labels.juju_model }}/{{ $labels.juju_unit }} reached 90%
+  description: >
+    The total CPU usage for all nodes in the cluster managed by the Slurm controller
+    {{ $labels.juju_model }}/{{ $labels.juju_unit }} reached 90%. This could indicate that the cluster
+    is reaching its maximum computing capacity.
+    LABELS = {{ $labels }}
diff --git a/src/alert_rules/prometheus/unreachable_slurmdbd.rule b/src/alert_rules/prometheus/unreachable_slurmdbd.rule
new file mode 100644
index 0000000..585a340
--- /dev/null
+++ b/src/alert_rules/prometheus/unreachable_slurmdbd.rule
@@ -0,0 +1,12 @@
+alert: SlurmTooManyFailedDbdMessages
+expr: max_over_time(slurm_dbd_agent_queue_size{%%juju_topology%%}[10s]) > 50
+for: 1s
+labels:
+  severity: critical
+annotations:
+  summary: Slurm controller {{ $labels.juju_model }}/{{ $labels.juju_unit }} cannot reach SlurmDBD
+  description: >
+    The maximum amount of pending messages from the Slurm controller {{ $labels.juju_model }}/{{ $labels.juju_unit }}
+    to SlurmDBD exceeded 50 in the past 10 seconds. This can indicate a problem to reach SlurmDBD
+    or its backing database.
+    LABELS = {{ $labels }}