From 78c448abe9ed4dd82708c983537b466da8c5f290 Mon Sep 17 00:00:00 2001
From: Gabriel Cocenza <gabriel.cocenza@canonical.com>
Date: Wed, 18 Dec 2024 14:08:17 -0300
Subject: [PATCH 1/2] Add CI and unit tests for prometheus rules using promtool
 (#505)

- It's good to have unit tests for the Prometheus alerts and also to check
that prometheus_alerts.yaml is a valid rules file.

- Using promtool, it is possible to check the rules and also run unit tests.

## How to test this PR

1. Install the prometheus snap:
```shell
sudo snap install prometheus
```

2. Run to check if the rules file is valid:
```shell
promtool check rules src/alert_rules/prometheus/*.yaml
```

3. Run the unit tests:
```shell
promtool test rules tests/unit/test_alert_rules/*.yaml
```
---
 .github/workflows/ci.yaml                     |   3 +
 .github/workflows/test_prometheus_rules.yaml  |  28 +++
 .../prometheus/prometheus_alerts.yaml         |   2 +-
 .../test_opensearch_rules.yaml                | 195 ++++++++++++++++++
 4 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/test_prometheus_rules.yaml
 create mode 100644 tests/unit/test_alert_rules/test_opensearch_rules.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 581a1054a..9ab29bde0 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -32,6 +32,9 @@ jobs:
       - name: Run tests
         run: tox run -e unit
 
+  promtool:
+    uses: ./.github/workflows/test_prometheus_rules.yaml
+
   terraform-test:
     name: Terraform - Lint and Simple Deployment
     runs-on: ubuntu-latest
diff --git a/.github/workflows/test_prometheus_rules.yaml b/.github/workflows/test_prometheus_rules.yaml
new file mode 100644
index 000000000..2af5a7c2e
--- /dev/null
+++ b/.github/workflows/test_prometheus_rules.yaml
@@ -0,0 +1,28 @@
+name: Test prometheus rules
+
+on:
+  workflow_call:
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  promtool:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      # prometheus snap includes promtool
+      - name: Install prometheus snap
+        run: sudo snap install prometheus
+
+      - name: Check validity of prometheus alert rules
+        run: |
+          promtool check rules src/alert_rules/prometheus/*.yaml
+
+      - name: Run unit tests for prometheus alert rules
+        run: |
+          promtool test rules tests/unit/test_alert_rules/*.yaml
diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml
index 630c567f8..332e40579 100644
--- a/src/alert_rules/prometheus/prometheus_alerts.yaml
+++ b/src/alert_rules/prometheus/prometheus_alerts.yaml
@@ -60,7 +60,7 @@
       "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
       "summary": "High Write Rejection Ratio - {{ $value }}%"
     "expr": |
-      round( write:reject_ratio:rate2m * 100, 0.001 ) > 5
+      round( write:reject_ratio:rate2m * 100, 0.1) > 5
     "for": "10m"
     "labels":
       "severity": "warning"
diff --git a/tests/unit/test_alert_rules/test_opensearch_rules.yaml b/tests/unit/test_alert_rules/test_opensearch_rules.yaml
new file mode 100644
index 000000000..7a6b8f719
--- /dev/null
+++ b/tests/unit/test_alert_rules/test_opensearch_rules.yaml
@@ -0,0 +1,195 @@
+rule_files:
+  - ../../../src/alert_rules/prometheus/prometheus_alerts.yaml
+
+evaluation_interval: 1m
+
+tests:
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '2x3'
+    alert_rule_test:
+      - eval_time: 2m
+        alertname: OpenSearchClusterRed
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
+              summary: "Cluster health status is RED"
+
+  - interval: 1m
+    input_series:
+      - series: 'up{juju_unit="opensearch/1"}'
+        values: '0x20'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchScrapeFailed
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              juju_unit: opensearch/1
+            exp_annotations:
+              message: "Scrape on opensearch/1 failed. Ensure that the OpenSearch systemd service is healthy and that the unit is part of the cluster."
+              summary: "OpenSearch exporter scrape failed"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '1x21'
+      - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="relocating"}'
+        values: '35x21'
+    alert_rule_test:
+      - eval_time: 20m
+        alertname: OpenSearchClusterYellowTemp
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load."
+              summary: "Cluster health status is temporarily YELLOW"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '1x21'
+      - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="unassigned"}'
+        values: '35x21'
+    alert_rule_test:
+      - eval_time: 20m
+        alertname: OpenSearchClusterYellow
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been YELLOW. Some replica shards are unassigned."
+              summary: "Number of nodes in the cluster might be too low. Consider scaling the application to ensure that it has enough nodes to host all shards."
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_threadpool_threads_count{name="write", type="rejected", cluster="opensearch-x7zb", node="opensearch-0.fa9"}'
+        values: '0 2 4 6 8 10 12 14 16 18 20'   # rejected write requests: counter grows by 2 per minute
+
+      - series: 'opensearch_threadpool_threads_count{name="write", type="completed", cluster="opensearch-x7zb", node="opensearch-0.fa9"}'
+        values: '100 110 120 130 140 150 160 170 180 190 200' # completed write requests: counter grows by 10 per minute
+
+    alert_rule_test:
+      - eval_time: 11m
+        alertname: OpenSearchWriteRequestsRejectionJumps
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "High Write Rejection Ratio at opensearch-0.fa9 node in opensearch-x7zb cluster. This node may not be keeping up with the indexing speed."
+              summary: "High Write Rejection Ratio - 16.7%"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '69802552852x10' # ~70 GB free, i.e. 14% of the total below (86% used)
+      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '498589663232x10' # total disk size, ~500 GB
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchNodeDiskLowWatermarkReached
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk Low Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
+              summary: "Disk Low Watermark Reached - disk saturation is 86%"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '44873069690x10' # ~45 GB free, i.e. 9% of the total below (91% used)
+      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '498589663232x10' # total disk size, ~500 GB
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchNodeDiskHighWatermarkReached
+        # this input triggers both the low and the high watermark alerts; only the high one is asserted here
+        exp_alerts:
+          - exp_labels:
+              severity: high
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk High Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
+              summary: "Disk High Watermark Reached - disk saturation is 91%"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_jvm_mem_heap_used_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '76x60'
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: OpenSearchJVMHeapUseHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "JVM Heap usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 76%."
+              summary: "JVM Heap usage on the node is high"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_os_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '91x6'
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: OpenSearchHostSystemCPUHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "System CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
+              summary: "System CPU usage is high"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_process_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '91x6'
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: OpenSearchProcessCPUHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "OSE process CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
+              summary: "OSE process CPU usage is high"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_indices_indexing_is_throttled_bool{cluster="opensearch-x7zb"}'
+        values: '1x60'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchThrottling
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb is throttling. Please optimize queries and indexing patterns or consider scale the application."
+              summary: "OpenSearch Indexing Throttle"

From c11cd1c9f557a4cc7870660ffbd0fd79d9225d66 Mon Sep 17 00:00:00 2001
From: Gabriel Cocenza <gabriel.cocenza@canonical.com>
Date: Wed, 18 Dec 2024 14:50:05 -0300
Subject: [PATCH 2/2] Both ci.yaml and test_prometheus_rules.yaml were using
 same lock (#524)

Both files had the same concurrency configuration, which blocked the
CI with the message: "Canceling since a deadlock for concurrency group
'Release to latest/edge-refs/heads/2/edge' was detected between
'ci-tests' and 'ci-tests.promtool'".

This PR moves the promtool job completely into the ci.yaml file
---
 .github/workflows/ci.yaml                    | 17 +++++++++++-
 .github/workflows/test_prometheus_rules.yaml | 28 --------------------
 2 files changed, 16 insertions(+), 29 deletions(-)
 delete mode 100644 .github/workflows/test_prometheus_rules.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 9ab29bde0..1e1862ad7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -33,7 +33,22 @@ jobs:
         run: tox run -e unit
 
   promtool:
-    uses: ./.github/workflows/test_prometheus_rules.yaml
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      # the prometheus snap ships promtool, which the two steps below run
+      - name: Install prometheus snap
+        run: sudo snap install prometheus
+
+      - name: Check validity of prometheus alert rules
+        run: |
+          promtool check rules src/alert_rules/prometheus/*.yaml
+
+      - name: Run unit tests for prometheus alert rules
+        run: |
+          promtool test rules tests/unit/test_alert_rules/*.yaml
 
   terraform-test:
     name: Terraform - Lint and Simple Deployment
diff --git a/.github/workflows/test_prometheus_rules.yaml b/.github/workflows/test_prometheus_rules.yaml
deleted file mode 100644
index 2af5a7c2e..000000000
--- a/.github/workflows/test_prometheus_rules.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: Test prometheus rules
-
-on:
-  workflow_call:
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  promtool:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v4
-
-      # prometheus snap includes promtool
-      - name: Install prometheus snap
-        run: sudo snap install prometheus
-
-      - name: Check validity of prometheus alert rules
-        run: |
-          promtool check rules src/alert_rules/prometheus/*.yaml
-
-      - name: Run unit tests for prometheus alert rules
-        run: |
-          promtool test rules tests/unit/test_alert_rules/*.yaml