Merge branch '2/edge' into opensearch-throttling

canonical · Dec 18, 2024 · d1d11e6 · d1d11e6
2 parents 5cfcdc0 + c11cd1c
commit d1d11e6
Show file tree

Hide file tree

Showing 3 changed files with 214 additions and 1 deletion.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -32,6 +32,24 @@ jobs:
       - name: Run tests
         run: tox run -e unit
 
+  promtool:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      # prometheus snap includes promtool
+      - name: Install prometheus snap
+        run: sudo snap install prometheus
+
+      - name: Check validity of prometheus alert rules
+        run: |
+          promtool check rules src/alert_rules/prometheus/*.yaml
+
+      - name: Run unit tests for prometheus alert rules
+        run: |
+          promtool test rules tests/unit/test_alert_rules/*.yaml
+
   terraform-test:
     name: Terraform - Lint and Simple Deployment
     runs-on: ubuntu-latest

diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml
@@ -60,7 +60,7 @@
       "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
       "summary": "High Write Rejection Ratio - {{ $value }}%"
     "expr": |
-      round( write:reject_ratio:rate2m * 100, 0.001 ) > 5
+      round( write:reject_ratio:rate2m * 100, 0.1) > 5
     "for": "10m"
     "labels":
       "severity": "warning"

diff --git a/tests/unit/test_alert_rules/test_opensearch_rules.yaml b/tests/unit/test_alert_rules/test_opensearch_rules.yaml
@@ -0,0 +1,195 @@
+rule_files:
+  - ../../../src/alert_rules/prometheus/prometheus_alerts.yaml
+
+evaluation_interval: 1m
+
+tests:
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '2x3'
+    alert_rule_test:
+      - eval_time: 2m
+        alertname: OpenSearchClusterRed
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
+              summary: "Cluster health status is RED"
+
+  - interval: 1m
+    input_series:
+      - series: 'up{juju_unit="opensearch/1"}'
+        values: '0x20'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchScrapeFailed
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              juju_unit: opensearch/1
+            exp_annotations:
+              message: "Scrape on opensearch/1 failed. Ensure that the OpenSearch systemd service is healthy and that the unit is part of the cluster."
+              summary: "OpenSearch exporter scrape failed"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '1x21'
+      - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="relocating"}'
+        values: '35x21'
+    alert_rule_test:
+      - eval_time: 20m
+        alertname: OpenSearchClusterYellowTemp
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load."
+              summary: "Cluster health status is temporarily YELLOW"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '1x21'
+      - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="unassigned"}'
+        values: '35x21'
+    alert_rule_test:
+      - eval_time: 20m
+        alertname: OpenSearchClusterYellow
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been YELLOW. Some replica shards are unassigned."
+              summary: "Number of nodes in the cluster might be too low. Consider scaling the application to ensure that it has enough nodes to host all shards."
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_threadpool_threads_count{name="write", type="rejected", cluster="opensearch-x7zb", node="opensearch-0.fa9"}'
+        values: '0 2 4 6 8 10 12 14 16 18 20'   # Simulates increasing rejection rates
+
+      - series: 'opensearch_threadpool_threads_count{name="write", type="completed", cluster="opensearch-x7zb", node="opensearch-0.fa9"}'
+        values: '100 110 120 130 140 150 160 170 180 190 200' # Total requests, increasing over time
+
+    alert_rule_test:
+      - eval_time: 11m
+        alertname: OpenSearchWriteRequestsRejectionJumps
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "High Write Rejection Ratio at opensearch-0.fa9 node in opensearch-x7zb cluster. This node may not be keeping up with the indexing speed."
+              summary: "High Write Rejection Ratio - 16.7%"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '69802552852x10' # just 70 GB available
+      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '498589663232x10' # HD with 500 GB
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchNodeDiskLowWatermarkReached
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk Low Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
+              summary: "Disk Low Watermark Reached - disk saturation is 86%"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '44873069690x10' # just 45 GB available
+      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '498589663232x10' # HD with 500 GB
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchNodeDiskHighWatermarkReached
+        # both low and high water mark alerts are triggered
+        exp_alerts:
+          - exp_labels:
+              severity: high
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk High Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
+              summary: "Disk High Watermark Reached - disk saturation is 91%"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_jvm_mem_heap_used_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '76x60'
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: OpenSearchJVMHeapUseHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "JVM Heap usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 76%."
+              summary: "JVM Heap usage on the node is high"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_os_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '91x6'
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: OpenSearchHostSystemCPUHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "System CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
+              summary: "System CPU usage is high"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_process_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '91x6'
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: OpenSearchProcessCPUHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "OSE process CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
+              summary: "OSE process CPU usage is high"
+
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_indices_indexing_is_throttled_bool{cluster="opensearch-x7zb"}'
+        values: '1x60'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchThrottling
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb is throttling. Please optimize queries and indexing patterns or consider scale the application."
+              summary: "OpenSearch Indexing Throttle"