From 78c448abe9ed4dd82708c983537b466da8c5f290 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 18 Dec 2024 14:08:17 -0300 Subject: [PATCH 1/2] Add CI and unit tests for prometheus rules using promtool (#505) - It's good to have unit tests for the prometheus alerts and also to check that prometheus_alerts.yaml is a valid file. - Using promtool, it is possible to check the rules and also run unit tests. ## How to test this PR 1. Install the prometheus snap: ```shell sudo snap install prometheus ``` 2. Check that the rules file is valid: ```shell promtool check rules src/alert_rules/prometheus/*.yaml ``` 3. Run the unit tests: ``` promtool test rules tests/unit/test_alert_rules/*.yaml ``` --- .github/workflows/ci.yaml | 3 + .github/workflows/test_prometheus_rules.yaml | 28 +++ .../prometheus/prometheus_alerts.yaml | 2 +- .../test_opensearch_rules.yaml | 195 ++++++++++++++++++ 4 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test_prometheus_rules.yaml create mode 100644 tests/unit/test_alert_rules/test_opensearch_rules.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 581a1054a..9ab29bde0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,6 +32,9 @@ jobs: - name: Run tests run: tox run -e unit + promtool: + uses: ./.github/workflows/test_prometheus_rules.yaml + terraform-test: name: Terraform - Lint and Simple Deployment runs-on: ubuntu-latest diff --git a/.github/workflows/test_prometheus_rules.yaml b/.github/workflows/test_prometheus_rules.yaml new file mode 100644 index 000000000..2af5a7c2e --- /dev/null +++ b/.github/workflows/test_prometheus_rules.yaml @@ -0,0 +1,28 @@ +name: Test prometheus rules + +on: + workflow_call: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +jobs: + promtool: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + # 
prometheus snap includes promtool + - name: Install prometheus snap + run: sudo snap install prometheus + + - name: Check validity of prometheus alert rules + run: | + promtool check rules src/alert_rules/prometheus/*.yaml + + - name: Run unit tests for prometheus alert rules + run: | + promtool test rules tests/unit/test_alert_rules/*.yaml diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 630c567f8..332e40579 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -60,7 +60,7 @@ "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." "summary": "High Write Rejection Ratio - {{ $value }}%" "expr": | - round( write:reject_ratio:rate2m * 100, 0.001 ) > 5 + round( write:reject_ratio:rate2m * 100, 0.1) > 5 "for": "10m" "labels": "severity": "warning" diff --git a/tests/unit/test_alert_rules/test_opensearch_rules.yaml b/tests/unit/test_alert_rules/test_opensearch_rules.yaml new file mode 100644 index 000000000..7a6b8f719 --- /dev/null +++ b/tests/unit/test_alert_rules/test_opensearch_rules.yaml @@ -0,0 +1,195 @@ +rule_files: + - ../../../src/alert_rules/prometheus/prometheus_alerts.yaml + +evaluation_interval: 1m + +tests: + + - interval: 1m + input_series: + - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}' + values: '2x3' + alert_rule_test: + - eval_time: 2m + alertname: OpenSearchClusterRed + exp_alerts: + - exp_labels: + severity: critical + cluster: opensearch-x7zb + exp_annotations: + message: "Cluster opensearch-x7zb health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
+ summary: "Cluster health status is RED" + + - interval: 1m + input_series: + - series: 'up{juju_unit="opensearch/1"}' + values: '0x20' + alert_rule_test: + - eval_time: 5m + alertname: OpenSearchScrapeFailed + exp_alerts: + - exp_labels: + severity: critical + juju_unit: opensearch/1 + exp_annotations: + message: "Scrape on opensearch/1 failed. Ensure that the OpenSearch systemd service is healthy and that the unit is part of the cluster." + summary: "OpenSearch exporter scrape failed" + + - interval: 1m + input_series: + - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}' + values: '1x21' + - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="relocating"}' + values: '35x21' + alert_rule_test: + - eval_time: 20m + alertname: OpenSearchClusterYellowTemp + exp_alerts: + - exp_labels: + severity: warning + cluster: opensearch-x7zb + exp_annotations: + message: "Cluster opensearch-x7zb health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load." + summary: "Cluster health status is temporarily YELLOW" + + - interval: 1m + input_series: + - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}' + values: '1x21' + - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="unassigned"}' + values: '35x21' + alert_rule_test: + - eval_time: 20m + alertname: OpenSearchClusterYellow + exp_alerts: + - exp_labels: + severity: warning + cluster: opensearch-x7zb + exp_annotations: + message: "Cluster opensearch-x7zb health status has been YELLOW. Some replica shards are unassigned." + summary: "Number of nodes in the cluster might be too low. Consider scaling the application to ensure that it has enough nodes to host all shards." 
+ + - interval: 1m + input_series: + - series: 'opensearch_threadpool_threads_count{name="write", type="rejected", cluster="opensearch-x7zb", node="opensearch-0.fa9"}' + values: '0 2 4 6 8 10 12 14 16 18 20' # Simulates increasing rejection rates + + - series: 'opensearch_threadpool_threads_count{name="write", type="completed", cluster="opensearch-x7zb", node="opensearch-0.fa9"}' + values: '100 110 120 130 140 150 160 170 180 190 200' # Total requests, increasing over time + + alert_rule_test: + - eval_time: 11m + alertname: OpenSearchWriteRequestsRejectionJumps + exp_alerts: + - exp_labels: + severity: warning + cluster: opensearch-x7zb + node: opensearch-0.fa9 + exp_annotations: + message: "High Write Rejection Ratio at opensearch-0.fa9 node in opensearch-x7zb cluster. This node may not be keeping up with the indexing speed." + summary: "High Write Rejection Ratio - 16.7%" + + - interval: 1m + input_series: + - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '69802552852x10' # just 70 GB available + - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '498589663232x10' # HD with 500 GB + alert_rule_test: + - eval_time: 5m + alertname: OpenSearchNodeDiskLowWatermarkReached + exp_alerts: + - exp_labels: + severity: alert + cluster: opensearch-x7zb + instance: 10.1.156.70:9200 + node: opensearch-0.fa9 + exp_annotations: + message: "Disk Low Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." 
+ summary: "Disk Low Watermark Reached - disk saturation is 86%" + + - interval: 1m + input_series: + - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '44873069690x10' # just 45 GB available + - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '498589663232x10' # HD with 500 GB + alert_rule_test: + - eval_time: 5m + alertname: OpenSearchNodeDiskHighWatermarkReached + # both low and high water mark alerts are triggered + exp_alerts: + - exp_labels: + severity: high + cluster: opensearch-x7zb + instance: 10.1.156.70:9200 + node: opensearch-0.fa9 + exp_annotations: + message: "Disk High Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." + summary: "Disk High Watermark Reached - disk saturation is 91%" + + - interval: 1m + input_series: + - series: 'opensearch_jvm_mem_heap_used_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '76x60' + alert_rule_test: + - eval_time: 10m + alertname: OpenSearchJVMHeapUseHigh + exp_alerts: + - exp_labels: + severity: alert + cluster: opensearch-x7zb + instance: 10.1.156.70:9200 + node: opensearch-0.fa9 + exp_annotations: + message: "JVM Heap usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 76%." 
+ summary: "JVM Heap usage on the node is high" + + - interval: 1m + input_series: + - series: 'opensearch_os_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '91x6' + alert_rule_test: + - eval_time: 1m + alertname: OpenSearchHostSystemCPUHigh + exp_alerts: + - exp_labels: + severity: alert + cluster: opensearch-x7zb + instance: 10.1.156.70:9200 + node: opensearch-0.fa9 + exp_annotations: + message: "System CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%" + summary: "System CPU usage is high" + + - interval: 1m + input_series: + - series: 'opensearch_process_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}' + values: '91x6' + alert_rule_test: + - eval_time: 1m + alertname: OpenSearchProcessCPUHigh + exp_alerts: + - exp_labels: + severity: alert + cluster: opensearch-x7zb + instance: 10.1.156.70:9200 + node: opensearch-0.fa9 + exp_annotations: + message: "OSE process CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%" + summary: "OSE process CPU usage is high" + + - interval: 1m + input_series: + - series: 'opensearch_indices_indexing_is_throttled_bool{cluster="opensearch-x7zb"}' + values: '1x60' + alert_rule_test: + - eval_time: 5m + alertname: OpenSearchThrottling + exp_alerts: + - exp_labels: + severity: warning + cluster: opensearch-x7zb + exp_annotations: + message: "Cluster opensearch-x7zb is throttling. Please optimize queries and indexing patterns or consider scale the application." 
+ summary: "OpenSearch Indexing Throttle" From c11cd1c9f557a4cc7870660ffbd0fd79d9225d66 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 18 Dec 2024 14:50:05 -0300 Subject: [PATCH 2/2] Both ci.yaml and test_prometheus_rules.yaml were using same lock (#524) Both files had the same concurrency configuration, which blocked the CI with the message: Canceling since a deadlock for concurrency group 'Release to latest/edge-refs/heads/2/edge' was detected between 'ci-tests' and 'ci-tests.promtool' This PR moves the promtool job entirely into the ci.yaml file. --- .github/workflows/ci.yaml | 17 +++++++++++- .github/workflows/test_prometheus_rules.yaml | 28 -------------------- 2 files changed, 16 insertions(+), 29 deletions(-) delete mode 100644 .github/workflows/test_prometheus_rules.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9ab29bde0..1e1862ad7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,7 +33,22 @@ jobs: run: tox run -e unit promtool: - uses: ./.github/workflows/test_prometheus_rules.yaml + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + # prometheus snap includes promtool + - name: Install prometheus snap + run: sudo snap install prometheus + + - name: Check validity of prometheus alert rules + run: | + promtool check rules src/alert_rules/prometheus/*.yaml + + - name: Run unit tests for prometheus alert rules + run: | + promtool test rules tests/unit/test_alert_rules/*.yaml terraform-test: name: Terraform - Lint and Simple Deployment diff --git a/.github/workflows/test_prometheus_rules.yaml b/.github/workflows/test_prometheus_rules.yaml deleted file mode 100644 index 2af5a7c2e..000000000 --- a/.github/workflows/test_prometheus_rules.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: Test prometheus rules - -on: - workflow_call: - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: true - 
-jobs: - promtool: - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - # prometheus snap includes promtool - - name: Install prometheus snap - run: sudo snap install prometheus - - - name: Check validity of prometheus alert rules - run: | - promtool check rules src/alert_rules/prometheus/*.yaml - - - name: Run unit tests for prometheus alert rules - run: | - promtool test rules tests/unit/test_alert_rules/*.yaml