Skip to content

Commit

Permalink
Add CI and unit tests for prometheus rules using promtool (#505)
Browse files Browse the repository at this point in the history
- It's good to have unit tests for the Prometheus alerts and to check
that prometheus_alerts.yaml is a valid rules file.

- Using promtool, it is possible to check the rules and also run unit tests.

## How to test this PR

1. Install the prometheus snap:
```shell
sudo snap install prometheus
```

2. Run to check if the rules file is valid:
```shell
promtool check rules src/alert_rules/prometheus/*.yaml
```

3. Run the unit tests:
```shell
promtool test rules tests/unit/test_alert_rules/*.yaml
```
  • Loading branch information
gabrielcocenza authored Dec 18, 2024
1 parent 4416292 commit 78c448a
Show file tree
Hide file tree
Showing 4 changed files with 227 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ jobs:
- name: Run tests
run: tox run -e unit

promtool:
uses: ./.github/workflows/test_prometheus_rules.yaml

terraform-test:
name: Terraform - Lint and Simple Deployment
runs-on: ubuntu-latest
Expand Down
28 changes: 28 additions & 0 deletions .github/workflows/test_prometheus_rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Validates the Prometheus alert rule files with promtool and runs their
# promtool unit tests. Triggered either by another workflow (workflow_call)
# or manually (workflow_dispatch).
name: Test prometheus rules

on:
  workflow_call:
  workflow_dispatch:

concurrency:
  # In a called (reusable) workflow the github context belongs to the caller,
  # so github.workflow is the caller's workflow name. The static prefix keeps
  # this group distinct from a caller that builds its own group from the same
  # expression, which would otherwise make the two cancel each other.
  group: test-prometheus-rules-${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

jobs:
  promtool:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      # prometheus snap includes promtool
      - name: Install prometheus snap
        run: sudo snap install prometheus

      # Syntax/semantic validation of every rule file.
      - name: Check validity of prometheus alert rules
        run: |
          promtool check rules src/alert_rules/prometheus/*.yaml

      # Evaluates the rules against the synthetic series in the test files.
      - name: Run unit tests for prometheus alert rules
        run: |
          promtool test rules tests/unit/test_alert_rules/*.yaml
2 changes: 1 addition & 1 deletion src/alert_rules/prometheus/prometheus_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
"message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
"summary": "High Write Rejection Ratio - {{ $value }}%"
"expr": |
round( write:reject_ratio:rate2m * 100, 0.001 ) > 5
round( write:reject_ratio:rate2m * 100, 0.1) > 5
"for": "10m"
"labels":
"severity": "warning"
Expand Down
195 changes: 195 additions & 0 deletions tests/unit/test_alert_rules/test_opensearch_rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
# promtool unit tests for the OpenSearch alert rules.
# Run with: promtool test rules tests/unit/test_alert_rules/*.yaml
#
# Series values use promtool's expanding notation: 'VxN' is the value V
# followed by N more copies of V, one sample per `interval`.
rule_files:
  - ../../../src/alert_rules/prometheus/prometheus_alerts.yaml

evaluation_interval: 1m

tests:

  # Cluster status held at 2 -> OpenSearchClusterRed expected at 2m.
  - interval: 1m
    input_series:
      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
        values: '2x3'
    alert_rule_test:
      - eval_time: 2m
        alertname: OpenSearchClusterRed
        exp_alerts:
          - exp_labels:
              severity: critical
              cluster: opensearch-x7zb
            exp_annotations:
              message: "Cluster opensearch-x7zb health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
              summary: "Cluster health status is RED"

  # `up` held at 0 for a unit -> OpenSearchScrapeFailed expected at 5m.
  - interval: 1m
    input_series:
      - series: 'up{juju_unit="opensearch/1"}'
        values: '0x20'
    alert_rule_test:
      - eval_time: 5m
        alertname: OpenSearchScrapeFailed
        exp_alerts:
          - exp_labels:
              severity: critical
              juju_unit: opensearch/1
            exp_annotations:
              message: "Scrape on opensearch/1 failed. Ensure that the OpenSearch systemd service is healthy and that the unit is part of the cluster."
              summary: "OpenSearch exporter scrape failed"

  # Cluster status held at 1 with relocating shards ->
  # OpenSearchClusterYellowTemp expected at 20m.
  - interval: 1m
    input_series:
      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
        values: '1x21'
      - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="relocating"}'
        values: '35x21'
    alert_rule_test:
      - eval_time: 20m
        alertname: OpenSearchClusterYellowTemp
        exp_alerts:
          - exp_labels:
              severity: warning
              cluster: opensearch-x7zb
            exp_annotations:
              message: "Cluster opensearch-x7zb health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load."
              summary: "Cluster health status is temporarily YELLOW"

  # Cluster status held at 1 with unassigned shards ->
  # OpenSearchClusterYellow expected at 20m.
  - interval: 1m
    input_series:
      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
        values: '1x21'
      - series: 'opensearch_cluster_shards_number{cluster="opensearch-x7zb", type="unassigned"}'
        values: '35x21'
    alert_rule_test:
      - eval_time: 20m
        alertname: OpenSearchClusterYellow
        exp_alerts:
          - exp_labels:
              severity: warning
              cluster: opensearch-x7zb
            exp_annotations:
              message: "Cluster opensearch-x7zb health status has been YELLOW. Some replica shards are unassigned."
              summary: "Number of nodes in the cluster might be too low. Consider scaling the application to ensure that it has enough nodes to host all shards."

  # Write thread pool: rejected and completed counters both grow steadily.
  # NOTE(review): the expected 16.7% presumably comes from 2 / (2 + 10) per
  # minute - confirm against the write:reject_ratio:rate2m recording rule.
  - interval: 1m
    input_series:
      - series: 'opensearch_threadpool_threads_count{name="write", type="rejected", cluster="opensearch-x7zb", node="opensearch-0.fa9"}'
        values: '0 2 4 6 8 10 12 14 16 18 20'  # rejected counter, +2 per minute
      - series: 'opensearch_threadpool_threads_count{name="write", type="completed", cluster="opensearch-x7zb", node="opensearch-0.fa9"}'
        values: '100 110 120 130 140 150 160 170 180 190 200'  # completed counter, +10 per minute
    alert_rule_test:
      - eval_time: 11m
        alertname: OpenSearchWriteRequestsRejectionJumps
        exp_alerts:
          - exp_labels:
              severity: warning
              cluster: opensearch-x7zb
              node: opensearch-0.fa9
            exp_annotations:
              message: "High Write Rejection Ratio at opensearch-0.fa9 node in opensearch-x7zb cluster. This node may not be keeping up with the indexing speed."
              summary: "High Write Rejection Ratio - 16.7%"

  # About 14% of the disk free (86% used) ->
  # OpenSearchNodeDiskLowWatermarkReached expected at 5m.
  - interval: 1m
    input_series:
      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '69802552852x10'  # just 70 GB available
      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '498589663232x10'  # HD with 500 GB
    alert_rule_test:
      - eval_time: 5m
        alertname: OpenSearchNodeDiskLowWatermarkReached
        exp_alerts:
          - exp_labels:
              severity: alert
              cluster: opensearch-x7zb
              instance: "10.1.156.70:9200"
              node: opensearch-0.fa9
            exp_annotations:
              message: "Disk Low Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
              summary: "Disk Low Watermark Reached - disk saturation is 86%"

  # About 9% of the disk free (91% used) ->
  # OpenSearchNodeDiskHighWatermarkReached expected at 5m.
  - interval: 1m
    input_series:
      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '44873069690x10'  # just 45 GB available
      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '498589663232x10'  # HD with 500 GB
    alert_rule_test:
      - eval_time: 5m
        alertname: OpenSearchNodeDiskHighWatermarkReached
        # Both the low and the high watermark alerts are triggered by this
        # input; only the high watermark alert is listed below because
        # exp_alerts is matched against the alertname given above.
        exp_alerts:
          - exp_labels:
              severity: high
              cluster: opensearch-x7zb
              instance: "10.1.156.70:9200"
              node: opensearch-0.fa9
            exp_annotations:
              message: "Disk High Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
              summary: "Disk High Watermark Reached - disk saturation is 91%"

  # JVM heap held at 76% -> OpenSearchJVMHeapUseHigh expected at 10m.
  - interval: 1m
    input_series:
      - series: 'opensearch_jvm_mem_heap_used_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '76x60'
    alert_rule_test:
      - eval_time: 10m
        alertname: OpenSearchJVMHeapUseHigh
        exp_alerts:
          - exp_labels:
              severity: alert
              cluster: opensearch-x7zb
              instance: "10.1.156.70:9200"
              node: opensearch-0.fa9
            exp_annotations:
              message: "JVM Heap usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 76%."
              summary: "JVM Heap usage on the node is high"

  # Host CPU held at 91% -> OpenSearchHostSystemCPUHigh expected at 1m.
  - interval: 1m
    input_series:
      - series: 'opensearch_os_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '91x6'
    alert_rule_test:
      - eval_time: 1m
        alertname: OpenSearchHostSystemCPUHigh
        exp_alerts:
          - exp_labels:
              severity: alert
              cluster: opensearch-x7zb
              instance: "10.1.156.70:9200"
              node: opensearch-0.fa9
            exp_annotations:
              message: "System CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
              summary: "System CPU usage is high"

  # Process CPU held at 91% -> OpenSearchProcessCPUHigh expected at 1m.
  - interval: 1m
    input_series:
      - series: 'opensearch_process_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
        values: '91x6'
    alert_rule_test:
      - eval_time: 1m
        alertname: OpenSearchProcessCPUHigh
        exp_alerts:
          - exp_labels:
              severity: alert
              cluster: opensearch-x7zb
              instance: "10.1.156.70:9200"
              node: opensearch-0.fa9
            exp_annotations:
              message: "OSE process CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
              summary: "OSE process CPU usage is high"

  # Indexing throttle flag held at 1 -> OpenSearchThrottling expected at 5m.
  - interval: 1m
    input_series:
      - series: 'opensearch_indices_indexing_is_throttled_bool{cluster="opensearch-x7zb"}'
        values: '1x60'
    alert_rule_test:
      - eval_time: 5m
        alertname: OpenSearchThrottling
        exp_alerts:
          - exp_labels:
              severity: warning
              cluster: opensearch-x7zb
            exp_annotations:
              message: "Cluster opensearch-x7zb is throttling. Please optimize queries and indexing patterns or consider scale the application."
              summary: "OpenSearch Indexing Throttle"

0 comments on commit 78c448a

Please sign in to comment.