From 453fb0e5ffe58d9f6a56418ad13aa7621a713932 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 4 Dec 2024 20:18:43 -0300 Subject: [PATCH] Separate alerts when cluster is in yellow state (#506) When a cluster is in yellow state, it's because some shards are not active. This state can be reached under heavy load; however, if there are unassigned shards while the cluster is yellow, some replicas are not allocated and it might be necessary to add new nodes in order to host all shards. --------- Co-authored-by: Mehdi Bendriss --- .../prometheus/prometheus_alerts.yaml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index b51ead8af..ffdb4c122 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -35,16 +35,26 @@ "labels": "severity": "critical" - - "alert": "OpenSearchClusterYellow" + - "alert": "OpenSearchClusterYellowTemp" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some cluster replicas shards are not allocated." - "summary": "Cluster health status is YELLOW" + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load." + "summary": "Cluster health status is temporarily YELLOW" "expr": | - sum by (cluster) (opensearch_cluster_status == 1) + sum by (cluster) (opensearch_cluster_shards_number{type=~"relocating|initializing"}) > 0 and on(cluster) opensearch_cluster_status == 1 "for": "20m" "labels": "severity": "warning" + - "alert": "OpenSearchClusterYellow" + "annotations": + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW. Some replica shards are unassigned." + "summary": "Number of nodes in the cluster might be too low. 
Consider scaling the application to ensure that it has enough nodes to host all shards." + "expr": | + sum by (cluster) (opensearch_cluster_shards_number{type="unassigned"}) > 0 and on(cluster) opensearch_cluster_status == 1 + "for": "10m" + "labels": + "severity": "warning" + - "alert": "OpenSearchWriteRequestsRejectionJumps" "annotations": "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."