From df92c6adef871370bc76191442770f746af9ce51 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Thu, 21 Nov 2024 11:09:06 -0300 Subject: [PATCH 1/3] Separate alerts when cluster is in yellow state When a cluster is in yellow state, it's because some shards are not active. This state can be reached under heavy load; however, if there are unassigned shards while the cluster is yellow, that means that some replicas are not allocated and it might be necessary to add new nodes in order to host all shards. --- .../prometheus/prometheus_alerts.yaml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 7da5e0449..3c13a425f 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -11,16 +11,26 @@ "labels": "severity": "critical" - - "alert": "OpenSearchClusterYellow" + - "alert": "OpenSearchClusterYellowTemp" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some cluster replicas shards are not allocated." - "summary": "Cluster health status is YELLOW" + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Cluster might be under heavy load." + "summary": "Cluster health status is temporarily YELLOW" "expr": | - sum by (cluster) (opensearch_cluster_status == 1) + sum by (cluster, instance) (opensearch_cluster_shards_number{type=~"relocating|initializing"}) > 0 and on(cluster, instance) opensearch_cluster_status == 1 "for": "20m" "labels": "severity": "warning" + - "alert": "OpenSearchClusterYellow" + "annotations": + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW with some replica shards unassigned." + "summary": "Number of nodes in the cluster might be too low. Consider scaling the application to ensure that it has enough nodes to host all shards." 
+ "expr": | + sum by (cluster, instance) (opensearch_cluster_shards_number{type="unassigned"}) > 0 and on(cluster, instance) opensearch_cluster_status == 1 + "for": "10m" + "labels": + "severity": "warning" + - "alert": "OpenSearchBulkRequestsRejectionJumps" "annotations": "message": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." From e52dc5d988bd75feb2035bbbe7f4fef4499f9b2b Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 27 Nov 2024 16:34:08 -0300 Subject: [PATCH 2/3] - not break by instance - fix messages --- src/alert_rules/prometheus/prometheus_alerts.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 3c13a425f..f17522258 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -13,20 +13,20 @@ - "alert": "OpenSearchClusterYellowTemp" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Cluster might be under heavy load." + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Shards are still still relocating or initializing. The cluster might be under heavy load." "summary": "Cluster health status is temporarily YELLOW" "expr": | - sum by (cluster, instance) (opensearch_cluster_shards_number{type=~"relocating|initializing"}) > 0 and on(cluster, instance) opensearch_cluster_status == 1 + sum by (cluster) (opensearch_cluster_shards_number{type=~"relocating|initializing"}) > 0 and on(cluster) opensearch_cluster_status == 1 "for": "20m" "labels": "severity": "warning" - "alert": "OpenSearchClusterYellow" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW with some replica shards unassigned." 
+ "message": "Cluster {{ $labels.cluster }} health status has been YELLOW. Some replica shards are unassigned." "summary": "Number of nodes in the cluster might be too low. Consider scaling the application to ensure that it has enough nodes to host all shards." "expr": | - sum by (cluster, instance) (opensearch_cluster_shards_number{type="unassigned"}) > 0 and on(cluster, instance) opensearch_cluster_status == 1 + sum by (cluster) (opensearch_cluster_shards_number{type="unassigned"}) > 0 and on(cluster) opensearch_cluster_status == 1 "for": "10m" "labels": "severity": "warning" From ed733108728e39b0f876abddd99fc84d996e9e48 Mon Sep 17 00:00:00 2001 From: Mehdi Bendriss Date: Tue, 3 Dec 2024 21:26:30 +0100 Subject: [PATCH 3/3] Update src/alert_rules/prometheus/prometheus_alerts.yaml --- src/alert_rules/prometheus/prometheus_alerts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 662c3baf8..22b5f32d0 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -24,7 +24,7 @@ - "alert": "OpenSearchClusterYellowTemp" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Shards are still still relocating or initializing. The cluster might be under heavy load." + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load." "summary": "Cluster health status is temporarily YELLOW" "expr": | sum by (cluster) (opensearch_cluster_shards_number{type=~"relocating|initializing"}) > 0 and on(cluster) opensearch_cluster_status == 1