diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 600441cfd..22b5f32d0 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -22,16 +22,26 @@ "labels": "severity": "critical" - - "alert": "OpenSearchClusterYellow" + - "alert": "OpenSearchClusterYellowTemp" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some cluster replicas shards are not allocated." - "summary": "Cluster health status is YELLOW" + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Shards are still relocating or initializing. The cluster might be under heavy load." + "summary": "Cluster health status is temporarily YELLOW" "expr": | - sum by (cluster) (opensearch_cluster_status == 1) + sum by (cluster) (opensearch_cluster_shards_number{type=~"relocating|initializing"}) > 0 and on(cluster) opensearch_cluster_status == 1 "for": "20m" "labels": "severity": "warning" + - "alert": "OpenSearchClusterYellow" + "annotations": + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 10m. Some replica shards are unassigned. The number of nodes in the cluster might be too low; consider scaling the application to ensure that it has enough nodes to host all shards." + "summary": "Cluster health status is YELLOW" + "expr": | + sum by (cluster) (opensearch_cluster_shards_number{type="unassigned"}) > 0 and on(cluster) opensearch_cluster_status == 1 + "for": "10m" + "labels": + "severity": "warning" + - "alert": "OpenSearchBulkRequestsRejectionJumps" "annotations": "message": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."