diff --git a/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml b/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml
index b9a123d3..20ef8355 100644
--- a/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml
+++ b/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml
@@ -121,6 +121,7 @@ alertmanager:
       - name: 'null'
 {% if monitoring_slack_api_url is defined and monitoring_slack_notification_channel is defined %}
       - name: slack
+        repeat_interval: 1h
         slack_configs:
           - channel: "{{ monitoring_slack_notification_channel }}"
             send_resolved: true
@@ -207,7 +208,8 @@ additionalPrometheusRulesMap:
             annotations:
               description: "Partition {{ $labels.partition }} has {{ $value }} DEAD Machines"
           - alert: MachineCapacityLow
-            expr: (avg(metal_partition_capacity_total{size!="unknown"})by (partition, size) - avg(metal_partition_capacity_free{size!="unknown"}) by (partition, size) ) / avg(metal_partition_capacity_total{size!="unknown"})by (partition, size) *100 > 80
+            # Fires when under 10% of a size's machines are free; the "> 10" guard is applied to the total only, so sizes with 10 or fewer machines are skipped while a low free count can still alert.
+            expr: avg(metal_partition_capacity_free{size!="unknown"}) by (partition, size) / avg(metal_partition_capacity_total{size!="unknown"} > 10) by (partition, size) < 0.10
             for: 10m
             labels:
               severity: "warning"