From 9de83391fc407728baa84f809427c647f27b4afa Mon Sep 17 00:00:00 2001
From: Gero Posmyk-Leinemann
Date: Thu, 7 Nov 2024 08:42:03 +0100
Subject: [PATCH] [observability] Introduce "ReplicaUnavailable" alerts (#20344)

* [observability] ReplicaMismatch: Improve "the mismatch is 1.0" message

* [observability] Introduce "ReplicasUnavailable" alert (as warning for now)

Note on the ReplicaMismatch descriptions: those rules evaluate
"kube_deployment_spec_replicas != kube_deployment_status_replicas_available".
A PromQL comparison filter keeps the left-hand sample, so $value is the
desired replica count, not the number of missing replicas. The description
therefore reports it as the desired count.

The new ReplicaUnavailable rules fire when
kube_deployment_status_replicas_unavailable stays above 0 for 10 minutes;
there $value is the number of unavailable replicas.
---
 .../workspace/rules/central/image-builder.yaml     | 13 ++++++++++++-
 .../workspace/rules/central/node-labeler.yaml      | 13 ++++++++++++-
 .../mixins/workspace/rules/central/ws-manager.yaml | 13 ++++++++++++-
 .../mixins/workspace/rules/central/ws-proxy.yaml   | 13 ++++++++++++-
 4 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/operations/observability/mixins/workspace/rules/central/image-builder.yaml b/operations/observability/mixins/workspace/rules/central/image-builder.yaml
index 0067590e494726..78e22904410f98 100644
--- a/operations/observability/mixins/workspace/rules/central/image-builder.yaml
+++ b/operations/observability/mixins/workspace/rules/central/image-builder.yaml
@@ -47,7 +47,18 @@ spec:
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
         summary: Desired number of replicas for image-builder-mk3 are not available in cluster {{ $labels.cluster }}
-        description: The mismatch is {{ printf "%.2f" $value }}
+        description: 'Desired number of replicas for image-builder-mk3 are not available in cluster {{ $labels.cluster }} (desired: {{ printf "%.2f" $value }})'
       expr: |
         kube_deployment_spec_replicas{deployment="image-builder-mk3", cluster!~"ephemeral.*"} != kube_deployment_status_replicas_available{deployment="image-builder-mk3", cluster!~"ephemeral.*"}
       for: 3m
+    - alert: GitpodImageBuilderMk3ReplicaUnavailable
+      labels:
+        # TODO(gpl): warning for now, to set it up and fine-tune it
+        severity: warning
+      annotations:
+        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
+        summary: image-builder-mk3 replicas are unavailable in cluster {{ $labels.cluster }}
+        description: 'image-builder-mk3 pods are unavailable in cluster {{ $labels.cluster }}: {{ printf "%.2f" $value }}'
+      expr: |
+        kube_deployment_status_replicas_unavailable{deployment="image-builder-mk3", cluster!~"ephemeral.*"} > 0
+      for: 10m
diff --git a/operations/observability/mixins/workspace/rules/central/node-labeler.yaml b/operations/observability/mixins/workspace/rules/central/node-labeler.yaml
index c6c13b3ea489c1..657fec67e5e541 100644
--- a/operations/observability/mixins/workspace/rules/central/node-labeler.yaml
+++ b/operations/observability/mixins/workspace/rules/central/node-labeler.yaml
@@ -31,7 +31,18 @@ spec:
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
         summary: Desired number of replicas for node-labeler are not available in cluster {{ $labels.cluster }}
-        description: The mismatch is {{ printf "%.2f" $value }}
+        description: 'Desired number of replicas for node-labeler are not available in cluster {{ $labels.cluster }} (desired: {{ printf "%.2f" $value }})'
       expr: |
         kube_deployment_spec_replicas{deployment="node-labeler", cluster!~"ephemeral.*"} != kube_deployment_status_replicas_available{deployment="node-labeler", cluster!~"ephemeral.*"}
       for: 3m
+    - alert: GitpodNodeLabelerReplicaUnavailable
+      labels:
+        # TODO(gpl): warning for now, to set it up and fine-tune it
+        severity: warning
+      annotations:
+        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
+        summary: node-labeler replicas are unavailable in cluster {{ $labels.cluster }}
+        description: 'node-labeler pods are unavailable in cluster {{ $labels.cluster }}: {{ printf "%.2f" $value }}'
+      expr: |
+        kube_deployment_status_replicas_unavailable{deployment="node-labeler", cluster!~"ephemeral.*"} > 0
+      for: 10m
diff --git a/operations/observability/mixins/workspace/rules/central/ws-manager.yaml b/operations/observability/mixins/workspace/rules/central/ws-manager.yaml
index 95de82658ef5e6..afa560c424ab0f 100644
--- a/operations/observability/mixins/workspace/rules/central/ws-manager.yaml
+++ b/operations/observability/mixins/workspace/rules/central/ws-manager.yaml
@@ -31,7 +31,18 @@ spec:
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
         summary: Desired number of replicas for ws-manager-mk2 are not available in cluster {{ $labels.cluster }}
-        description: The mismatch is {{ printf "%.2f" $value }}
+        description: 'Desired number of replicas for ws-manager-mk2 are not available in cluster {{ $labels.cluster }} (desired: {{ printf "%.2f" $value }})'
       expr: |
         kube_deployment_spec_replicas{deployment="ws-manager-mk2", cluster!~"ephemeral.*"} != kube_deployment_status_replicas_available{deployment="ws-manager-mk2", cluster!~"ephemeral.*"}
       for: 3m
+    - alert: GitpodWsManagerMk2ReplicaUnavailable
+      labels:
+        # TODO(gpl): warning for now, to set it up and fine-tune it
+        severity: warning
+      annotations:
+        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
+        summary: ws-manager-mk2 replicas are unavailable in cluster {{ $labels.cluster }}
+        description: 'ws-manager-mk2 pods are unavailable in cluster {{ $labels.cluster }}: {{ printf "%.2f" $value }}'
+      expr: |
+        kube_deployment_status_replicas_unavailable{deployment="ws-manager-mk2", cluster!~"ephemeral.*"} > 0
+      for: 10m
diff --git a/operations/observability/mixins/workspace/rules/central/ws-proxy.yaml b/operations/observability/mixins/workspace/rules/central/ws-proxy.yaml
index c86820388f6db8..0287b988874214 100644
--- a/operations/observability/mixins/workspace/rules/central/ws-proxy.yaml
+++ b/operations/observability/mixins/workspace/rules/central/ws-proxy.yaml
@@ -31,7 +31,18 @@ spec:
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
         summary: Desired number of replicas for ws-proxy are not available in cluster {{ $labels.cluster }}
-        description: The mismatch is {{ printf "%.2f" $value }}
+        description: 'Desired number of replicas for ws-proxy are not available in cluster {{ $labels.cluster }} (desired: {{ printf "%.2f" $value }})'
       expr: |
         kube_deployment_spec_replicas{deployment="ws-proxy", cluster!~"ephemeral.*"} != kube_deployment_status_replicas_available{deployment="ws-proxy", cluster!~"ephemeral.*"}
       for: 3m
+    - alert: GitpodWsProxyMk2ReplicaUnavailable  # NOTE(review): the deployment selected below is "ws-proxy"; confirm "Mk2" in this alert name is intended
+      labels:
+        # TODO(gpl): warning for now, to set it up and fine-tune it
+        severity: warning
+      annotations:
+        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceDeploymentReplicaMismatch.md
+        summary: ws-proxy replicas are unavailable in cluster {{ $labels.cluster }}
+        description: 'ws-proxy pods are unavailable in cluster {{ $labels.cluster }}: {{ printf "%.2f" $value }}'
+      expr: |
+        kube_deployment_status_replicas_unavailable{deployment="ws-proxy", cluster!~"ephemeral.*"} > 0
+      for: 10m