diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf index 0395a0b16..b81cf1b49 100644 --- a/aws/eks/cloudwatch_alarms.tf +++ b/aws/eks/cloudwatch_alarms.tf @@ -176,7 +176,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-cpu-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "admin" + Service = "notify-admin" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -197,7 +197,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "api" + Service = "notify-api" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -218,7 +218,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery-primary" + Service = "notify-celery-primary" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -239,7 +239,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery-scalable" + Service = "notify-celery-scalable" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -260,7 +260,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-cpu-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery-sms" + Service = "notify-celery-sms" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -282,7 +282,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-memory-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "admin" + Service = "notify-admin" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -303,7 +303,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "api" + Service = "notify-api" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -324,7 +324,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning" treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery-primary" + Service = "notify-celery-primary" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -345,7 +345,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-memory-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery-sms" + Service = "notify-celery-sms" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -478,7 +478,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-primary" + deployment = "notify-celery-primary" } } } @@ -507,7 +507,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-scalable" + deployment = "notify-celery-scalable" } } } @@ -535,7 +535,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-beat" + deployment = "notify-celery-beat" } } } @@ -563,7 +563,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-sms" + deployment = "notify-celery-sms" } } } @@ -591,7 +591,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unava dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-email-send-primary" + deployment = "notify-celery-email-send-primary" } } } @@ -620,7 +620,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unav dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-email-send-scalable" + deployment = "notify-celery-email-send-scalable" } } } @@ -648,7 +648,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavail dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-sms-send-primary" + deployment = "notify-celery-sms-send-primary" } } } @@ -677,7 +677,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavai dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery-sms-send-scalable" + deployment = "notify-celery-sms-send-scalable" } } } @@ -705,7 +705,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "admin" + deployment = "notify-admin" } } } @@ -733,7 +733,7 @@ resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "api" + deployment = "notify-api" } } } @@ -761,7 +761,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "documentation" + deployment = "notify-documentation" } } } @@ -789,7 +789,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailab dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "document-download-api" + deployment = "notify-document-download" } } } @@ -939,22 +939,6 @@ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes- ok_actions = [var.sns_alert_critical_arn] } -resource "aws_cloudwatch_metric_alarm" "github-arc-runner-error-alarm" { - count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "github-arc-runner-error-alarm" - alarm_description = "GitHub ARC Runners Are Failing" - comparison_operator = "LessThanThreshold" - evaluation_periods = "1" - metric_name = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].name - namespace = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].namespace - period = "300" - statistic = "Sum" - threshold = 1 - treat_missing_data = "notBreaching" - alarm_actions = [var.sns_alert_critical_arn] - ok_actions = [var.sns_alert_critical_arn] -} - resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" { count = var.cloudwatch_enabled ? 1 : 0 alarm_name = "service-callback-too-many-failures-warning" diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf index 29908e455..41d182629 100644 --- a/aws/eks/cloudwatch_log.tf +++ b/aws/eks/cloudwatch_log.tf @@ -92,7 +92,7 @@ resource "aws_cloudwatch_log_metric_filter" "bounce-rate-critical" { resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" { count = var.cloudwatch_enabled ? 1 : 0 name = "api-evicted-pods" - pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"api-*\") }" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name metric_transformation { @@ -105,7 +105,7 @@ resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" { resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" { count = var.cloudwatch_enabled ? 1 : 0 name = "celery-evicted-pods" - pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"celery-*\") }" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-celery-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name metric_transformation { @@ -118,7 +118,7 @@ resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" { resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" { count = var.cloudwatch_enabled ? 1 : 0 name = "admin-evicted-pods" - pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"admin-*\") }" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-admin-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name metric_transformation { @@ -131,7 +131,7 @@ resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" { resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" { count = var.cloudwatch_enabled ? 1 : 0 name = "document-download-evicted-pods" - pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"document-download-*\") }" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-document-download-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name metric_transformation { @@ -144,7 +144,7 @@ resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" { resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" { count = var.cloudwatch_enabled ? 1 : 0 name = "documentation-evicted-pods" - pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"documentation-*\") }" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-documentation-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name metric_transformation { @@ -167,19 +167,6 @@ resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" { } } -resource "aws_cloudwatch_log_metric_filter" "github-arc-runner-alarm" { - count = var.cloudwatch_enabled ? 1 : 0 - name = "GitHub ARC Runners Write Alarm" - pattern = "{ $.kubernetes.pod_name = \"github-arc-ss-${var.env}-*-runner-*\" && $.log = \"*ERROR*\" }" - log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name - - metric_transformation { - name = "aggregating-github-arc-runner-alarm" - namespace = "LogMetrics" - value = "1" - } -} - resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" { count = var.cloudwatch_enabled ? 1 : 0 name = "callback-request-failures" diff --git a/aws/eks/cloudwatch_queries.tf b/aws/eks/cloudwatch_queries.tf index 2b3ea4fac..d3e7f707a 100644 --- a/aws/eks/cloudwatch_queries.tf +++ b/aws/eks/cloudwatch_queries.tf @@ -10,7 +10,7 @@ resource "aws_cloudwatch_query_definition" "celery-errors" { query_string = <\d+s)/ | stats count() by retry_duration @@ -196,7 +196,7 @@ resource "aws_cloudwatch_query_definition" "admin-50X-errors" { query_string = <