Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating cloudwatch stuff to reflect new pod and deployment names #1691

Merged
merged 12 commits into from
Dec 12, 2024
84 changes: 34 additions & 50 deletions aws/eks/cloudwatch_alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "admin"
Service = var.env == "production" ? "admin" : "notify-admin"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -197,7 +197,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "api"
Service = var.env == "production" ? "api" : "notify-api"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -218,7 +218,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-primary"
Service = var.env == "production" ? "celery-primary" : "notify-celery-primary"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -239,7 +239,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-scalable"
Service = var.env == "production" ? "celery-scalable" : "notify-celery-scalable"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -260,7 +260,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-sms"
Service = var.env == "production" ? "celery-sms" : "notify-celery-sms"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -282,7 +282,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-memory-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "admin"
Service = var.env == "production" ? "admin" : "notify-admin"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -303,7 +303,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "api"
Service = var.env == "production" ? "api" : "notify-api"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -324,7 +324,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning"
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-primary"
Service = var.env == "production" ? "celery-primary" : "notify-celery-primary"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -345,7 +345,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-memory-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-sms"
Service = var.env == "production" ? "celery-sms" : "notify-celery-sms"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand Down Expand Up @@ -464,7 +464,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" {
alarm_description = "Celery Primary Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -478,7 +478,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-primary"
deployment = var.env == "production" ? "celery-primary" : "notify-celery-primary"
}
}
}
Expand All @@ -493,7 +493,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" {
alarm_description = "Celery Scalable Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -507,7 +507,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-scalable"
deployment = var.env == "production" ? "celery-scalable" : "notify-celery-scalable"
}
}
}
Expand All @@ -521,7 +521,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" {
alarm_description = "Celery Beat Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -535,7 +535,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-beat"
deployment = var.env == "production" ? "celery-beat" : "notify-celery-beat"
}
}
}
Expand All @@ -549,7 +549,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" {
alarm_description = "Celery SMS Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -563,7 +563,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-sms"
deployment = var.env == "production" ? "celery-sms" : "notify-celery-sms"
}
}
}
Expand All @@ -577,7 +577,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unava
alarm_description = "Celery Email Send Primary Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -591,7 +591,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unava
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-email-send-primary"
deployment = var.env == "production" ? "celery-email-send-primary" : "notify-celery-email-send-primary"
}
}
}
Expand All @@ -606,7 +606,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unav
alarm_description = "Celery Email Send Scalable Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -620,7 +620,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unav
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-email-send-scalable"
deployment = var.env == "production" ? "celery-email-send-scalable" : "notify-celery-email-send-scalable"
}
}
}
Expand All @@ -634,7 +634,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavail
alarm_description = "Celery SMS Send Primary Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -648,7 +648,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavail
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-sms-send-primary"
deployment = var.env == "production" ? "celery-sms-send-primary" : "notify-celery-sms-send-primary"
}
}
}
Expand All @@ -663,7 +663,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavai
alarm_description = "Celery SMS Send Scalable Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -677,7 +677,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavai
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-sms-send-scalable"
deployment = var.env == "production" ? "celery-sms-send-scalable" : "notify-celery-sms-send-scalable"
}
}
}
Expand All @@ -691,7 +691,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" {
alarm_description = "Notify Admin Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -705,7 +705,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "admin"
deployment = var.env == "production" ? "admin" : "notify-admin"
}
}
}
Expand All @@ -719,7 +719,7 @@ resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" {
alarm_description = "Notify K8S API Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -733,7 +733,7 @@ resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "api"
deployment = var.env == "production" ? "api" : "notify-api"
}
}
}
Expand All @@ -747,7 +747,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" {
alarm_description = "Notify Documentation Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -761,7 +761,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "documentation"
deployment = var.env == "production" ? "documentation" : "notify-documentation"
}
}
}
Expand All @@ -775,7 +775,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailab
alarm_description = "Notify Document Download API Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -789,7 +789,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailab
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "document-download-api"
deployment = var.env == "production" ? "document-download" : "notify-document-download"
}
}
}
Expand Down Expand Up @@ -888,7 +888,7 @@ resource "aws_cloudwatch_metric_alarm" "karpenter-replicas-unavailable" {
alarm_description = "Karpenter Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand Down Expand Up @@ -939,22 +939,6 @@ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes-
ok_actions = [var.sns_alert_critical_arn]
}

# Critical alarm for errors logged by the GitHub ARC runner pods.
#
# The alarm reads the metric published by the
# aws_cloudwatch_log_metric_filter.github-arc-runner-alarm filter, which emits
# a value of 1 for every matching ERROR log event. The resource is only
# created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "github-arc-runner-error-alarm" {
  count             = var.cloudwatch_enabled ? 1 : 0
  alarm_name        = "github-arc-runner-error-alarm"
  alarm_description = "GitHub ARC Runners Are Failing"

  # The metric only has datapoints when errors were logged, and each datapoint
  # contributes 1 to the Sum. The previous "LessThanThreshold" comparison
  # against 1 could therefore never be satisfied (Sum is >= 1 whenever data
  # exists, and missing data is treated as notBreaching below), so the alarm
  # could never fire. Alarm when at least one error is seen in the period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"

  metric_name = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].name
  namespace   = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].namespace

  # No datapoints means no matching error logs in the period, so the alarm
  # stays (or returns to) OK.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_alert_critical_arn]
  ok_actions    = [var.sns_alert_critical_arn]
}

resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" {
count = var.cloudwatch_enabled ? 1 : 0
alarm_name = "service-callback-too-many-failures-warning"
Expand Down
23 changes: 5 additions & 18 deletions aws/eks/cloudwatch_log.tf
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ resource "aws_cloudwatch_log_metric_filter" "bounce-rate-critical" {
resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "api-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"api-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"api-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -111,7 +111,7 @@ resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "celery-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"celery-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"celery-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-celery-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -124,7 +124,7 @@ resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "admin-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"admin-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"admin-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-admin-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -137,7 +137,7 @@ resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "document-download-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"document-download-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"document-download-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-document-download-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -150,7 +150,7 @@ resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "documentation-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"documentation-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"documentation-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-documentation-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -173,19 +173,6 @@ resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" {
}
}

# Log metric filter counting ERROR log lines from GitHub ARC runner pods.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_log_metric_filter" "github-arc-runner-alarm" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "GitHub ARC Runners Write Alarm"
  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name

  # JSON filter: the pod name must match the environment-specific runner
  # naming pattern and the log field must contain "ERROR".
  pattern = "{ $.kubernetes.pod_name = \"github-arc-ss-${var.env}-*-runner-*\" && $.log = \"*ERROR*\" }"

  # Each matching log event publishes a value of 1 to this metric.
  metric_transformation {
    namespace = "LogMetrics"
    name      = "aggregating-github-arc-runner-alarm"
    value     = "1"
  }
}

resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" {
count = var.cloudwatch_enabled ? 1 : 0
name = "callback-request-failures"
Expand Down
Loading
Loading