From 621d0fe5b7c8f697bdcfe87b5be1019924dbb5d5 Mon Sep 17 00:00:00 2001
From: Michael Pond
Date: Thu, 12 Dec 2024 10:10:33 -0500
Subject: [PATCH] Making these conditional for our production rollout; they
 need to work only on staging for a while first [review]

---
 aws/eks/cloudwatch_alarms.tf            |   96 +-
 aws/eks/cloudwatch_alarms_kustomize.tf  | 1016 ++++++++++
 aws/eks/cloudwatch_log.tf               |   34 +-
 aws/eks/cloudwatch_log_kustomize.tf     |  200 ++
 aws/eks/cloudwatch_queries.tf           |   38 +-
 aws/eks/cloudwatch_queries_kustomize.tf |  344 ++++
 aws/eks/dashboards.tf                   |   12 +-
 aws/eks/dashboards_kustomize.tf         | 1776 +++++++++++++++++
 .../dashboards.tf                       |    4 +-
 .../dashboards_kustomize.tf             |  671 +++++++
 10 files changed, 4099 insertions(+), 92 deletions(-)
 create mode 100644 aws/eks/cloudwatch_alarms_kustomize.tf
 create mode 100644 aws/eks/cloudwatch_log_kustomize.tf
 create mode 100644 aws/eks/cloudwatch_queries_kustomize.tf
 create mode 100644 aws/eks/dashboards_kustomize.tf
 create mode 100644 aws/pinpoint_to_sqs_sms_callbacks/dashboards_kustomize.tf

diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf
index 4670222b0..0419fda6e 100644
--- a/aws/eks/cloudwatch_alarms.tf
+++ b/aws/eks/cloudwatch_alarms.tf
@@ -5,7 +5,7 @@
 # There are also alarms defined in aws/common/cloudwatch_alarms.tf
 
 resource "aws_cloudwatch_metric_alarm" "load-balancer-1-500-error-1-minute-warning" {
-  count = var.cloudwatch_enabled ? 1 : 0
+  count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
   alarm_name = "load-balancer-1-500-error-1-minute-warning"
   alarm_description = "One 500 error in 1 minute"
   comparison_operator = "GreaterThanOrEqualToThreshold"
@@ -23,7 +23,7 @@ resource "aws_cloudwatch_metric_alarm" "load-balancer-1-500-error-1-minute-warni
 }
 
 resource "aws_cloudwatch_metric_alarm" "load-balancer-10-500-error-5-minutes-critical" {
-  count = var.cloudwatch_enabled ? 1 : 0
+  count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
   alarm_name = "load-balancer-10-500-error-5-minutes-critical"
   alarm_description = "Ten 500 errors in 5 minutes"
   comparison_operator = "GreaterThanOrEqualToThreshold"
@@ -43,7 +43,7 @@ resource "aws_cloudwatch_metric_alarm" "load-balancer-10-500-error-5-minutes-cri
 }
 
 resource "aws_cloudwatch_metric_alarm" "load-balancer-1-502-error-1-minute-warning" {
-  count = var.cloudwatch_enabled ? 1 : 0
+  count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
   alarm_name = "load-balancer-1-502-error-1-minute-warning"
   alarm_description = "One 502 error in 1 minute"
   comparison_operator = "GreaterThanOrEqualToThreshold"
@@ -61,7 +61,7 @@ resource "aws_cloudwatch_metric_alarm" "load-balancer-1-502-error-1-minute-warni
 }
 
 resource "aws_cloudwatch_metric_alarm" "load-balancer-10-502-error-5-minutes-critical" {
-  count = var.cloudwatch_enabled ? 1 : 0
+  count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
   alarm_name = "load-balancer-10-502-error-5-minutes-critical"
   alarm_description = "Ten 502 errors in 5 minutes"
   comparison_operator = "GreaterThanOrEqualToThreshold"
@@ -80,7 +80,7 @@ resource "aws_cloudwatch_metric_alarm" "load-balancer-10-502-error-5-minutes-cri
 }
 
 resource "aws_cloudwatch_metric_alarm" "document-download-api-high-request-count-warning" {
-  count = var.cloudwatch_enabled ? 1 : 0
+  count = var.cloudwatch_enabled && var.env != "production" ?
1 : 0 alarm_name = "document-download-api-high-request-count-warning" alarm_description = "More than 300 4XX requests in 10 minutes on ${aws_alb_target_group.notification-canada-ca-document-api.name} target group" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -99,7 +99,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-high-request-count } resource "aws_cloudwatch_metric_alarm" "logs-1-celery-error-1-minute-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-1-celery-error-1-minute-warning" alarm_description = "One Celery error in 1 minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -114,7 +114,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-1-celery-error-1-minute-warning" { } resource "aws_cloudwatch_metric_alarm" "logs-10-celery-error-1-minute-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-10-celery-error-1-minute-critical" alarm_description = "Ten Celery errors in 1 minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -130,7 +130,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-10-celery-error-1-minute-critical" } resource "aws_cloudwatch_metric_alarm" "logs-1-500-error-1-minute-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-1-500-error-1-minute-warning" alarm_description = "One 500 error in 1 minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -145,7 +145,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-1-500-error-1-minute-warning" { } resource "aws_cloudwatch_metric_alarm" "logs-10-500-error-5-minutes-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-10-500-error-5-minutes-critical" alarm_description = "Ten 500 errors in 5 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -161,7 +161,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-10-500-error-5-minutes-critical" { } resource "aws_cloudwatch_metric_alarm" "admin-pods-high-cpu-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "admin-pods-high-cpu-warning" alarm_description = "Average CPU of admin pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -182,7 +182,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-cpu-warning" { } resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "api-pods-high-cpu-warning" alarm_description = "Average CPU of API pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -203,7 +203,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" { } resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
1 : 0 alarm_name = "celery-primary-pods-high-cpu-warning" alarm_description = "Average CPU of Primary Celery pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -224,7 +224,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" { } resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-scalable-pods-high-cpu-warning" alarm_description = "Average CPU of Scalable Celery pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -245,7 +245,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" { } resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-cpu-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-sms-pods-high-cpu-warning" alarm_description = "Average CPU of celery-sms pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -267,7 +267,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-cpu-warning" { resource "aws_cloudwatch_metric_alarm" "admin-pods-high-memory-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "admin-pods-high-memory-warning" alarm_description = "Average memory of admin pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -288,7 +288,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-memory-warning" { } resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "api-pods-high-memory-warning" alarm_description = "Average memory of API pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -309,7 +309,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" { } resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-primary-pods-high-memory-warning" alarm_description = "Average memory of Primary Celery pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -330,7 +330,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning" } resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-memory-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-sms-pods-high-memory-warning" alarm_description = "Average memory of celery-sms >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -351,7 +351,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-memory-warning" { } resource "aws_cloudwatch_metric_alarm" "ddos-detected-load-balancer-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
1 : 0 alarm_name = "ddos-detected-load-balancer-critical" alarm_description = "DDoS has been detected on the load balancer" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -370,7 +370,7 @@ resource "aws_cloudwatch_metric_alarm" "ddos-detected-load-balancer-critical" { } resource "aws_cloudwatch_metric_alarm" "logs-1-malware-detected-1-minute-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-1-malware-detected-1-minute-warning" alarm_description = "One malware detected error in 1 minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -385,7 +385,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-1-malware-detected-1-minute-warning } resource "aws_cloudwatch_metric_alarm" "logs-10-malware-detected-1-minute-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-10-malware-detected-1-minute-critical" alarm_description = "Ten malware detected errors in 1 minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -401,7 +401,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-10-malware-detected-1-minute-critic } resource "aws_cloudwatch_metric_alarm" "logs-1-scanfiles-timeout-1-minute-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-1-scanfiles-timeout-5-minutes-warning" alarm_description = "One scanfiles timeout detected error in 1 minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -416,7 +416,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-1-scanfiles-timeout-1-minute-warnin } resource "aws_cloudwatch_metric_alarm" "logs-1-bounce-rate-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "logs-1-bounce-rate-critical" alarm_description = "Bounce rate exceeding 10% in a 12 hour period" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -431,7 +431,7 @@ resource "aws_cloudwatch_metric_alarm" "logs-1-bounce-rate-critical" { } resource "aws_cloudwatch_metric_alarm" "kubernetes-failed-nodes" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "kubernetes-failed-nodes" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 1 @@ -457,7 +457,7 @@ resource "aws_cloudwatch_metric_alarm" "kubernetes-failed-nodes" { } resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-primary-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -486,7 +486,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" { resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-scalable-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 3 @@ -514,7 +514,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
1 : 0 alarm_name = "celery-beat-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -542,7 +542,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-sms-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -570,7 +570,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-email-send-primary-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -599,7 +599,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unava resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-email-send-scalable-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 3 @@ -627,7 +627,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unav } resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-sms-send-primary-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -656,7 +656,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavail resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "celery-sms-send-scalable-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 3 @@ -684,7 +684,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavai } resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "admin-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -712,7 +712,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "api-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -740,7 +740,7 @@ resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
1 : 0 alarm_name = "documentation-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -768,7 +768,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "document-download-api-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -796,7 +796,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailab } resource "aws_cloudwatch_metric_alarm" "api-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "evicted-api-pods-detected" alarm_description = "One or more Kubernetes API Pods is reporting as Evicted" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -813,7 +813,7 @@ resource "aws_cloudwatch_metric_alarm" "api-evicted-pods" { } resource "aws_cloudwatch_metric_alarm" "celery-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "evicted-celery-pods-detected" alarm_description = "One or more Kubernetes Celery Pods is reporting as Evicted" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -830,7 +830,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-evicted-pods" { } resource "aws_cloudwatch_metric_alarm" "admin-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "evicted-admin-pods-detected" alarm_description = "One or more Kubernetes Admin Pods is reporting as Evicted" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -847,7 +847,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-evicted-pods" { } resource "aws_cloudwatch_metric_alarm" "document-download-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "evicted-document-download-pods-detected" alarm_description = "One or more Kubernetes Document Download Pods is reporting as Evicted" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -864,7 +864,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-evicted-pods" { } resource "aws_cloudwatch_metric_alarm" "documentation-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "evicted-documentation-pods-detected" alarm_description = "One or more Kubernetes Documentation Pods is reporting as Evicted" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -881,7 +881,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-evicted-pods" { } resource "aws_cloudwatch_metric_alarm" "karpenter-replicas-unavailable" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "karpenter-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 @@ -909,7 +909,7 @@ resource "aws_cloudwatch_metric_alarm" "karpenter-replicas-unavailable" { } resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-1-minute-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
1 : 0 alarm_name = "aggregating-queues-not-active-1-minute-warning" alarm_description = "Beat inbox tasks have not been active for one minute" comparison_operator = "LessThanThreshold" @@ -924,7 +924,7 @@ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-1-minute-w } resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "aggregating-queues-not-active-5-minutes-critical" alarm_description = "Beat inbox tasks have not been active for 5 minutes" comparison_operator = "LessThanThreshold" @@ -940,7 +940,7 @@ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes- } resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "service-callback-too-many-failures-warning" alarm_description = "Service reached the max number of callback retries 25 times in 5 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -955,7 +955,7 @@ resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warni } resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "service-callback-too-many-failures-critical" alarm_description = "Service reached the max number of callback retries 100 times in 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -970,7 +970,7 @@ resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-criti } resource "aws_cloudwatch_metric_alarm" "throttling-exception-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "throttling-exception-warning" alarm_description = "Have received a throttling exception in the last minute" comparison_operator = "GreaterThanOrEqualToThreshold" @@ -985,7 +985,7 @@ resource "aws_cloudwatch_metric_alarm" "throttling-exception-warning" { } resource "aws_cloudwatch_metric_alarm" "many-throttling-exceptions-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 alarm_name = "many-throttling-exceptions-warning" alarm_description = "Have received 100 throttling exception in the last minute" comparison_operator = "GreaterThanOrEqualToThreshold" diff --git a/aws/eks/cloudwatch_alarms_kustomize.tf b/aws/eks/cloudwatch_alarms_kustomize.tf new file mode 100644 index 000000000..134f568a8 --- /dev/null +++ b/aws/eks/cloudwatch_alarms_kustomize.tf @@ -0,0 +1,1016 @@ +# Note to maintainers: +# Updating alarms? Update the Google Sheet also! +# https://docs.google.com/spreadsheets/d/1gkrL3Trxw0xEkX724C1bwpfeRsTlK2X60wtCjF6MFRA/edit +# +# There are also alarms defined in aws/common/cloudwatch_alarms.tf + +resource "aws_cloudwatch_metric_alarm" "load-balancer-1-500-error-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "load-balancer-1-500-error-1-minute-warning" + alarm_description = "One 500 error in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = "HTTPCode_ELB_500_Count" + namespace = "AWS/ApplicationELB" + period = 60 + statistic = "Sum" + threshold = 1 + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + dimensions = { + LoadBalancer = aws_alb.notification-canada-ca.arn_suffix + } +} + +resource "aws_cloudwatch_metric_alarm" "load-balancer-10-500-error-5-minutes-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "load-balancer-10-500-error-5-minutes-critical" + alarm_description = "Ten 500 errors in 5 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = "HTTPCode_ELB_500_Count" + namespace = "AWS/ApplicationELB" + period = 300 + statistic = "Sum" + threshold = 10 + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] + treat_missing_data = "notBreaching" + + dimensions = { + LoadBalancer = aws_alb.notification-canada-ca.arn_suffix + } +} + +resource "aws_cloudwatch_metric_alarm" "load-balancer-1-502-error-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "load-balancer-1-502-error-1-minute-warning" + alarm_description = "One 502 error in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = "HTTPCode_ELB_502_Count" + namespace = "AWS/ApplicationELB" + period = 60 + statistic = "Sum" + threshold = 1 + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + dimensions = { + LoadBalancer = aws_alb.notification-canada-ca.arn_suffix + } +} + +resource "aws_cloudwatch_metric_alarm" "load-balancer-10-502-error-5-minutes-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "load-balancer-10-502-error-5-minutes-critical" + alarm_description = "Ten 502 errors in 5 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = "HTTPCode_ELB_502_Count" + namespace = "AWS/ApplicationELB" + period = 300 + statistic = "Sum" + threshold = 10 + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] + treat_missing_data = "notBreaching" + dimensions = { + LoadBalancer = aws_alb.notification-canada-ca.arn_suffix + } +} + +resource "aws_cloudwatch_metric_alarm" "document-download-api-high-request-count-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "document-download-api-high-request-count-warning" + alarm_description = "More than 300 4XX requests in 10 minutes on ${aws_alb_target_group.notification-canada-ca-document-api.name} target group" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = "HTTPCode_Target_4XX_Count" + namespace = "AWS/ApplicationELB" + period = 60 * 10 + statistic = "Sum" + threshold = 300 + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + dimensions = { + LoadBalancer = aws_alb.notification-canada-ca.arn_suffix + TargetGroup = aws_alb_target_group.notification-canada-ca-document-api.arn_suffix + } +} + +resource "aws_cloudwatch_metric_alarm" "logs-1-celery-error-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "logs-1-celery-error-1-minute-warning" + alarm_description = "One Celery error in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.celery-error[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.celery-error[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "logs-10-celery-error-1-minute-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-10-celery-error-1-minute-critical" + alarm_description = "Ten Celery errors in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.celery-error[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.celery-error[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 10 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] +} + +resource "aws_cloudwatch_metric_alarm" "logs-1-500-error-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-1-500-error-1-minute-warning" + alarm_description = "One 500 error in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.web-500-errors[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.web-500-errors[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "logs-10-500-error-5-minutes-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-10-500-error-5-minutes-critical" + alarm_description = "Ten 500 errors in 5 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.web-500-errors[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.web-500-errors[0].metric_transformation[0].namespace + period = 300 + statistic = "Sum" + threshold = 10 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] +} + +resource "aws_cloudwatch_metric_alarm" "admin-pods-high-cpu-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "admin-pods-high-cpu-warning" + alarm_description = "Average CPU of admin pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_cpu_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "admin" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "api-pods-high-cpu-warning" + alarm_description = "Average CPU of API pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_cpu_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "api" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-primary-pods-high-cpu-warning" + alarm_description = "Average CPU of Primary Celery pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_cpu_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "celery-primary" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-scalable-pods-high-cpu-warning" + alarm_description = "Average CPU of Scalable Celery pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_cpu_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "celery-scalable" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-cpu-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-sms-pods-high-cpu-warning" + alarm_description = "Average CPU of celery-sms pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_cpu_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "celery-sms" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + + +resource "aws_cloudwatch_metric_alarm" "admin-pods-high-memory-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "admin-pods-high-memory-warning" + alarm_description = "Average memory of admin pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_memory_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "admin" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "api-pods-high-memory-warning" + alarm_description = "Average memory of API pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_memory_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "api" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-primary-pods-high-memory-warning" + alarm_description = "Average memory of Primary Celery pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_memory_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "celery-primary" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-memory-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-sms-pods-high-memory-warning" + alarm_description = "Average memory of celery-sms >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_memory_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "celery-sms" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "ddos-detected-load-balancer-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "ddos-detected-load-balancer-critical" + alarm_description = "DDoS has been detected on the load balancer" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "DDoSDetected" + namespace = "AWS/DDoSProtection" + period = 60 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] + dimensions = { + ResourceArn = aws_shield_protection.notification-canada-ca.resource_arn + } +} + +resource "aws_cloudwatch_metric_alarm" "logs-1-malware-detected-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-1-malware-detected-1-minute-warning" + alarm_description = "One malware detected error in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.malware-detected[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.malware-detected[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "logs-10-malware-detected-1-minute-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-10-malware-detected-1-minute-critical" + alarm_description = "Ten malware detected errors in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.malware-detected[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.malware-detected[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 10 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] +} + +resource "aws_cloudwatch_metric_alarm" "logs-1-scanfiles-timeout-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-1-scanfiles-timeout-5-minutes-warning" + alarm_description = "One scanfiles timeout detected error in 1 minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.scanfiles-timeout[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.scanfiles-timeout[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "logs-1-bounce-rate-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "logs-1-bounce-rate-critical" + alarm_description = "Bounce rate exceeding 10% in a 12 hour period" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.bounce-rate-critical[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.bounce-rate-critical[0].metric_transformation[0].namespace + period = 60 * 60 * 12 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "kubernetes-failed-nodes" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "kubernetes-failed-nodes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 1 + alarm_description = "Kubernetes failed node anomalies" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "cluster_failed_node_count" + namespace = "ContainerInsights" + period = 300 + stat = "Average" + dimensions = { + Name = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-primary-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery Primary Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-primary" + } + } + } +} + + +resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-scalable-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 3 + alarm_description = "Celery Scalable Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-scalable" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-beat-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery Beat Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Average" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-beat" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "celery-sms-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery SMS Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Average" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-sms" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-email-send-primary-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery Email Send Primary Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-email-send-primary" + } + } + } +} + + +resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-email-send-scalable-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 3 + alarm_description = "Celery Email Send Scalable Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-email-send-scalable" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "celery-sms-send-primary-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery SMS Send Primary Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-sms-send-primary" + } + } + } +} + + +resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "celery-sms-send-scalable-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 3 + alarm_description = "Celery SMS Send Scalable Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-sms-send-scalable" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "admin-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Notify Admin Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Average" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "admin" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "api-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Notify K8S API Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Average" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "api" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "documentation-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Notify Documentation Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Average" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "documentation" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "document-download-api-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Notify Document Download API Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Average" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "document-download-api" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "api-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "evicted-api-pods-detected" + alarm_description = "One or more Kubernetes API Pods is reporting as Evicted" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "3" + metric_name = aws_cloudwatch_log_metric_filter.api-evicted-pods[0].name + namespace = aws_cloudwatch_log_metric_filter.api-evicted-pods[0].metric_transformation[0].namespace + period = "60" + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] + ok_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "celery-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "evicted-celery-pods-detected" + alarm_description = "One or more Kubernetes Celery Pods is reporting as Evicted" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "3" + metric_name = aws_cloudwatch_log_metric_filter.celery-evicted-pods[0].name + namespace = aws_cloudwatch_log_metric_filter.celery-evicted-pods[0].metric_transformation[0].namespace + period = "60" + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] + ok_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "admin-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "evicted-admin-pods-detected" + alarm_description = "One or more Kubernetes Admin Pods is reporting as Evicted" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "3" + metric_name = aws_cloudwatch_log_metric_filter.admin-evicted-pods[0].name + namespace = aws_cloudwatch_log_metric_filter.admin-evicted-pods[0].metric_transformation[0].namespace + period = "60" + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] + ok_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "document-download-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "evicted-document-download-pods-detected" + alarm_description = "One or more Kubernetes Document Download Pods is reporting as Evicted" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "3" + metric_name = aws_cloudwatch_log_metric_filter.document-download-evicted-pods[0].name + namespace = aws_cloudwatch_log_metric_filter.document-download-evicted-pods[0].metric_transformation[0].namespace + period = "60" + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] + ok_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "documentation-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "evicted-documentation-pods-detected" + alarm_description = "One or more Kubernetes Documentation Pods is reporting as Evicted" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "3" + metric_name = aws_cloudwatch_log_metric_filter.documentation-evicted-pods[0].name + namespace = aws_cloudwatch_log_metric_filter.documentation-evicted-pods[0].metric_transformation[0].namespace + period = "60" + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] + ok_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "karpenter-replicas-unavailable" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "karpenter-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Karpenter Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = "karpenter" + deployment = "karpenter" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-1-minute-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "aggregating-queues-not-active-1-minute-warning" + alarm_description = "Beat inbox tasks have not been active for one minute" + comparison_operator = "LessThanThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.aggregating-queues-are-active[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.aggregating-queues-are-active[0].metric_transformation[0].namespace + period = "60" + statistic = "Sum" + threshold = 1 + treat_missing_data = "breaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "aggregating-queues-not-active-5-minutes-critical" + alarm_description = "Beat inbox tasks have not been active for 5 minutes" + comparison_operator = "LessThanThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.aggregating-queues-are-active[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.aggregating-queues-are-active[0].metric_transformation[0].namespace + period = "300" + statistic = "Sum" + threshold = 1 + treat_missing_data = "breaching" + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] +} + +resource "aws_cloudwatch_metric_alarm" "github-arc-runner-error-alarm" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "github-arc-runner-error-alarm" + alarm_description = "GitHub ARC Runners Are Failing" + comparison_operator = "LessThanThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].namespace + period = "300" + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] + ok_actions = [var.sns_alert_critical_arn] +} + +resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 
1 : 0 + alarm_name = "service-callback-too-many-failures-warning" + alarm_description = "Service reached the max number of callback retries 25 times in 5 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].namespace + period = 60 * 5 + statistic = "Sum" + threshold = 25 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "service-callback-too-many-failures-critical" + alarm_description = "Service reached the max number of callback retries 100 times in 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].namespace + period = 60 * 10 + statistic = "Sum" + threshold = 100 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] +} + +resource "aws_cloudwatch_metric_alarm" "throttling-exception-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "throttling-exception-warning" + alarm_description = "Have received a throttling exception in the last minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 1 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} + +resource "aws_cloudwatch_metric_alarm" "many-throttling-exceptions-warning" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + alarm_name = "many-throttling-exceptions-warning" + alarm_description = "Have received 100 throttling exception in the last minute" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].namespace + period = 60 + statistic = "Sum" + threshold = 100 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_warning_arn] +} diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf index 08b72c95a..a1f324e7c 100644 --- a/aws/eks/cloudwatch_log.tf +++ b/aws/eks/cloudwatch_log.tf @@ -3,25 +3,25 @@ ### resource "aws_cloudwatch_log_group" "notification-canada-ca-eks-cluster-logs" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "/aws/eks/${var.eks_cluster_name}/cluster" retention_in_days = var.log_retention_period_days } resource "aws_cloudwatch_log_group" "notification-canada-ca-eks-application-logs" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
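# Illustrative sketch, not part of this change: every resource touched here repeats one of two
# gates — "cloudwatch_enabled && env != production" in the original files and
# "cloudwatch_enabled && env == production" in the new *_kustomize.tf copies. The names below
# are assumptions, but factoring the gates into locals would keep the rollout switch in one place:
locals {
  cloudwatch_non_prod = var.cloudwatch_enabled && var.env != "production"
  cloudwatch_prod     = var.cloudwatch_enabled && var.env == "production"
}
# ...after which each resource would read, for example:
#   count = local.cloudwatch_non_prod ? 1 : 0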
1 : 0 name = "/aws/containerinsights/${var.eks_cluster_name}/application" retention_in_days = var.log_retention_period_days } resource "aws_cloudwatch_log_group" "notification-canada-ca-eks-prometheus-logs" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "/aws/containerinsights/${var.eks_cluster_name}/prometheus" retention_in_days = var.log_retention_period_days } resource "aws_cloudwatch_log_group" "blazer" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "blazer" retention_in_days = 1827 # 5 years } @@ -31,7 +31,7 @@ resource "aws_cloudwatch_log_group" "blazer" { # AWS EKS Cloudwatch log metric filters ### resource "aws_cloudwatch_log_metric_filter" "web-500-errors" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "web-500-errors" pattern = "\"\\\" 500 \"" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -44,7 +44,7 @@ resource "aws_cloudwatch_log_metric_filter" "web-500-errors" { } resource "aws_cloudwatch_log_metric_filter" "celery-error" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "celery-error" pattern = "%ERROR/.*Worker|ERROR/MainProcess%" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -57,7 +57,7 @@ resource "aws_cloudwatch_log_metric_filter" "celery-error" { } resource "aws_cloudwatch_log_metric_filter" "malware-detected" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "malware-detected" pattern = jsonencode("Malicious content detected! Download and attachment failed") log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -70,7 +70,7 @@ resource "aws_cloudwatch_log_metric_filter" "malware-detected" { } resource "aws_cloudwatch_log_metric_filter" "scanfiles-timeout" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "scanfiles-timeout" pattern = "Malware scan timed out for notification.id" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -83,7 +83,7 @@ resource "aws_cloudwatch_log_metric_filter" "scanfiles-timeout" { } resource "aws_cloudwatch_log_metric_filter" "bounce-rate-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "bounce-rate-critical" pattern = "critical bounce rate threshold of 10" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -96,7 +96,7 @@ resource "aws_cloudwatch_log_metric_filter" "bounce-rate-critical" { } resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "api-evicted-pods" pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name @@ -109,7 +109,7 @@ resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" { } resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" { - count = var.cloudwatch_enabled ? 
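# Note on the evicted-pod filters above: the pattern is CloudWatch Logs JSON filter syntax, so
# "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }"
# only matches structured events in the prometheus log group whose three fields satisfy all
# selectors. Illustrative variant, not part of this change: adding a default_value publishes 0
# for periods where events are ingested but nothing matches, shortening the missing-data gaps
# the downstream alarms have to tolerate.
resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods-with-default" {
  count          = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
  name           = "api-evicted-pods-with-default"
  pattern        = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }"
  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

  metric_transformation {
    name          = "api-evicted-pods-with-default"
    namespace     = "LogMetrics"
    value         = "1"
    default_value = "0" # emitted when ingested events do not match the pattern
  }
}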
1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "celery-evicted-pods" pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-celery-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name @@ -122,7 +122,7 @@ resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" { } resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "admin-evicted-pods" pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-admin-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name @@ -135,7 +135,7 @@ resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" { } resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "document-download-evicted-pods" pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-document-download-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name @@ -148,7 +148,7 @@ resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" { } resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "documentation-evicted-pods" pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-documentation-*\") }" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name @@ -161,7 +161,7 @@ resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" { } resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "aggregating-queues-are-active" pattern = "Batch saving with" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -174,7 +174,7 @@ resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" { } resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "callback-request-failures" pattern = "send_delivery_status_to_service request failed for notification_id" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name @@ -187,7 +187,7 @@ resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" { } resource "aws_cloudwatch_log_metric_filter" "throttling-exceptions" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 
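# Illustrative alternative, not used by this change: a single filter could match any
# "notify-*" pod and record the pod name as a metric dimension instead of keeping one filter
# per component. Dimensions cannot be combined with default_value, and each distinct pod name
# becomes its own custom metric, so this trades filter count for metric cardinality.
resource "aws_cloudwatch_log_metric_filter" "evicted-pods-by-pod" {
  count          = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
  name           = "evicted-pods-by-pod"
  pattern        = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-*\") }"
  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

  metric_transformation {
    name      = "evicted-pods"
    namespace = "LogMetrics"
    value     = "1"
    dimensions = {
      pod = "$.pod" # pulled from the matched JSON event
    }
  }
}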
1 : 0 name = "throttling-exceptions" pattern = "ThrottlingException" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name diff --git a/aws/eks/cloudwatch_log_kustomize.tf b/aws/eks/cloudwatch_log_kustomize.tf new file mode 100644 index 000000000..a267156e3 --- /dev/null +++ b/aws/eks/cloudwatch_log_kustomize.tf @@ -0,0 +1,200 @@ +### +# AWS EKS Cloudwatch groups +### + +resource "aws_cloudwatch_log_group" "notification-canada-ca-eks-cluster-logs" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "/aws/eks/${var.eks_cluster_name}/cluster" + retention_in_days = var.log_retention_period_days +} + +resource "aws_cloudwatch_log_group" "notification-canada-ca-eks-application-logs" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "/aws/containerinsights/${var.eks_cluster_name}/application" + retention_in_days = var.log_retention_period_days +} + +resource "aws_cloudwatch_log_group" "notification-canada-ca-eks-prometheus-logs" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "/aws/containerinsights/${var.eks_cluster_name}/prometheus" + retention_in_days = var.log_retention_period_days +} + +resource "aws_cloudwatch_log_group" "blazer" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "blazer" + retention_in_days = 1827 # 5 years +} + + +### +# AWS EKS Cloudwatch log metric filters +### +resource "aws_cloudwatch_log_metric_filter" "web-500-errors" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "web-500-errors" + pattern = "\"\\\" 500 \"" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "500-errors" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "celery-error" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "celery-error" + pattern = "%ERROR/.*Worker|ERROR/MainProcess%" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "celery-error" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "malware-detected" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "malware-detected" + pattern = jsonencode("Malicious content detected! Download and attachment failed") + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "malware-detected" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "scanfiles-timeout" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "scanfiles-timeout" + pattern = "Malware scan timed out for notification.id" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "scanfiles-timeout" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "bounce-rate-critical" { + count = var.cloudwatch_enabled && var.env == "production" ? 
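# Note on retention: 1827 days is the CloudWatch Logs 5-year option (CloudWatch only accepts a
# fixed set of retention values such as 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400,
# 545, 731, 1827 and 3653). Illustrative sketch, not part of this change: if retention ever
# becomes configurable per environment, a validation block like this one catches unsupported
# values at plan time (the variable name is an assumption).
variable "log_retention_period_days_example" {
  description = "Illustrative retention setting restricted to CloudWatch-supported values"
  type        = number
  default     = 30
  validation {
    condition     = contains([1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, 3653], var.log_retention_period_days_example)
    error_message = "Retention must be one of the values supported by CloudWatch Logs."
  }
}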
1 : 0 + name = "bounce-rate-critical" + pattern = "critical bounce rate threshold of 10" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "bounce-rate-critical" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "api-evicted-pods" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name + + metric_transformation { + name = "api-evicted-pods" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "celery-evicted-pods" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-celery-*\") }" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name + + metric_transformation { + name = "celery-evicted-pods" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "admin-evicted-pods" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-admin-*\") }" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name + + metric_transformation { + name = "admin-evicted-pods" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "document-download-evicted-pods" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-document-download-*\") }" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name + + metric_transformation { + name = "document-download-evicted-pods" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "documentation-evicted-pods" + pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-documentation-*\") }" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name + + metric_transformation { + name = "documentation-evicted-pods" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "aggregating-queues-are-active" + pattern = "Batch saving with" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "aggregating-queues-are-active" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" { + count = var.cloudwatch_enabled && var.env == "production" ? 
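# Note, illustrative only: because every resource here is created with count, downstream
# references use an explicit "[0]" index (e.g. ...celery-evicted-pods[0].name). That index
# errors whenever the reference is evaluated while the filter has zero instances. A splat plus
# one() yields null in that case instead of failing the expression, which can be easier to
# reason about if the gates on the two sides ever diverge; for example:
#   metric_name = one(aws_cloudwatch_log_metric_filter.celery-evicted-pods[*].metric_transformation[0].name)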
1 : 0 + name = "callback-request-failures" + pattern = "send_delivery_status_to_service request failed for notification_id" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "callback-max-retry-failures" + namespace = "LogMetrics" + value = "1" + } +} + +resource "aws_cloudwatch_log_metric_filter" "throttling-exceptions" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "throttling-exceptions" + pattern = "ThrottlingException" + log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name + + metric_transformation { + name = "throttling-exceptions" + namespace = "LogMetrics" + value = "1" + } +} diff --git a/aws/eks/cloudwatch_queries.tf b/aws/eks/cloudwatch_queries.tf index d3e7f707a..4ee1b9c32 100644 --- a/aws/eks/cloudwatch_queries.tf +++ b/aws/eks/cloudwatch_queries.tf @@ -1,7 +1,7 @@ ################################ CELERY FOLDER ################################ resource "aws_cloudwatch_query_definition" "celery-errors" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Errors" log_group_names = [ @@ -18,7 +18,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-filter-by-job" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Filter by job" log_group_names = [ @@ -35,7 +35,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-filter-by-notification-id" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Filter by notification id" log_group_names = [ @@ -52,7 +52,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-memory-usage-by-pod" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Memory Usage By Pod" log_group_names = [ @@ -68,7 +68,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-pods-over-cpu-limit" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Pods over CPU Limit" log_group_names = [ @@ -84,7 +84,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-queues" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Queues" log_group_names = [ @@ -101,7 +101,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-starts" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Starts" log_group_names = [ @@ -118,7 +118,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-worker-exited-normally" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Worker exited normally" log_group_names = [ @@ -134,7 +134,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-worker-exited-prematurely" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Worker exited prematurely" log_group_names = [ @@ -150,7 +150,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "celery-worker-exits-cold-vs-warm" { - count = var.cloudwatch_enabled ? 
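# Illustrative sketch, not part of this change: the saved queries in this file all follow the
# same shape — a Logs Insights query over the application log group — and only their count
# gates change in this patch. A generic example of the resource looks like the following; the
# query text and name are assumptions, not one of the module's saved queries.
resource "aws_cloudwatch_query_definition" "example-recent-errors" {
  count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0
  name  = "Examples / Recent errors"

  log_group_names = [
    aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name
  ]

  query_string = <<QUERY
fields @timestamp, @message
| filter @message like /ERROR/
| sort @timestamp desc
| limit 20
QUERY
}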
1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Worker exits, cold vs warm" log_group_names = [ @@ -168,7 +168,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "retry-attemps-by-duration" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Celery / Retry attempts by duration" log_group_names = [ @@ -187,7 +187,7 @@ QUERY ################################ UNSORTED YET ################################# resource "aws_cloudwatch_query_definition" "admin-50X-errors" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Admin / 50X errors" log_group_names = [ @@ -204,7 +204,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "api-50X-errors" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "API / 50X errors" log_group_names = [ @@ -221,7 +221,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "bounce-rate-critical" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Bounces / Critical bounces" log_group_names = [ @@ -238,7 +238,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "bounce-rate-warning" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Bounces / Warning bounces" log_group_names = [ @@ -255,7 +255,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "bounce-rate-warnings-and-criticals" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Bounces / Bounce warnings and criticals grouped by type" log_group_names = [ @@ -273,7 +273,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "callback-errors-by-url" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Callbacks / Callback errors by URL" log_group_names = [ @@ -291,7 +291,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "callback-max-retry-failures-by-service" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Callbacks / Callbacks that exceeded MaxRetries by service" log_group_names = [ @@ -310,7 +310,7 @@ QUERY } resource "aws_cloudwatch_query_definition" "callback-failures" { - count = var.cloudwatch_enabled ? 1 : 0 + count = var.cloudwatch_enabled && var.env != "production" ? 1 : 0 name = "Callbacks / Callback errors by notification_id" log_group_names = [ diff --git a/aws/eks/cloudwatch_queries_kustomize.tf b/aws/eks/cloudwatch_queries_kustomize.tf new file mode 100644 index 000000000..a7d0b9cce --- /dev/null +++ b/aws/eks/cloudwatch_queries_kustomize.tf @@ -0,0 +1,344 @@ +################################ CELERY FOLDER ################################ + +resource "aws_cloudwatch_query_definition" "celery-errors" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + name = "Celery / Errors" + + log_group_names = [ + local.eks_application_log_group + ] + + query_string = <\d+s)/ +| stats count() by retry_duration +QUERY +} + +################################ UNSORTED YET ################################# + +resource "aws_cloudwatch_query_definition" "admin-50X-errors" { + count = var.cloudwatch_enabled && var.env == "production" ? 
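# Note, an assumption: the *_kustomize.tf queries reference local.eks_application_log_group,
# whose definition is not shown in this patch. Judging from the log groups declared earlier it
# presumably resolves to the Container Insights application log group name; a sketch of such a
# local (the real definition may differ) would be:
locals {
  eks_application_log_group = "/aws/containerinsights/${var.eks_cluster_name}/application"
}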
1 : 0 + name = "Admin / 50X errors" + + log_group_names = [ + local.eks_application_log_group + ] + + query_string = <.*?) has been rate limited for (?..........).*/\n| stats count(*) by service, limit_type\n", + "region": "${var.region}", + "stacked": false, + "title": "Services going over the daily limit", + "view": "table" + } + } + ] +} +EOF +} + +resource "aws_cloudwatch_dashboard" "kubernetes" { + count = var.cloudwatch_enabled && var.env == "production" ? 1 : 0 + dashboard_name = "Kubernetes" + dashboard_body = <
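# Illustrative sketch, not part of this change: the dashboards above embed their JSON bodies as
# heredocs. An equivalent pattern that Terraform itself validates is to build the body with
# jsonencode(); the single Logs Insights widget below is an assumption for illustration, not
# one of the module's dashboards.
resource "aws_cloudwatch_dashboard" "example" {
  count          = var.cloudwatch_enabled && var.env == "production" ? 1 : 0
  dashboard_name = "Example"
  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "log"
        x      = 0
        y      = 0
        width  = 12
        height = 6
        properties = {
          query  = "SOURCE '${local.eks_application_log_group}' | fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20"
          region = var.region
          title  = "Recent application errors"
          view   = "table"
        }
      }
    ]
  })
}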