From 74166dbeaa649d1a94b7bfd53329b1b4b1dfe36e Mon Sep 17 00:00:00 2001
From: Steve Astels
Date: Thu, 26 Sep 2024 15:06:48 -0400
Subject: [PATCH] add warnings for throttling exceptions (#1555)

---
Reviewer notes (everything between the "---" line above and the first
"diff --git" line is commentary; it is not applied to any file):

* aws/eks/cloudwatch_alarms.tf, first hunk: alarm_name of the
  "service-callback-too-many-failures-critical" resource changes from
  "...-warning" to "...-critical" so that it matches the resource name.
* aws/eks/cloudwatch_log.tf: the new metric filter "throttling-exceptions"
  publishes the value "1" to the metric "throttling-exceptions" in the
  "LogMetrics" namespace for each event in the log group
  notification-canada-ca-eks-application-logs that matches the pattern
  "ThrottlingException".
* aws/eks/cloudwatch_alarms.tf, second hunk: two new alarms read that metric
  as a Sum over a 60-second period with a single evaluation period:
    - "throttling-exception-warning"        threshold 1
    - "many-throttling-exceptions-warning"  threshold 100
  Both use GreaterThanOrEqualToThreshold, treat missing data as notBreaching
  and notify var.sns_alert_warning_arn.
* All three new resources are gated on var.cloudwatch_enabled, so the [0]
  references to the metric filter are only evaluated when the filter exists.
* NOTE(review): the description of "many-throttling-exceptions-warning" was
  pluralised ("exceptions"), so the post-image blob id on the index line no
  longer corresponds byte-for-byte to the patched file.
* NOTE(review): column alignment of the HCL lines was reconstructed following
  terraform fmt conventions; verify the context lines against the target
  files (or apply with --ignore-whitespace) before relying on this patch.

 aws/eks/cloudwatch_alarms.tf | 32 +++++++++++++++++++++++++++++++-
 aws/eks/cloudwatch_log.tf    | 13 +++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf
index 4daf6e5da..0395a0b16 100644
--- a/aws/eks/cloudwatch_alarms.tf
+++ b/aws/eks/cloudwatch_alarms.tf
@@ -972,7 +972,7 @@ resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warni
 
 resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-critical" {
   count               = var.cloudwatch_enabled ? 1 : 0
-  alarm_name          = "service-callback-too-many-failures-warning"
+  alarm_name          = "service-callback-too-many-failures-critical"
   alarm_description   = "Service reached the max number of callback retries 100 times in 10 minutes"
   comparison_operator = "GreaterThanOrEqualToThreshold"
   evaluation_periods  = "1"
@@ -984,3 +984,33 @@ resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-criti
   treat_missing_data  = "notBreaching"
   alarm_actions       = [var.sns_alert_critical_arn]
 }
+
+resource "aws_cloudwatch_metric_alarm" "throttling-exception-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "throttling-exception-warning"
+  alarm_description   = "Have received a throttling exception in the last minute"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].namespace
+  period              = 60
+  statistic           = "Sum"
+  threshold           = 1
+  treat_missing_data  = "notBreaching"
+  alarm_actions       = [var.sns_alert_warning_arn]
+}
+
+resource "aws_cloudwatch_metric_alarm" "many-throttling-exceptions-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "many-throttling-exceptions-warning"
+  alarm_description   = "Have received 100 throttling exceptions in the last minute"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.throttling-exceptions[0].metric_transformation[0].namespace
+  period              = 60
+  statistic           = "Sum"
+  threshold           = 100
+  treat_missing_data  = "notBreaching"
+  alarm_actions       = [var.sns_alert_warning_arn]
+}
diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf
index 470f5facf..5c3fcbf2b 100644
--- a/aws/eks/cloudwatch_log.tf
+++ b/aws/eks/cloudwatch_log.tf
@@ -192,3 +192,16 @@ resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" {
     value     = "1"
   }
 }
+
+resource "aws_cloudwatch_log_metric_filter" "throttling-exceptions" {
+  count          = var.cloudwatch_enabled ? 1 : 0
+  name           = "throttling-exceptions"
+  pattern        = "ThrottlingException"
+  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name
+
+  metric_transformation {
+    name      = "throttling-exceptions"
+    namespace = "LogMetrics"
+    value     = "1"
+  }
+}