From 613613734d34acf15cb271e0edcc09bb2a4c8c88 Mon Sep 17 00:00:00 2001 From: wbanks Date: Thu, 29 Aug 2024 14:45:33 -0400 Subject: [PATCH 1/3] Update and add callback alarms --- aws/eks/cloudwatch_alarms.tf | 25 ++++++++++++++++++++----- aws/eks/cloudwatch_log.tf | 6 +++--- aws/eks/cloudwatch_queries.tf | 21 +++++++++++++++++++-- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf index dd847131e..4daf6e5da 100644 --- a/aws/eks/cloudwatch_alarms.tf +++ b/aws/eks/cloudwatch_alarms.tf @@ -958,14 +958,29 @@ resource "aws_cloudwatch_metric_alarm" "github-arc-runner-error-alarm" { resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" { count = var.cloudwatch_enabled ? 1 : 0 alarm_name = "service-callback-too-many-failures-warning" - alarm_description = "Service reached the max number of callback retries 5 times in 30 minutes" + alarm_description = "Service callback requests failed 25 times in 5 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "1" - metric_name = aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0].metric_transformation[0].name - namespace = aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0].metric_transformation[0].namespace - period = 60 * 30 + metric_name = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].namespace + period = 60 * 5 statistic = "Sum" - threshold = 5 + threshold = 25 treat_missing_data = "notBreaching" alarm_actions = [var.sns_alert_warning_arn] } + +resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-critical" { + count = var.cloudwatch_enabled ?
1 : 0 + alarm_name = "service-callback-too-many-failures-critical" + alarm_description = "Service callback requests failed 100 times in 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "1" + metric_name = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].name + namespace = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].namespace + period = 60 * 10 + statistic = "Sum" + threshold = 100 + treat_missing_data = "notBreaching" + alarm_actions = [var.sns_alert_critical_arn] +} diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf index fed7d25d1..56fcc3f4a 100644 --- a/aws/eks/cloudwatch_log.tf +++ b/aws/eks/cloudwatch_log.tf @@ -180,10 +180,10 @@ resource "aws_cloudwatch_log_metric_filter" "github-arc-runner-alarm" { } } -resource "aws_cloudwatch_log_metric_filter" "callback-max-retry-failures" { +resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" { count = var.cloudwatch_enabled ? 1 : 0 - name = "callback-max-retry-failures" - pattern = "send_delivery_status_to_service has retried the max num of times for callback url" + name = "allback-request-failures" + pattern = "send_delivery_status_to_service request failed for notification_id:" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name metric_transformation { diff --git a/aws/eks/cloudwatch_queries.tf b/aws/eks/cloudwatch_queries.tf index 03cf8a866..c6254aab1 100644 --- a/aws/eks/cloudwatch_queries.tf +++ b/aws/eks/cloudwatch_queries.tf @@ -272,9 +272,9 @@ fields @timestamp, @service_id, @bounce_type QUERY } -resource "aws_cloudwatch_query_definition" "callback-failures-by-service" { +resource "aws_cloudwatch_query_definition" "callback-max-retry-failures-by-service" { count = var.cloudwatch_enabled ?
1 : 0 - name = "Callbacks / Count of callbacks that exceeded MaxRetries by service" + name = "Callbacks / Callbacks that exceeded MaxRetries by service" log_group_names = [ local.eks_application_log_group @@ -291,6 +291,23 @@ fields @timestamp, @service_id, @callback_url, @notification_id QUERY } +resource "aws_cloudwatch_query_definition" "callback-failures" { + count = var.cloudwatch_enabled ? 1 : 0 + name = "Callbacks / Callbacks that exceeded MaxRetries by URL" + + log_group_names = [ + local.eks_application_log_group + ] + + query_string = < Date: Thu, 29 Aug 2024 14:49:55 -0400 Subject: [PATCH 2/3] Adjust query name --- aws/eks/cloudwatch_queries.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/eks/cloudwatch_queries.tf b/aws/eks/cloudwatch_queries.tf index c6254aab1..f743b4470 100644 --- a/aws/eks/cloudwatch_queries.tf +++ b/aws/eks/cloudwatch_queries.tf @@ -293,7 +293,7 @@ QUERY resource "aws_cloudwatch_query_definition" "callback-failures" { count = var.cloudwatch_enabled ? 1 : 0 - name = "Callbacks / Callbacks that exceeded MaxRetries by URL" + name = "Callbacks / Callback errors by notification_id" log_group_names = [ local.eks_application_log_group From 0b59950ece84a112de152b9de7cd40b63e73918a Mon Sep 17 00:00:00 2001 From: wbanks Date: Tue, 3 Sep 2024 10:50:21 -0400 Subject: [PATCH 3/3] typo --- aws/eks/cloudwatch_log.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf index 56fcc3f4a..cfe8275e9 100644 --- a/aws/eks/cloudwatch_log.tf +++ b/aws/eks/cloudwatch_log.tf @@ -182,7 +182,7 @@ resource "aws_cloudwatch_log_metric_filter" "github-arc-runner-alarm" { resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" { count = var.cloudwatch_enabled ? 
1 : 0 - name = "allback-request-failures" + name = "callback-request-failures" pattern = "send_delivery_status_to_service request failed for notification_id:" log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name