diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf
index dd847131e..4daf6e5da 100644
--- a/aws/eks/cloudwatch_alarms.tf
+++ b/aws/eks/cloudwatch_alarms.tf
@@ -958,14 +958,36 @@ resource "aws_cloudwatch_metric_alarm" "github-arc-runner-error-alarm" {
 resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" {
   count               = var.cloudwatch_enabled ? 1 : 0
   alarm_name          = "service-callback-too-many-failures-warning"
-  alarm_description   = "Service reached the max number of callback retries 5 times in 30 minutes"
+  alarm_description   = "Service reached the max number of callback retries 25 times in 5 minutes"
   comparison_operator = "GreaterThanOrEqualToThreshold"
   evaluation_periods  = "1"
-  metric_name         = aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0].metric_transformation[0].name
-  namespace           = aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0].metric_transformation[0].namespace
-  period              = 60 * 30
+  metric_name         = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].namespace
+  period              = 60 * 5
   statistic           = "Sum"
-  threshold           = 5
+  threshold           = 25
   treat_missing_data  = "notBreaching"
   alarm_actions       = [var.sns_alert_warning_arn]
 }
+
+# Critical tier of the callback-failure alarm: reads the same log metric as the
+# warning alarm above, with a higher threshold (100) over a longer period
+# (10 minutes), and notifies the critical SNS topic. Its alarm_name must differ
+# from the warning alarm's, because CloudWatch alarm names are unique per
+# account and region.
+# NOTE(review): the metric counts "request failed" log lines rather than
+# max-retry events; confirm whether both alarm descriptions need rewording.
+resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-critical" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "service-callback-too-many-failures-critical"
+  alarm_description   = "Service reached the max number of callback retries 100 times in 10 minutes"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.callback-request-failures[0].metric_transformation[0].namespace
+  period              = 60 * 10
+  statistic           = "Sum"
+  threshold           = 100
+  treat_missing_data  = "notBreaching"
+  alarm_actions       = [var.sns_alert_critical_arn]
+}
diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf
index fed7d25d1..cfe8275e9 100644
--- a/aws/eks/cloudwatch_log.tf
+++ b/aws/eks/cloudwatch_log.tf
@@ -180,10 +180,10 @@ resource "aws_cloudwatch_log_metric_filter" "github-arc-runner-alarm" {
   }
 }
 
-resource "aws_cloudwatch_log_metric_filter" "callback-max-retry-failures" {
+resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" {
   count          = var.cloudwatch_enabled ? 1 : 0
-  name           = "callback-max-retry-failures"
-  pattern        = "send_delivery_status_to_service has retried the max num of times for callback url"
+  name           = "callback-request-failures"
+  pattern        = "send_delivery_status_to_service request failed for notification_id:"
   log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name
 
   metric_transformation {
diff --git a/aws/eks/cloudwatch_queries.tf b/aws/eks/cloudwatch_queries.tf
index 03cf8a866..f743b4470 100644
--- a/aws/eks/cloudwatch_queries.tf
+++ b/aws/eks/cloudwatch_queries.tf
@@ -272,9 +272,9 @@ fields @timestamp, @service_id, @bounce_type
 QUERY
 }
 
-resource "aws_cloudwatch_query_definition" "callback-failures-by-service" {
+resource "aws_cloudwatch_query_definition" "callback-max-retry-failures-by-service" {
   count = var.cloudwatch_enabled ? 1 : 0
-  name  = "Callbacks / Count of callbacks that exceeded MaxRetries by service"
+  name  = "Callbacks / Callbacks that exceeded MaxRetries by service"
 
   log_group_names = [
     local.eks_application_log_group
@@ -291,6 +291,23 @@ fields @timestamp, @service_id, @callback_url, @notification_id
 QUERY
 }
 
+resource "aws_cloudwatch_query_definition" "callback-failures" {
+  count = var.cloudwatch_enabled ? 1 : 0
+  name  = "Callbacks / Callback errors by notification_id"
+
+  log_group_names = [
+    local.eks_application_log_group
+  ]
+
+  query_string = <