Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add warning and critical alarms for callbacks #1352

Merged
merged 14 commits into from
Jun 11, 2024
Merged
17 changes: 16 additions & 1 deletion aws/eks/cloudwatch_alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -953,4 +953,19 @@ resource "aws_cloudwatch_metric_alarm" "github-arc-runner-write-alarm" {
treat_missing_data = "notBreaching"
alarm_actions = [var.sns_alert_critical_arn]
ok_actions = [var.sns_alert_critical_arn]
}
}

# Warning-level alarm on the callback max-retry metric.
# Notifies the warning SNS topic when the metric's Sum over a single
# 30-minute period is greater than or equal to 5.
resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "service-callback-too-many-failures-warning"
  alarm_description = "Service reached the max number of callback retries 5 times in 30 minutes"

  # Metric identity is read from the log metric filter's transformation so the
  # alarm and the filter cannot drift apart.
  namespace   = aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0].metric_transformation[0].namespace
  metric_name = aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0].metric_transformation[0].name

  # One evaluation period of 1800 seconds (30 minutes).
  statistic           = "Sum"
  period              = 1800
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 5

  # Missing data points are evaluated as not breaching the threshold.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_alert_warning_arn]
}
20 changes: 19 additions & 1 deletion aws/eks/cloudwatch_log.tf
Original file line number Diff line number Diff line change
Expand Up @@ -178,4 +178,22 @@ resource "aws_cloudwatch_log_metric_filter" "github-arc-write-alarm" {
namespace = "LogMetrics"
value = "1"
}
}
}

# Counts log events reporting that send_delivery_status_to_service has used up
# its retries for a callback URL. Every matching event adds 1 to the
# "callback-max-retry-failures" metric in the "LogMetrics" namespace.
#
# The pattern is wrapped in double quotes so it is matched as one exact phrase;
# CloudWatch requires quoting for terms that contain characters other than
# alphanumerics or underscore (here the ":" in "Retry:").
#
# No dimensions are declared on purpose:
#   * Dimensions are only supported for JSON or space-delimited filter
#     patterns; this is a plain-text pattern, so JSON selectors such as
#     "$.url" have nothing to resolve against.
#   * An alarm that looks the metric up by name and namespace only does not
#     see data points published under dimension combinations.
#   * A notification_id dimension (presumably unique per notification) would
#     create a separate custom metric for every value.
# Per-service breakdowns can be obtained from the logs themselves (e.g. with a
# Logs Insights query) instead of metric dimensions.
resource "aws_cloudwatch_log_metric_filter" "callback-max-retry-failures" {
  count          = var.cloudwatch_enabled ? 1 : 0
  name           = "callback-max-retry-failures"
  pattern        = "\"Retry: send_delivery_status_to_service has retried the max num of times for callback url\""
  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name

  metric_transformation {
    name      = "callback-max-retry-failures"
    namespace = "LogMetrics"
    value     = "1"
  }
}
19 changes: 19 additions & 0 deletions aws/eks/cloudwatch_queries.tf
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,22 @@ fields @timestamp, @service_id, @bounce_type
| limit 100
QUERY
}

# Saved CloudWatch Logs Insights query: per service, the number of callbacks
# that hit the retry limit, grouped into 30-minute bins. It reads the EKS
# application log group, keeps only log lines from containers whose name
# starts with "celery", and parses the callback URL, notification id and
# service id out of the message text.
resource "aws_cloudwatch_query_definition" "callback-failures-by-service" {
  count = var.cloudwatch_enabled ? 1 : 0

  name            = "Count of callbacks that exceeded MaxRetries by service"
  log_group_names = [local.eks_application_log_group]

  # The heredoc body is kept flush-left: a plain "<<" heredoc preserves leading
  # whitespace, which would otherwise become part of the query text.
  query_string = <<QUERY
fields @timestamp, @service_id, @callback_url, @notification_id
| filter kubernetes.container_name like /^celery/
| filter @message like /send_delivery_status_to_service has retried the max num of times for callback url/
| parse @message 'Retry: send_delivery_status_to_service has retried the max num of times for callback url * and notification_id: * for service: *' as @callback_url, @notification_id, @service_id
| sort @timestamp desc
| stats count(@service_id) by @service_id, bin(30m)
| limit 10000
QUERY
}
2 changes: 1 addition & 1 deletion aws/eks/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -319,4 +319,4 @@ variable "subnet_ids" {
# AMI ID for Karpenter nodes. No default is declared here, so a value must be
# provided by the caller of this module.
variable "eks_karpenter_ami_id" {
  description = "The AMI ID for the Karpenter nodes"
  type        = string
}
}
Loading