Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Pinpoint metrics and alarms #1354

Merged
merged 19 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions aws/common/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,7 @@ output "subnet_ids" {
# CIDR block of every private subnet, exposed for dependent modules.
output "subnet_cidr_blocks" {
  value = aws_subnet.notification-canada-ca-private[*].cidr_block
}

# Re-exports the configured SNS monthly spend limit so that other modules
# can read it from this module's outputs.
output "sns_monthly_spend_limit" {
  value = var.sns_monthly_spend_limit
}
208 changes: 208 additions & 0 deletions aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,211 @@ resource "aws_cloudwatch_metric_alarm" "lambda-image-pinpoint-delivery-receipts-
FunctionName = module.pinpoint_to_sqs_sms_callbacks.function_name
}
}

# Warning alarm: the sum of the SNS and Pinpoint month-to-date SMS spend
# metrics has reached 80% of var.sms_monthly_spend_limit.
resource "aws_cloudwatch_metric_alarm" "total-sms-spending-warning" {
  count               = var.cloudwatch_enabled ? 1 : 0
  alarm_name          = "total-sms-spending-warning"
  alarm_description   = "SMS spending reached 80% of limit this month"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  threshold           = 0.8 * var.sms_monthly_spend_limit
  treat_missing_data  = "notBreaching"
  alarm_actions       = [var.sns_alert_warning_arn]

  # The only query with return_data set: this expression is what gets
  # compared against the threshold.
  # NOTE(review): if either input has no datapoint for a period the sum
  # presumably has none either - confirm whether FILL(<id>, 0) is wanted.
  metric_query {
    id          = "total_spend"
    expression  = "sns_spend + pinpoint_spend"
    label       = "Total SMS Monthly Spend"
    return_data = true
  }

  # `unit` is deliberately omitted on the two input metrics. CloudWatch only
  # returns datapoints published with the requested unit, so a mismatching
  # unit yields no data, and with treat_missing_data = "notBreaching" the
  # alarm would silently never fire.
  metric_query {
    id = "sns_spend"
    metric {
      metric_name = "SMSMonthToDateSpentUSD"
      namespace   = "AWS/SNS"
      period      = 300
      stat        = "Maximum"
    }
  }

  metric_query {
    id = "pinpoint_spend"
    metric {
      metric_name = "TextMessageMonthlySpend"
      namespace   = "AWS/SMSVoice"
      period      = 300
      stat        = "Maximum"
    }
  }
}

# Critical alarm: the sum of the SNS and Pinpoint month-to-date SMS spend
# metrics has reached 90% of var.sms_monthly_spend_limit.
resource "aws_cloudwatch_metric_alarm" "total-sms-spending-critical" {
  count               = var.cloudwatch_enabled ? 1 : 0
  alarm_name          = "total-sms-spending-critical"
  alarm_description   = "SMS spending reached 90% of limit this month"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  threshold           = 0.9 * var.sms_monthly_spend_limit
  treat_missing_data  = "notBreaching"
  # Routed to the critical topic (previously the warning topic) so that this
  # alarm is distinguishable from total-sms-spending-warning and consistent
  # with the other critical alarm in this file.
  alarm_actions = [var.sns_alert_critical_arn]

  # The only query with return_data set: this expression is what gets
  # compared against the threshold.
  # NOTE(review): if either input has no datapoint for a period the sum
  # presumably has none either - confirm whether FILL(<id>, 0) is wanted.
  metric_query {
    id          = "total_spend"
    expression  = "sns_spend + pinpoint_spend"
    label       = "Total SMS Monthly Spend"
    return_data = true
  }

  # `unit` is deliberately omitted on the two input metrics. CloudWatch only
  # returns datapoints published with the requested unit, so a mismatching
  # unit yields no data, and with treat_missing_data = "notBreaching" the
  # alarm would silently never fire.
  metric_query {
    id = "sns_spend"
    metric {
      metric_name = "SMSMonthToDateSpentUSD"
      namespace   = "AWS/SNS"
      period      = 300
      stat        = "Maximum"
    }
  }

  metric_query {
    id = "pinpoint_spend"
    metric {
      metric_name = "TextMessageMonthlySpend"
      namespace   = "AWS/SMSVoice"
      period      = 300
      stat        = "Maximum"
    }
  }
}

# Warning alarm: the ratio successes / (successes + failures), computed from
# the pinpoint-sms-successes and pinpoint-sms-failures log metric filters,
# stayed below 0.6 for two consecutive 12-hour periods.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-warning" {
  count               = var.cloudwatch_enabled ? 1 : 0
  alarm_name          = "pinpoint-sms-success-rate-warning"
  alarm_description   = "Pinpoint SMS success rate is below 60% over 2 consecutive periods of 12 hours"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 2
  datapoints_to_alarm = 2
  threshold           = 60 / 100
  alarm_actions       = [var.sns_alert_warning_arn]
  treat_missing_data  = "notBreaching"

  # The only query with return_data set: this ratio is what gets compared
  # against the threshold.
  metric_query {
    id          = "success_rate"
    expression  = "successes / (successes + failures)"
    label       = "Success Rate"
    return_data = true
  }

  # `unit` is deliberately omitted on the input metrics: the log metric
  # filters that publish them do not set a unit, so requesting "Count" here
  # would match no datapoints and, with treat_missing_data = "notBreaching",
  # the alarm would silently never fire.
  metric_query {
    id = "successes"
    metric {
      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].name
      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].namespace
      period      = 60 * 60 * 12
      stat        = "Sum"
    }
  }

  metric_query {
    id = "failures"
    metric {
      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].name
      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].namespace
      period      = 60 * 60 * 12
      stat        = "Sum"
    }
  }
}

# Critical alarm: the ratio successes / (successes + failures) stayed below
# 0.25 for two consecutive 12-hour periods. Notifies the critical topic on
# alarm and the OK topic on recovery.
# NOTE(review): the name and description mention Canadian numbers, but the
# successes/failures filters referenced here are the same ones used by
# pinpoint-sms-success-rate-warning - confirm whether a country-specific
# filter is intended.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-canadian-numbers-critical" {
  count               = var.cloudwatch_enabled ? 1 : 0
  alarm_name          = "pinpoint-sms-success-rate-canadian-numbers-critical"
  alarm_description   = "Pinpoint SMS success rate to Canadian numbers is below 25% over 2 consecutive periods of 12 hours"
  comparison_operator = "LessThanThreshold"
  evaluation_periods  = 2
  datapoints_to_alarm = 2
  threshold           = 25 / 100
  alarm_actions       = [var.sns_alert_critical_arn]
  ok_actions          = [var.sns_alert_ok_arn]
  treat_missing_data  = "notBreaching"

  # The only query with return_data set: this ratio is what gets compared
  # against the threshold.
  metric_query {
    id          = "success_rate"
    expression  = "successes / (successes + failures)"
    label       = "Success Rate"
    return_data = true
  }

  # `unit` is deliberately omitted on the input metrics: the log metric
  # filters that publish them do not set a unit, so requesting "Count" here
  # would match no datapoints and, with treat_missing_data = "notBreaching",
  # the alarm would silently never fire.
  metric_query {
    id = "successes"
    metric {
      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].name
      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].namespace
      period      = 60 * 60 * 12
      stat        = "Sum"
    }
  }

  metric_query {
    id = "failures"
    metric {
      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].name
      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].namespace
      period      = 60 * 60 * 12
      stat        = "Sum"
    }
  }
}

# Warning alarm: the pinpoint-sms-blocked-as-spam log metric summed to 10 or
# more within a single 12-hour period.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-blocked-as-spam-warning" {
  count      = var.cloudwatch_enabled ? 1 : 0
  alarm_name = "pinpoint-sms-blocked-as-spam-warning"
  # Wording matches the >= comparison below (the alarm fires at exactly 10).
  alarm_description   = "At least 10 Pinpoint SMS have been blocked as spam over 12 hours"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = aws_cloudwatch_log_metric_filter.pinpoint-sms-blocked-as-spam[0].metric_transformation[0].name
  namespace           = aws_cloudwatch_log_metric_filter.pinpoint-sms-blocked-as-spam[0].metric_transformation[0].namespace
  period              = 60 * 60 * 12
  statistic           = "Sum"
  threshold           = 10
  alarm_actions       = [var.sns_alert_warning_arn]
  treat_missing_data  = "notBreaching"
}

# Warning alarm: the pinpoint-sms-phone-carrier-unavailable log metric summed
# to 100 or more within a single 3-hour period.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-phone-carrier-unavailable-warning" {
  count      = var.cloudwatch_enabled ? 1 : 0
  alarm_name = "pinpoint-sms-phone-carrier-unavailable-warning"
  # Wording matches the >= comparison below (the alarm fires at exactly 100).
  alarm_description   = "At least 100 Pinpoint SMS failed because a phone carrier is unavailable over 3 hours"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = aws_cloudwatch_log_metric_filter.pinpoint-sms-phone-carrier-unavailable[0].metric_transformation[0].name
  namespace           = aws_cloudwatch_log_metric_filter.pinpoint-sms-phone-carrier-unavailable[0].metric_transformation[0].namespace
  period              = 60 * 60 * 3
  statistic           = "Sum"
  threshold           = 100
  alarm_actions       = [var.sns_alert_warning_arn]
  treat_missing_data  = "notBreaching"
}

# Warning alarm: the pinpoint-sms-rate-exceeded log metric summed to 1 or
# more within a single 5-minute period.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-rate-exceeded-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-rate-exceeded-warning"
  alarm_description = "At least 1 Pinpoint SMS rate exceeded error in 5 minutes"
  alarm_actions     = [var.sns_alert_warning_arn]

  # Metric published by the matching log metric filter.
  namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-rate-exceeded[0].metric_transformation[0].namespace
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-rate-exceeded[0].metric_transformation[0].name
  statistic   = "Sum"
  period      = 5 * 60

  # Evaluation: one period, sum >= 1; periods without data count as OK.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"
}
77 changes: 77 additions & 0 deletions aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_logs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,80 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint_to_sqs_sms_callbacks-500-e
value = "1"
}
}

###
# AWS CloudWatch Logs metric filters for Pinpoint SMS delivery events
###
# Emits 1 to LogMetrics/pinpoint-sms-blocked-as-spam for every event in the
# Pinpoint delivery-failures log group whose messageStatus is "SPAM".
# Event format reference:
# https://docs.aws.amazon.com/sms-voice/latest/userguide/configuration-sets-event-format.html
resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-blocked-as-spam" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "pinpoint-sms-blocked-as-spam"
  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
  pattern        = "{ $.messageStatus = \"SPAM\" }"

  metric_transformation {
    namespace     = "LogMetrics"
    name          = "pinpoint-sms-blocked-as-spam"
    value         = "1"
    default_value = "0"
  }
}

# Emits 1 to LogMetrics/pinpoint-sms-phone-carrier-unavailable for every event
# in the Pinpoint delivery-failures log group whose messageStatus is
# "CARRIER_UNREACHABLE".
# Event format reference:
# https://docs.aws.amazon.com/sms-voice/latest/userguide/configuration-sets-event-format.html
resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-phone-carrier-unavailable" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "pinpoint-sms-phone-carrier-unavailable"
  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
  pattern        = "{ $.messageStatus = \"CARRIER_UNREACHABLE\" }"

  metric_transformation {
    namespace     = "LogMetrics"
    name          = "pinpoint-sms-phone-carrier-unavailable"
    value         = "1"
    default_value = "0"
  }
}

# Emits 1 to LogMetrics/pinpoint-sms-rate-exceeded for every event in the
# Pinpoint delivery-failures log group whose messageStatusDescription is
# exactly "Rate exceeded.".
# Background: Canadian long code numbers are limited to 1 SMS per second per
# number, see
# https://docs.aws.amazon.com/sns/latest/dg/channels-sms-originating-identities-long-codes.html
resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-rate-exceeded" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "pinpoint-sms-rate-exceeded"
  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
  pattern        = "{ $.messageStatusDescription = \"Rate exceeded.\" }"

  metric_transformation {
    namespace     = "LogMetrics"
    name          = "pinpoint-sms-rate-exceeded"
    value         = "1"
    default_value = "0"
  }
}

# Emits 1 to LogMetrics/pinpoint-sms-successes for every final event
# (isFinal true) in the Pinpoint deliveries log group whose messageStatus is
# "SUCCESSFUL" or "DELIVERED".
resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-successes" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "pinpoint-sms-successes"
  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries.name
  pattern        = "{ ($.isFinal IS TRUE) && ( ($.messageStatus = \"SUCCESSFUL\") || ($.messageStatus = \"DELIVERED\") ) }"

  metric_transformation {
    namespace     = "LogMetrics"
    name          = "pinpoint-sms-successes"
    value         = "1"
    default_value = "0"
  }
}

# Emits 1 to LogMetrics/pinpoint-sms-failures for every final event
# (isFinal true) in the Pinpoint delivery-failures log group whose
# messageStatus is neither "SUCCESSFUL" nor "DELIVERED".
resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-failures" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "pinpoint-sms-failures"
  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
  pattern        = "{ ($.isFinal IS TRUE) && ( ($.messageStatus != \"SUCCESSFUL\") && ($.messageStatus != \"DELIVERED\") ) }"

  metric_transformation {
    namespace     = "LogMetrics"
    name          = "pinpoint-sms-failures"
    value         = "1"
    default_value = "0"
  }
}
5 changes: 5 additions & 0 deletions aws/pinpoint_to_sqs_sms_callbacks/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ variable "pinpoint_to_sqs_sms_callbacks_ecr_arn" {
description = "The ARN of the ECR repository for the pinpoint_to_sqs_sms_callbacks image"
}

# Required input (no default): the spend limit that the total-sms-spending
# alarms scale their thresholds from.
variable "sms_monthly_spend_limit" {
  description = "The total monthly spending limit for SMS (SNS plus Pinpoint)"
  type        = number
}

variable "force_delete_ecr" {
description = "Boolean value to decide whether or not to force delete a non-empty ECR"
type = bool
Expand Down
2 changes: 2 additions & 0 deletions env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependency "common" {
sns_alert_critical_arn = ""
sns_alert_ok_arn = ""
sqs_deliver_receipts_queue_arn = ""
sns_monthly_spend_limit = 1
}
}

Expand All @@ -39,6 +40,7 @@ inputs = {
sqs_deliver_receipts_queue_arn = dependency.common.outputs.sqs_deliver_receipts_queue_arn
pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url
pinpoint_to_sqs_sms_callbacks_ecr_arn = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn
sms_monthly_spend_limit = dependency.common.outputs.sns_monthly_spend_limit
}

terraform {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ dependency "common" {
sns_alert_critical_arn = ""
sns_alert_ok_arn = ""
sqs_deliver_receipts_queue_arn = ""

sns_monthly_spend_limit = 1
}
}

Expand All @@ -45,4 +45,5 @@ inputs = {
sqs_deliver_receipts_queue_arn = dependency.common.outputs.sqs_deliver_receipts_queue_arn
pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url
pinpoint_to_sqs_sms_callbacks_ecr_arn = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn
sms_monthly_spend_limit = dependency.common.outputs.sns_monthly_spend_limit
}
3 changes: 2 additions & 1 deletion env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependency "common" {
sns_alert_critical_arn = ""
sns_alert_ok_arn = ""
sqs_deliver_receipts_queue_arn = ""

sns_monthly_spend_limit = 1
}
}

Expand All @@ -40,6 +40,7 @@ inputs = {
sqs_deliver_receipts_queue_arn = dependency.common.outputs.sqs_deliver_receipts_queue_arn
pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url
pinpoint_to_sqs_sms_callbacks_ecr_arn = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn
sms_monthly_spend_limit = dependency.common.outputs.sns_monthly_spend_limit
}

terraform {
Expand Down
Loading