diff --git a/aws/common/outputs.tf b/aws/common/outputs.tf
index 8a92cc6f0..5dd4a245d 100644
--- a/aws/common/outputs.tf
+++ b/aws/common/outputs.tf
@@ -208,3 +208,7 @@ output "subnet_ids" {
 output "subnet_cidr_blocks" {
   value = aws_subnet.notification-canada-ca-private[*].cidr_block
 }
+
+output "sns_monthly_spend_limit" {
+  value = var.sns_monthly_spend_limit
+}
diff --git a/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf b/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf
index cf825feb6..bc5387995 100644
--- a/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf
+++ b/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf
@@ -72,3 +72,217 @@ resource "aws_cloudwatch_metric_alarm" "lambda-image-pinpoint-delivery-receipts-
     FunctionName = module.pinpoint_to_sqs_sms_callbacks.function_name
   }
 }
+
+# Warning: month-to-date SMS spend (SNS plus Pinpoint) has reached 80% of var.sms_monthly_spend_limit.
+# The metric queries deliberately set no `unit`: a unit that differs from the published
+# one yields no datapoints, which treat_missing_data = "notBreaching" would silently mask.
+resource "aws_cloudwatch_metric_alarm" "total-sms-spending-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "total-sms-spending-warning"
+  alarm_description   = "SMS spending reached 80% of limit this month"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  threshold           = 0.8 * var.sms_monthly_spend_limit
+  treat_missing_data  = "notBreaching"
+  alarm_actions       = [var.sns_alert_warning_arn]
+
+  metric_query {
+    id          = "total_spend"
+    expression  = "sns_spend + pinpoint_spend"
+    label       = "Total SMS Monthly Spend"
+    return_data = "true"
+  }
+
+  metric_query {
+    id = "sns_spend"
+    metric {
+      metric_name = "SMSMonthToDateSpentUSD"
+      namespace   = "AWS/SNS"
+      period      = 300
+      stat        = "Maximum"
+    }
+  }
+
+  metric_query {
+    id = "pinpoint_spend"
+    metric {
+      metric_name = "TextMessageMonthlySpend"
+      namespace   = "AWS/SMSVoice"
+      period      = 300
+      stat        = "Maximum"
+    }
+  }
+}
+
+# Critical: month-to-date SMS spend (SNS plus Pinpoint) has reached 90% of var.sms_monthly_spend_limit.
+# Notifies the critical topic and sends an OK notification once the alarm clears.
+resource "aws_cloudwatch_metric_alarm" "total-sms-spending-critical" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "total-sms-spending-critical"
+  alarm_description   = "SMS spending reached 90% of limit this month"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  threshold           = 0.9 * var.sms_monthly_spend_limit
+  treat_missing_data  = "notBreaching"
+  alarm_actions       = [var.sns_alert_critical_arn]
+  ok_actions          = [var.sns_alert_ok_arn]
+
+  metric_query {
+    id          = "total_spend"
+    expression  = "sns_spend + pinpoint_spend"
+    label       = "Total SMS Monthly Spend"
+    return_data = "true"
+  }
+
+  metric_query {
+    id = "sns_spend"
+    metric {
+      metric_name = "SMSMonthToDateSpentUSD"
+      namespace   = "AWS/SNS"
+      period      = 300
+      stat        = "Maximum"
+    }
+  }
+
+  metric_query {
+    id = "pinpoint_spend"
+    metric {
+      metric_name = "TextMessageMonthlySpend"
+      namespace   = "AWS/SMSVoice"
+      period      = 300
+      stat        = "Maximum"
+    }
+  }
+}
+
+# Warning: successes / (successes + failures), computed from the Pinpoint log metrics,
+# stayed below 60% in each of the last two 12-hour periods.
+resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "pinpoint-sms-success-rate-warning"
+  alarm_description   = "Pinpoint SMS success rate is below 60% over 2 consecutive periods of 12 hours"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "2"
+  datapoints_to_alarm = "2"
+  threshold           = 60 / 100
+  alarm_actions       = [var.sns_alert_warning_arn]
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "success_rate"
+    expression  = "successes / (successes + failures)"
+    label       = "Success Rate"
+    return_data = "true"
+  }
+
+  metric_query {
+    id = "successes"
+    metric {
+      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].name
+      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].namespace
+      period      = 60 * 60 * 12
+      stat        = "Sum"
+    }
+  }
+
+  metric_query {
+    id = "failures"
+    metric {
+      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].name
+      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].namespace
+      period      = 60 * 60 * 12
+      stat        = "Sum"
+    }
+  }
+}
+
+# Critical: the same success-rate expression stayed below 25% in each of the last two 12-hour periods.
+# NOTE(review): the name and description mention Canadian numbers, but the metric filters
+# feeding this alarm do not filter on destination country - confirm intent.
+resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-canadian-numbers-critical" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "pinpoint-sms-success-rate-canadian-numbers-critical"
+  alarm_description   = "Pinpoint SMS success rate to Canadian numbers is below 25% over 2 consecutive periods of 12 hours"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "2"
+  datapoints_to_alarm = "2"
+  threshold           = 25 / 100
+  alarm_actions       = [var.sns_alert_critical_arn]
+  ok_actions          = [var.sns_alert_ok_arn]
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "success_rate"
+    expression  = "successes / (successes + failures)"
+    label       = "Success Rate"
+    return_data = "true"
+  }
+
+  metric_query {
+    id = "successes"
+    metric {
+      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].name
+      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-successes[0].metric_transformation[0].namespace
+      period      = 60 * 60 * 12
+      stat        = "Sum"
+    }
+  }
+
+  metric_query {
+    id = "failures"
+    metric {
+      metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].name
+      namespace   = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures[0].metric_transformation[0].namespace
+      period      = 60 * 60 * 12
+      stat        = "Sum"
+    }
+  }
+}
+
+# Warning: the spam log metric summed to 10 or more within a single 12-hour period.
+resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-blocked-as-spam-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "pinpoint-sms-blocked-as-spam-warning"
+  alarm_description   = "At least 10 Pinpoint SMS have been blocked as spam over 12 hours"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.pinpoint-sms-blocked-as-spam[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.pinpoint-sms-blocked-as-spam[0].metric_transformation[0].namespace
+  period              = 60 * 60 * 12
+  statistic           = "Sum"
+  threshold           = 10
+  alarm_actions       = [var.sns_alert_warning_arn]
+  treat_missing_data  = "notBreaching"
+}
+
+# Warning: the carrier-unreachable log metric summed to 100 or more within a single 3-hour period.
+resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-phone-carrier-unavailable-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "pinpoint-sms-phone-carrier-unavailable-warning"
+  alarm_description   = "At least 100 Pinpoint SMS failed because a phone carrier is unavailable over 3 hours"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.pinpoint-sms-phone-carrier-unavailable[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.pinpoint-sms-phone-carrier-unavailable[0].metric_transformation[0].namespace
+  period              = 60 * 60 * 3
+  statistic           = "Sum"
+  threshold           = 100
+  alarm_actions       = [var.sns_alert_warning_arn]
+  treat_missing_data  = "notBreaching"
+}
+
+# Warning: the rate-exceeded log metric summed to 1 or more within a single 5-minute period.
+resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-rate-exceeded-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "pinpoint-sms-rate-exceeded-warning"
+  alarm_description   = "At least 1 Pinpoint SMS rate exceeded error in 5 minutes"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.pinpoint-sms-rate-exceeded[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.pinpoint-sms-rate-exceeded[0].metric_transformation[0].namespace
+  period              = 60 * 5
+  statistic           = "Sum"
+  threshold           = 1
+  alarm_actions       = [var.sns_alert_warning_arn]
+  treat_missing_data  = "notBreaching"
+}
diff --git a/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_logs.tf b/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_logs.tf
index 58317ce05..4603bb439 100644
--- a/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_logs.tf
+++ b/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_logs.tf
@@ -45,3 +45,82 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint_to_sqs_sms_callbacks-500-e
     value = "1"
   }
 }
+
+###
+# AWS CloudWatch Logs Metrics
+###
+resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-blocked-as-spam" {
+  count = var.cloudwatch_enabled ? 1 : 0
+  name  = "pinpoint-sms-blocked-as-spam"
+  # See https://docs.aws.amazon.com/sms-voice/latest/userguide/configuration-sets-event-format.html
+  pattern        = "{ $.messageStatus = \"SPAM\" }"
+  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
+
+  metric_transformation {
+    name          = "pinpoint-sms-blocked-as-spam"
+    namespace     = "LogMetrics"
+    value         = "1"
+    default_value = "0"
+  }
+}
+
+resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-phone-carrier-unavailable" {
+  count = var.cloudwatch_enabled ? 1 : 0
+  name  = "pinpoint-sms-phone-carrier-unavailable"
+  # See https://docs.aws.amazon.com/sms-voice/latest/userguide/configuration-sets-event-format.html
+  pattern        = "{ $.messageStatus = \"CARRIER_UNREACHABLE\" }"
+  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
+
+  metric_transformation {
+    name          = "pinpoint-sms-phone-carrier-unavailable"
+    namespace     = "LogMetrics"
+    value         = "1"
+    default_value = "0"
+  }
+}
+
+resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-rate-exceeded" {
+  count = var.cloudwatch_enabled ? 1 : 0
+  name  = "pinpoint-sms-rate-exceeded"
+  # https://docs.aws.amazon.com/sns/latest/dg/channels-sms-originating-identities-long-codes.html
+  # Canadian long code numbers are limited to 1 SMS per second per number
+  pattern        = "{ $.messageStatusDescription = \"Rate exceeded.\" }"
+  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
+
+  metric_transformation {
+    name          = "pinpoint-sms-rate-exceeded"
+    namespace     = "LogMetrics"
+    value         = "1"
+    default_value = "0"
+  }
+}
+
+# Counts final Pinpoint delivery events whose status is SUCCESSFUL or DELIVERED.
+resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-successes" {
+  count          = var.cloudwatch_enabled ? 1 : 0
+  name           = "pinpoint-sms-successes"
+  pattern        = "{ ($.isFinal IS TRUE) && ( ($.messageStatus = \"SUCCESSFUL\") || ($.messageStatus = \"DELIVERED\") ) }"
+  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries.name
+
+  metric_transformation {
+    name          = "pinpoint-sms-successes"
+    namespace     = "LogMetrics"
+    value         = "1"
+    default_value = "0"
+  }
+}
+
+# Counts final Pinpoint delivery events whose status is neither SUCCESSFUL nor DELIVERED.
+resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-failures" {
+  count          = var.cloudwatch_enabled ? 1 : 0
+  name           = "pinpoint-sms-failures"
+  pattern        = "{ ($.isFinal IS TRUE) && ( ($.messageStatus != \"SUCCESSFUL\") && ($.messageStatus != \"DELIVERED\") ) }"
+  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
+
+  metric_transformation {
+    name          = "pinpoint-sms-failures"
+    namespace     = "LogMetrics"
+    value         = "1"
+    default_value = "0"
+  }
+}
diff --git a/aws/pinpoint_to_sqs_sms_callbacks/variables.tf b/aws/pinpoint_to_sqs_sms_callbacks/variables.tf
index f4f47c01b..3bcd541d7 100644
--- a/aws/pinpoint_to_sqs_sms_callbacks/variables.tf
+++ b/aws/pinpoint_to_sqs_sms_callbacks/variables.tf
@@ -35,6 +35,11 @@ variable "pinpoint_to_sqs_sms_callbacks_ecr_arn" {
   description = "The ARN of the ECR repository for the pinpoint_to_sqs_sms_callbacks image"
 }
 
+variable "sms_monthly_spend_limit" {
+  type        = number
+  description = "The total monthly spending limit for SMS (SNS plus Pinpoint)"
+}
+
 variable "force_delete_ecr" {
   description = "Boolean value to decide whether or not to force delete a non-empty ECR"
   type        = bool
diff --git a/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl b/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
index 56d51c52f..51cd1a8cc 100644
--- a/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
+++ b/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
@@ -14,6 +14,7 @@ dependency "common" {
     sns_alert_critical_arn         = ""
     sns_alert_ok_arn               = ""
     sqs_deliver_receipts_queue_arn = ""
+    sns_monthly_spend_limit        = 1
   }
 }
 
@@ -39,6 +40,7 @@ inputs = {
   sqs_deliver_receipts_queue_arn                   = dependency.common.outputs.sqs_deliver_receipts_queue_arn
   pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url
   pinpoint_to_sqs_sms_callbacks_ecr_arn            = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn
+  sms_monthly_spend_limit                          = dependency.common.outputs.sns_monthly_spend_limit
 }
 
 terraform {
diff --git a/env/production/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl b/env/production/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
index d3aff88dc..dfa753801 100644
--- a/env/production/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
+++ b/env/production/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
@@ -19,7 +19,7 @@ dependency "common" {
     sns_alert_critical_arn         = ""
     sns_alert_ok_arn               = ""
     sqs_deliver_receipts_queue_arn = ""
-
+    sns_monthly_spend_limit        = 1
   }
 }
 
@@ -45,4 +45,5 @@ inputs = {
   sqs_deliver_receipts_queue_arn                   = dependency.common.outputs.sqs_deliver_receipts_queue_arn
   pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url
   pinpoint_to_sqs_sms_callbacks_ecr_arn            = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn
+  sms_monthly_spend_limit                          = dependency.common.outputs.sns_monthly_spend_limit
 }
diff --git a/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl b/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
index 2851955f7..c0fddf361 100644
--- a/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
+++ b/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl
@@ -14,7 +14,7 @@ dependency "common" {
     sns_alert_critical_arn         = ""
     sns_alert_ok_arn               = ""
     sqs_deliver_receipts_queue_arn = ""
-
+    sns_monthly_spend_limit        = 1
   }
 }
 
@@ -40,6 +40,7 @@ inputs = {
   sqs_deliver_receipts_queue_arn                   = dependency.common.outputs.sqs_deliver_receipts_queue_arn
   pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url
   pinpoint_to_sqs_sms_callbacks_ecr_arn            = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn
+  sms_monthly_spend_limit                          = dependency.common.outputs.sns_monthly_spend_limit
 }
 
 terraform {