Skip to content

Commit

Permalink
SMS (SNS) stored queries from CWI into Terraform / SMS (pinpoint) new…
Browse files Browse the repository at this point in the history
… queries (#1574)

* Added SMS (SNS) stored queries from CWI into Terraform

* Added pinpoint-carrier-dwell-times CWLI query

* Added the pinpoint-failures-by-carrier query

* Added coalesce + labels

* Added warning alarms for the main telecoms when one of these fail.

* Rework the international sending query on the SNS side

* Brought forward SNS queries into Pinpoint + fixed queries

* Performed a few renames and fixed indices refs

* Added optional switch to the new log metric filter.

* Trigger AI code [review]

* Fixed the typo

* Renamed duplicate query

* Renamed duplicate alarm names

* Added count on pinpoint log groups, expect build failures
  • Loading branch information
jimleroyer authored Oct 15, 2024
1 parent 2c24989 commit 3c5df00
Show file tree
Hide file tree
Showing 8 changed files with 503 additions and 17 deletions.
118 changes: 118 additions & 0 deletions aws/common/cloudwatch_queries.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Saved CloudWatch Logs Insights query: the 100 most recent entries of the SNS
# delivery-failures log group whose provider response contains "spam".
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-blocked-as-spam" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / Block as spam"

  log_group_names = [
    aws_cloudwatch_log_group.sns_deliveries_failures[0].name
  ]

  # Everything between the heredoc markers is stored verbatim as the query
  # text, so it must contain Logs Insights syntax only. A stray closing brace
  # that used to follow the `limit` line was removed because it made the saved
  # query invalid.
  query_string = <<QUERY
fields @timestamp as Timestamp, delivery.phoneCarrier as Carrier, delivery.providerResponse as `Provider response`, delivery.destination as `Destination phone number`
| filter delivery.providerResponse like 'spam'
| sort Timestamp desc
| limit 100
QUERY
}

# Saved CloudWatch Logs Insights query: per-carrier average of
# delivery.dwellTimeMsUntilDeviceAck converted from milliseconds to minutes,
# together with the number of log events, across both SNS delivery log groups.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-carrier-dwell-times" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / Carrier dwell times"

  log_group_names = [
    aws_cloudwatch_log_group.sns_deliveries[0].name,
    aws_cloudwatch_log_group.sns_deliveries_failures[0].name
  ]

  # Both aggregations belong to a single `stats` command and are separated by
  # a comma. The previous text put a pipe in front of `count(*)`, which left a
  # dangling comma and turned the second aggregation into an invalid
  # standalone command.
  query_string = <<QUERY
stats avg(delivery.dwellTimeMsUntilDeviceAck / 1000 / 60) as Avg_carrier_time_minutes, count(*) as Number by delivery.phoneCarrier as Carrier
QUERY
}

# Saved CloudWatch Logs Insights query: the 200 newest records of the SNS
# delivery-failures log group whose status is FAILURE, including carrier,
# provider response, destination number, message id and the raw message.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-get-failures" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / Get failures"

  log_group_names = [aws_cloudwatch_log_group.sns_deliveries_failures[0].name]

  # Indented heredoc (<<-): the common leading whitespace is stripped, so the
  # stored query text starts at column 0.
  query_string = <<-QUERY
  fields @timestamp as Timestamp, status, delivery.phoneCarrier as Carrier, delivery.providerResponse as `Provider response`, delivery.destination as `Destination phone number`, notification.messageId as messageId, @message
  | filter status = 'FAILURE'
  | sort Timestamp desc
  | limit 200
  QUERY
}

# Saved CloudWatch Logs Insights query: the 100 newest SNS delivery records
# (successes and failures) whose destination matches a phone number pattern.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-get-sms-logs-by-phone-number" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / Get SMS logs by phone number"

  log_group_names = [
    aws_cloudwatch_log_group.sns_deliveries[0].name,
    aws_cloudwatch_log_group.sns_deliveries_failures[0].name,
  ]

  # NOTE(review): '1416xxxxxxx' looks like a placeholder the operator replaces
  # with the real number before running the query — confirm.
  # Indented heredoc (<<-): the common leading whitespace is stripped, so the
  # stored query text starts at column 0.
  query_string = <<-QUERY
  fields @timestamp as Timestamp, status as Status, notification.messageId as `Message ID`,
  delivery.destination as `Destination phone number`, delivery.providerResponse as `Provider response`,
  delivery.smsType as `Message type`
  | filter delivery.destination like '1416xxxxxxx'
  | sort Timestamp desc
  | limit 100
  QUERY
}

# Saved CloudWatch Logs Insights query: number of SNS delivery events grouped
# by delivery.mcc (aliased CountryCode) and status, sorted by CountryCode and
# capped at 200 rows. Reads both SNS delivery log groups.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-international-sending-status" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / International sending status"

  log_group_names = [
    aws_cloudwatch_log_group.sns_deliveries[0].name,
    aws_cloudwatch_log_group.sns_deliveries_failures[0].name,
  ]

  # Indented heredoc (<<-): the common leading whitespace is stripped, so the
  # stored query text starts at column 0.
  query_string = <<-QUERY
  fields @timestamp, @message, delivery.mcc as CountryCode, status
  | stats count(*) as Event_Count by CountryCode, status
  | display CountryCode, status, Event_Count
  | sort CountryCode asc
  | limit 200
  QUERY
}

# Saved CloudWatch Logs Insights query: hourly counts of provider responses
# matching the "Phone is currently unreachable/..." pattern, of responses
# matching the "Message has been ... by phone" pattern, and of all events.
# Reads both SNS delivery log groups.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-success-vs-unreachable" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / Success vs Unreachable"

  log_group_names = [
    aws_cloudwatch_log_group.sns_deliveries[0].name,
    aws_cloudwatch_log_group.sns_deliveries_failures[0].name,
  ]

  # Each `parse` extracts a field only when its pattern matches, so
  # count(@unavailable) / count(@available) count the matching events.
  # Indented heredoc (<<-): the common leading whitespace is stripped, so the
  # stored query text starts at column 0.
  query_string = <<-QUERY
  fields @timestamp, delivery.providerResponse
  | parse delivery.providerResponse "Phone is currently unreachable/*" as @unavailable
  | parse delivery.providerResponse "Message has been * by phone" as @available
  | sort @timestamp desc
  | stats count(@unavailable), count(@available), count(*) by bin(1h)
  QUERY
}

# Saved CloudWatch Logs Insights query: the 20 newest entries of the SNS
# delivery-failures log group whose provider response contains
# "Phone is currently unreachable/unavailable".
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_query_definition" "sms-sns-unreachable-phone-numbers" {
  count = var.cloudwatch_enabled ? 1 : 0
  name  = "SMS (SNS) / Unreachable phone numbers"

  log_group_names = [aws_cloudwatch_log_group.sns_deliveries_failures[0].name]

  # NOTE(review): only the timestamp and provider response are selected; the
  # destination number is not part of the output despite the query name.
  # Indented heredoc (<<-): the common leading whitespace is stripped, so the
  # stored query text starts at column 0.
  query_string = <<-QUERY
  fields @timestamp, delivery.providerResponse
  | filter delivery.providerResponse like "Phone is currently unreachable/unavailable"
  | sort @timestamp desc
  | limit 20
  QUERY
}
189 changes: 189 additions & 0 deletions aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,195 @@ resource "aws_cloudwatch_metric_alarm" "total-sms-spending-critical" {
}
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Bell Cellular Inc. / Aliant Telecom" Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-bell-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-bell-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Bell Cellular Inc. / Aliant Telecom."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Bell Cellular Inc. / Aliant Telecom"
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "BRAGG Communications INC." Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-bragg-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-bragg-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for BRAGG Communications INC."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "BRAGG Communications INC."
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Freedom Mobile Inc." Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-freedom-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-freedom-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Freedom Mobile Inc."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Freedom Mobile Inc."
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Iristel Inc." Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-iristel-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-iristel-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Iristel Inc."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Iristel Inc."
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Maritime Telephone & Telegraph Ltd" Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-maritime-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-maritime-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Maritime Telephone & Telegraph Ltd."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  # NOTE(review): the description ends with "Ltd." while the dimension value
  # is "Ltd" (no period). The dimension must equal the emitted carrier name
  # exactly for the alarm to see data — confirm against real log events.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Maritime Telephone & Telegraph Ltd"
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "MTS Communications Inc." Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-mts-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-mts-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for MTS Communications Inc."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "MTS Communications Inc."
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Rogers Communications Canada Inc." Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-rogers-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-rogers-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Rogers Communications Canada Inc."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Rogers Communications Canada Inc."
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Telus Communications" Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-telus-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-telus-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Telus Communications"

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Telus Communications"
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

# Warning alarm on the per-carrier Pinpoint SMS failure metric for the
# "Videotron Ltd." Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-failures-videotron-warning" {
  count = var.cloudwatch_enabled ? 1 : 0

  alarm_name        = "pinpoint-sms-failures-videotron-warning"
  alarm_description = "Pinpoint SMS failures are more than 50 for Videotron Ltd."

  # Metric produced by the pinpoint-sms-failures-carriers log metric filter,
  # narrowed to a single carrier through the Carrier dimension.
  namespace   = "LogMetrics"
  metric_name = aws_cloudwatch_log_metric_filter.pinpoint-sms-failures-carriers[0].metric_transformation[0].name
  dimensions = {
    Carrier = "Videotron Ltd."
  }

  # Alarm when the Sum over one 300-second period is greater than 50.
  statistic           = "Sum"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "50" # 50 over a period of 5 minutes

  # Periods without any data point are treated as not breaching.
  treat_missing_data = "notBreaching"

  # The warning topic is notified both on ALARM and on return to OK.
  alarm_actions = [var.sns_alert_warning_arn]
  ok_actions    = [var.sns_alert_warning_arn]
}

resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-warning" {
count = var.cloudwatch_enabled ? 1 : 0
alarm_name = "pinpoint-sms-success-rate-warning"
Expand Down
29 changes: 24 additions & 5 deletions aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_logs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

resource "aws_cloudwatch_log_group" "pinpoint_deliveries" {
count = var.cloudwatch_enabled ? 1 : 0
name = "sns/${var.region}/${var.account_id}/PinpointDirectPublishToPhoneNumber"
retention_in_days = var.sensitive_log_retention_period_days
tags = {
Expand All @@ -11,6 +12,7 @@ resource "aws_cloudwatch_log_group" "pinpoint_deliveries" {
}

resource "aws_cloudwatch_log_group" "pinpoint_deliveries_failures" {
count = var.cloudwatch_enabled ? 1 : 0
name = "sns/${var.region}/${var.account_id}/PinpointDirectPublishToPhoneNumber/Failure"
retention_in_days = var.sensitive_log_retention_period_days
tags = {
Expand Down Expand Up @@ -54,7 +56,7 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-blocked-as-spam" {
name = "pinpoint-sms-blocked-as-spam"
# See https://docs.aws.amazon.com/sms-voice/latest/userguide/configuration-sets-event-format.html
pattern = "{ $.messageStatus = \"SPAM\" }"
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures[0].name

metric_transformation {
name = "pinpoint-sms-blocked-as-spam"
Expand All @@ -69,7 +71,7 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-phone-carrier-unavaila
name = "pinpoint-sms-phone-carrier-unavailable"
# See https://docs.aws.amazon.com/sms-voice/latest/userguide/configuration-sets-event-format.html
pattern = "{ $.messageStatus = \"CARRIER_UNREACHABLE\" }"
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures[0].name

metric_transformation {
name = "pinpoint-sms-phone-carrier-unavailable"
Expand All @@ -85,7 +87,7 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-rate-exceeded" {
# https://docs.aws.amazon.com/sns/latest/dg/channels-sms-originating-identities-long-codes.html
# Canadian long code numbers are limited at 1 SMS per second/number
pattern = "{ $.messageStatusDescription = \"Rate exceeded.\" }"
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures[0].name

metric_transformation {
name = "pinpoint-sms-rate-exceeded"
Expand All @@ -99,7 +101,7 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-successes" {
count = var.cloudwatch_enabled ? 1 : 0
name = "pinpoint-sms-successes"
pattern = "{ ($.isFinal IS TRUE) && ( ($.messageStatus = \"SUCCESSFUL\") || ($.messageStatus = \"DELIVERED\") ) }"
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries.name
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries[0].name

metric_transformation {
name = "pinpoint-sms-successes"
Expand All @@ -113,7 +115,7 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-failures" {
count = var.cloudwatch_enabled ? 1 : 0
name = "pinpoint-sms-failures"
pattern = "{ ($.isFinal IS TRUE) && ( ($.messageStatus != \"SUCCESSFUL\") && ($.messageStatus != \"DELIVERED\") ) }"
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures.name
log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures[0].name

metric_transformation {
name = "pinpoint-sms-failures"
Expand All @@ -122,3 +124,20 @@ resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-failures" {
default_value = "0"
}
}

# Log metric filter on the Pinpoint delivery-failures log group. Emits the
# value 1 into LogMetrics/pinpoint-sms-failures-carriers for every event that
# is final, has a non-empty carrierName, and whose messageStatus is neither
# SUCCESSFUL nor DELIVERED. Each data point carries the event's carrierName
# as the Carrier dimension.
# Only created when var.cloudwatch_enabled is true.
resource "aws_cloudwatch_log_metric_filter" "pinpoint-sms-failures-carriers" {
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "pinpoint-sms-failures-carriers"
  log_group_name = aws_cloudwatch_log_group.pinpoint_deliveries_failures[0].name
  pattern        = "{ ($.isFinal IS TRUE) && ($.carrierName != \"\" && ( ($.messageStatus != \"SUCCESSFUL\") && ($.messageStatus != \"DELIVERED\") )) }"

  metric_transformation {
    namespace = "LogMetrics"
    name      = "pinpoint-sms-failures-carriers"
    value     = "1"

    # One time series per distinct carrier name found in the log events.
    # No default_value is set on this transformation.
    dimensions = {
      Carrier = "$.carrierName"
    }
  }
}
Loading

0 comments on commit 3c5df00

Please sign in to comment.