From 60541e4a6af16f05885516d5c85d4f97de32ab55 Mon Sep 17 00:00:00 2001 From: Stephen Astels Date: Mon, 10 Jun 2024 11:59:20 -0400 Subject: [PATCH 1/2] rough in pinpoint dashboard --- .../dashboards.tf | 518 ++++++++++++++++++ 1 file changed, 518 insertions(+) create mode 100644 aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf diff --git a/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf b/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf new file mode 100644 index 000000000..8d844c049 --- /dev/null +++ b/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf @@ -0,0 +1,518 @@ +resource "aws_cloudwatch_dashboard" "pinpoint" { + count = var.cloudwatch_enabled ? 1 : 0 + dashboard_name = "Pinpoint" + dashboard_body = < Date: Mon, 10 Jun 2024 17:00:32 -0400 Subject: [PATCH 2/2] switch to pinpoint sms info --- aws/common/outputs.tf | 28 ++++++++++ .../cloudwatch_alarms.tf | 2 +- .../dashboards.tf | 54 +++++++------------ .../variables.tf | 35 ++++++++++++ .../terragrunt.hcl | 14 +++++ .../terragrunt.hcl | 14 +++++ 6 files changed, 112 insertions(+), 35 deletions(-) diff --git a/aws/common/outputs.tf b/aws/common/outputs.tf index 5dd4a245d..a40b71eab 100644 --- a/aws/common/outputs.tf +++ b/aws/common/outputs.tf @@ -212,3 +212,31 @@ output "subnet_cidr_blocks" { output "sns_monthly_spend_limit" { value = var.sns_monthly_spend_limit } + +output "celery_queue_prefix" { + value = var.celery_queue_prefix +} + +output "sqs_send_sms_high_queue_delay_warning_arn" { + value = var.cloudwatch_enabled ? aws_cloudwatch_metric_alarm.sqs-send-sms-high-queue-delay-warning[0].arn : "" +} + +output "sqs_send_sms_high_queue_delay_critical_arn" { + value = var.cloudwatch_enabled ? aws_cloudwatch_metric_alarm.sqs-send-sms-high-queue-delay-critical[0].arn : "" +} + +output "sqs_send_sms_medium_queue_delay_warning_arn" { + value = var.cloudwatch_enabled ? aws_cloudwatch_metric_alarm.sqs-send-sms-medium-queue-delay-warning[0].arn : "" +} + +output "sqs_send_sms_medium_queue_delay_critical_arn" { + value = var.cloudwatch_enabled ? aws_cloudwatch_metric_alarm.sqs-send-sms-medium-queue-delay-critical[0].arn : "" +} + +output "sqs_send_sms_low_queue_delay_warning_arn" { + value = var.cloudwatch_enabled ? aws_cloudwatch_metric_alarm.sqs-send-sms-low-queue-delay-warning[0].arn : "" +} + +output "sqs_send_sms_low_queue_delay_critical_arn" { + value = var.cloudwatch_enabled ? aws_cloudwatch_metric_alarm.sqs-send-sms-low-queue-delay-critical[0].arn : "" +} diff --git a/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf b/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf index bc5387995..e059f65f9 100644 --- a/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf +++ b/aws/pinpoint_to_sqs_sms_callbacks/cloudwatch_alarms.tf @@ -194,7 +194,7 @@ resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-warning" { } } -resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-canadian-numbers-critical" { +resource "aws_cloudwatch_metric_alarm" "pinpoint-sms-success-rate-critical" { count = var.cloudwatch_enabled ? 1 : 0 alarm_name = "pinpoint-sms-success-rate-canadian-numbers-critical" alarm_description = "Pinpoint SMS success rate to Canadian numbers is below 25% over 2 consecutive periods of 12 hours" diff --git a/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf b/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf index 8d844c049..b3906643b 100644 --- a/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf +++ b/aws/pinpoint_to_sqs_sms_callbacks/dashboards.tf @@ -13,16 +13,16 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "properties": { "title": "Alarms", "alarms": [ - "${aws_cloudwatch_metric_alarm.sns-sms-success-rate-canadian-numbers-critical[0].arn}", - "${aws_cloudwatch_metric_alarm.sns-sms-success-rate-canadian-numbers-warning[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-send-sms-high-queue-delay-warning[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-send-sms-high-queue-delay-critical[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-send-sms-medium-queue-delay-warning[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-send-sms-medium-queue-delay-critical[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-send-sms-low-queue-delay-warning[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-send-sms-low-queue-delay-critical[0].arn}", - "${aws_cloudwatch_metric_alarm.sns-spending-critical[0].arn}", - "${aws_cloudwatch_metric_alarm.sns-spending-warning[0].arn}" + "${aws_cloudwatch_metric_alarm.pinpoint-sms-success-rate-critical[0].arn}", + "${aws_cloudwatch_metric_alarm.pinpoint-sms-success-rate-warning[0].arn}", + "${var.sqs_send_sms_high_queue_delay_warning_arn}", + "${var.sqs_send_sms_high_queue_delay_critical_arn}", + "${var.sqs_send_sms_medium_queue_delay_warning_arn}", + "${var.sqs_send_sms_medium_queue_delay_critical_arn}", + "${var.sqs_send_sms_low_queue_delay_warning_arn}", + "${var.sqs_send_sms_low_queue_delay_critical_arn}", + "${aws_cloudwatch_metric_alarm.total-sms-spending-critical[0].arn}", + "${aws_cloudwatch_metric_alarm.total-sms-spending-warning[0].arn}" ] } }, @@ -34,7 +34,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "type": "metric", "properties": { "metrics": [ - [ "AWS/SNS", "NumberOfNotificationsDelivered", "PhoneNumber", "PhoneNumberDirect" ] + [ "LogMetrics", "pinpoint-sms-successes" ] ], "view": "timeSeries", "stacked": true, @@ -52,7 +52,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "type": "metric", "properties": { "metrics": [ - [ "AWS/SNS", "NumberOfNotificationsFailed", "PhoneNumber", "PhoneNumberDirect", { "color": "#d62728" } ] + [ "LogMetrics", "pinpoint-sms-failures" ] ], "view": "timeSeries", "stacked": true, @@ -129,7 +129,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "region": "${var.region}", "stat": "p90", "period": 60, - "title": "p90 SNS request time in ms", + "title": "TODO CONVERT TO PINPOINT p90 SNS request time in ms", "annotations": { "horizontal": [ { @@ -192,7 +192,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "type": "metric", "properties": { "metrics": [ - [ "AWS/Lambda", "Invocations", "FunctionName", "sns-to-sqs-sms-callbacks" ], + [ "AWS/Lambda", "Invocations", "FunctionName", "pinpoint-to-sqs-sms-callbacks" ], [ ".", "Errors", ".", ".", { "color": "#d62728", "yAxis": "right" } ] ], "view": "timeSeries", @@ -200,7 +200,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "region": "${var.region}", "stat": "Sum", "period": 300, - "title": "Lambda invocations per 5m" + "title": "Pinpoint Callback Lambda invocations per 5m" } }, { @@ -241,20 +241,6 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "markdown": "\n## Message flow\nAfter an SMS has been sent by SNS, the delivery details are stored in CloudWatch Log groups:\n\n- [sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber](#logsV2:log-groups/log-group/sns$252F${var.region}$252F${var.account_id}$252FDirectPublishToPhoneNumber) for successful deliveries\n- [sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure](#logsV2:log-groups/log-group/sns$252F${var.region}$252F${var.account_id}$252FDirectPublishToPhoneNumber$252FFailure) for failures\n\nThe log groups are subscribed the Lambda function [sns-to-sqs-sms-callbacks](#/functions/sns-to-sqs-sms-callbacks?tab=configuration). This Lambda adds messages to the SQS queue `delivery-receipts` to trigger the Celery task in charge of updating notifications in the database, `process-sns-result`.\n\nSee the relevant [AWS documentation](https://docs.aws.amazon.com/sns/latest/dg/sms_stats_cloudwatch.html#sns-viewing-cloudwatch-logs) for these messages.\n" } }, - { - "height": 3, - "width": 24, - "y": 54, - "x": 0, - "type": "log", - "properties": { - "query": "SOURCE 'sns/us-west-2/${var.account_id}/DirectPublishToPhoneNumber/Failure' | fields @timestamp as Timestamp, notification.messageId as MessageID, status, delivery.destination as Destination, delivery.providerResponse as ProviderResponse\n| sort @timestamp desc\n| limit 20", - "region": "us-west-2", - "stacked": false, - "title": "SMS Errors Log / us-west-2", - "view": "table" - } - }, { "height": 3, "width": 24, @@ -262,11 +248,11 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "x": 0, "type": "log", "properties": { - "query": "SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure' | fields @timestamp as Timestamp, notification.messageId as MessageID, status, delivery.destination as Destination, delivery.providerResponse as ProviderResponse\n| sort @timestamp desc\n| limit 20", + "query": "SOURCE 'sns/${var.region}/${var.account_id}/PinpointDirectPublishToPhoneNumber/Failure' | fields @timestamp as Timestamp, notification.messageId as MessageID, status, delivery.destination as Destination, delivery.providerResponse as ProviderResponse\n| sort @timestamp desc\n| limit 20", "region": "${var.region}", "stacked": false, "view": "table", - "title": "SMS Errors Log / ${var.region}" + "title": "Pinpoint SMS Errors Log / ${var.region}" } }, { @@ -286,7 +272,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "region": "${var.region}", "stat": "p90", "period": 60, - "title": "p90 SMS sending time in seconds", + "title": "TODO: Convert to Pinpoint p90 SMS sending time in seconds", "annotations": { "horizontal": [ { @@ -320,7 +306,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "x": 0, "type": "log", "properties": { - "query": "SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure' | SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber' | stats avg(delivery.dwellTimeMsUntilDeviceAck / 1000 / 60) as Avg_carrier_time_minutes, count(*) as Number by delivery.phoneCarrier as Carrier", + "query": "SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure' | SOURCE 'sns/${var.region}/${var.account_id}/PinpointDirectPublishToPhoneNumber' | stats avg(delivery.dwellTimeMsUntilDeviceAck / 1000 / 60) as Avg_carrier_time_minutes, count(*) as Number by delivery.phoneCarrier as Carrier", "region": "${var.region}", "title": "Carrier Dwell Times", "view": "table" @@ -333,7 +319,7 @@ resource "aws_cloudwatch_dashboard" "pinpoint" { "x": 0, "type": "log", "properties": { - "query": "SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber' | SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure' | stats avg(delivery.dwellTimeMsUntilDeviceAck / 1000 / 60) as Avg_carrier_time_minutes by bin(30s)", + "query": "SOURCE 'sns/${var.region}/${var.account_id}/PinpointDirectPublishToPhoneNumber' | SOURCE 'sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure' | stats avg(delivery.dwellTimeMsUntilDeviceAck / 1000 / 60) as Avg_carrier_time_minutes by bin(30s)", "region": "${var.region}", "stacked": false, "view": "timeSeries", diff --git a/aws/pinpoint_to_sqs_sms_callbacks/variables.tf b/aws/pinpoint_to_sqs_sms_callbacks/variables.tf index 3bcd541d7..2d42cc813 100644 --- a/aws/pinpoint_to_sqs_sms_callbacks/variables.tf +++ b/aws/pinpoint_to_sqs_sms_callbacks/variables.tf @@ -51,3 +51,38 @@ variable "bootstrap" { type = bool default = false } + +variable "celery_queue_prefix" { + description = "The prefix for the celery queue" + type = string +} + +variable "sqs_send_sms_high_queue_delay_warning_arn" { + description = "ARN for the corresponding alarm" + type = string +} + +variable "sqs_send_sms_high_queue_delay_critical_arn" { + description = "ARN for the corresponding alarm" + type = string +} + +variable "sqs_send_sms_medium_queue_delay_warning_arn" { + description = "ARN for the corresponding alarm" + type = string +} + +variable "sqs_send_sms_medium_queue_delay_critical_arn" { + description = "ARN for the corresponding alarm" + type = string +} + +variable "sqs_send_sms_low_queue_delay_warning_arn" { + description = "ARN for the corresponding alarm" + type = string +} + +variable "sqs_send_sms_low_queue_delay_critical_arn" { + description = "ARN for the corresponding alarm" + type = string +} diff --git a/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl b/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl index 51cd1a8cc..4e0aac8fe 100644 --- a/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl +++ b/env/dev/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl @@ -15,6 +15,13 @@ dependency "common" { sns_alert_ok_arn = "" sqs_deliver_receipts_queue_arn = "" sns_monthly_spend_limit = 1 + celery_queue_prefix = "" + sqs_send_sms_high_queue_delay_warning_arn = "" + sqs_send_sms_high_queue_delay_critical_arn = "" + sqs_send_sms_medium_queue_delay_warning_arn = "" + sqs_send_sms_medium_queue_delay_critical_arn = "" + sqs_send_sms_low_queue_delay_warning_arn = "" + sqs_send_sms_low_queue_delay_critical_arn = "" } } @@ -41,6 +48,13 @@ inputs = { pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url pinpoint_to_sqs_sms_callbacks_ecr_arn = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn sms_monthly_spend_limit = dependency.common.outputs.sns_monthly_spend_limit + celery_queue_prefix = dependency.common.outputs.celery_queue_prefix + sqs_send_sms_high_queue_delay_warning_arn = dependency.common.outputs.sqs_send_sms_high_queue_delay_warning_arn + sqs_send_sms_high_queue_delay_critical_arn = dependency.common.outputs.sqs_send_sms_high_queue_delay_critical_arn + sqs_send_sms_medium_queue_delay_warning_arn = dependency.common.outputs.sqs_send_sms_medium_queue_delay_warning_arn + sqs_send_sms_medium_queue_delay_critical_arn = dependency.common.outputs.sqs_send_sms_medium_queue_delay_critical_arn + sqs_send_sms_low_queue_delay_warning_arn = dependency.common.outputs.sqs_send_sms_low_queue_delay_warning_arn + sqs_send_sms_low_queue_delay_critical_arn = dependency.common.outputs.sqs_send_sms_low_queue_delay_critical_arn } terraform { diff --git a/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl b/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl index c0fddf361..0fe010cb8 100644 --- a/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl +++ b/env/staging/pinpoint_to_sqs_sms_callbacks/terragrunt.hcl @@ -15,6 +15,13 @@ dependency "common" { sns_alert_ok_arn = "" sqs_deliver_receipts_queue_arn = "" sns_monthly_spend_limit = 1 + celery_queue_prefix = "" + sqs_send_sms_high_queue_delay_warning_arn = "" + sqs_send_sms_high_queue_delay_critical_arn = "" + sqs_send_sms_medium_queue_delay_warning_arn = "" + sqs_send_sms_medium_queue_delay_critical_arn = "" + sqs_send_sms_low_queue_delay_warning_arn = "" + sqs_send_sms_low_queue_delay_critical_arn = "" } } @@ -41,6 +48,13 @@ inputs = { pinpoint_to_sqs_sms_callbacks_ecr_repository_url = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_repository_url pinpoint_to_sqs_sms_callbacks_ecr_arn = dependency.ecr.outputs.pinpoint_to_sqs_sms_callbacks_ecr_arn sms_monthly_spend_limit = dependency.common.outputs.sns_monthly_spend_limit + celery_queue_prefix = dependency.common.outputs.celery_queue_prefix + sqs_send_sms_high_queue_delay_warning_arn = dependency.common.outputs.sqs_send_sms_high_queue_delay_warning_arn + sqs_send_sms_high_queue_delay_critical_arn = dependency.common.outputs.sqs_send_sms_high_queue_delay_critical_arn + sqs_send_sms_medium_queue_delay_warning_arn = dependency.common.outputs.sqs_send_sms_medium_queue_delay_warning_arn + sqs_send_sms_medium_queue_delay_critical_arn = dependency.common.outputs.sqs_send_sms_medium_queue_delay_critical_arn + sqs_send_sms_low_queue_delay_warning_arn = dependency.common.outputs.sqs_send_sms_low_queue_delay_warning_arn + sqs_send_sms_low_queue_delay_critical_arn = dependency.common.outputs.sqs_send_sms_low_queue_delay_critical_arn } terraform {