diff --git a/aws/common/cloudwatch_alarms_email.tf b/aws/common/cloudwatch_alarms_email.tf index 972caf199..38ac9e935 100644 --- a/aws/common/cloudwatch_alarms_email.tf +++ b/aws/common/cloudwatch_alarms_email.tf @@ -71,47 +71,6 @@ resource "aws_cloudwatch_metric_alarm" "ses-complaint-rate-critical" { treat_missing_data = "notBreaching" } -# TODO: delete this alarm and queue once we verify that we've transitioned to the new queues -resource "aws_cloudwatch_metric_alarm" "sqs-email-queue-delay-warning" { - count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "sqs-email-queue-delay-warning" - alarm_description = "ApproximateAgeOfOldestMessage in email queue >= 30 minutes for 5 minutes" - comparison_operator = "GreaterThanOrEqualToThreshold" - evaluation_periods = "5" - metric_name = "ApproximateAgeOfOldestMessage" - namespace = "AWS/SQS" - period = 60 - statistic = "Maximum" - threshold = 60 * 30 - treat_missing_data = "missing" - alarm_actions = [aws_sns_topic.notification-canada-ca-alert-warning.arn] - dimensions = { - QueueName = "${var.celery_queue_prefix}${var.sqs_email_queue_name}" - } -} - -# TODO: delete this alarm and queue once we verify that we've transitioned to the new queues -resource "aws_cloudwatch_metric_alarm" "sqs-email-queue-delay-critical" { - count = var.cloudwatch_enabled ? 
1 : 0 - alarm_name = "sqs-email-queue-delay-critical" - alarm_description = "ApproximateAgeOfOldestMessage in email queue >= 45 minutes for 5 minutes" - comparison_operator = "GreaterThanOrEqualToThreshold" - evaluation_periods = "5" - metric_name = "ApproximateAgeOfOldestMessage" - namespace = "AWS/SQS" - period = 60 - statistic = "Maximum" - threshold = 60 * 45 - treat_missing_data = "missing" - alarm_actions = [aws_sns_topic.notification-canada-ca-alert-critical.arn] - insufficient_data_actions = [aws_sns_topic.notification-canada-ca-alert-warning.arn] - ok_actions = [aws_sns_topic.notification-canada-ca-alert-ok.arn] - dimensions = { - QueueName = "${var.celery_queue_prefix}${var.sqs_email_queue_name}" - } -} - - resource "aws_cloudwatch_metric_alarm" "sqs-send-email-high-queue-delay-warning" { count = var.cloudwatch_enabled ? 1 : 0 alarm_name = "sqs-send-email-high-queue-delay-warning" diff --git a/aws/common/cloudwatch_alarms_sms.tf b/aws/common/cloudwatch_alarms_sms.tf index 0c5976dbb..319e3de40 100644 --- a/aws/common/cloudwatch_alarms_sms.tf +++ b/aws/common/cloudwatch_alarms_sms.tf @@ -253,44 +253,6 @@ resource "aws_cloudwatch_metric_alarm" "sns-sms-rate-exceeded-us-west-2-warning" treat_missing_data = "notBreaching" } -resource "aws_cloudwatch_metric_alarm" "sqs-sms-stuck-in-queue-warning" { - count = var.cloudwatch_enabled ? 
1 : 0 - alarm_name = "sqs-sms-stuck-in-queue-warning" - alarm_description = "ApproximateAgeOfOldestMessage in SMS queue is older than 10 minutes for 5 minutes" - comparison_operator = "GreaterThanOrEqualToThreshold" - evaluation_periods = "5" - metric_name = "ApproximateAgeOfOldestMessage" - namespace = "AWS/SQS" - period = 60 - statistic = "Average" - threshold = 60 * 10 - treat_missing_data = "missing" - alarm_actions = [aws_sns_topic.notification-canada-ca-alert-warning.arn] - dimensions = { - QueueName = "${var.celery_queue_prefix}${var.sqs_sms_queue_name}" - } -} - -resource "aws_cloudwatch_metric_alarm" "sqs-sms-stuck-in-queue-critical" { - count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "sqs-sms-stuck-in-queue-critical" - alarm_description = "ApproximateAgeOfOldestMessage in SMS queue is older than 15 minutes for 5 minutes" - comparison_operator = "GreaterThanOrEqualToThreshold" - evaluation_periods = "5" - metric_name = "ApproximateAgeOfOldestMessage" - namespace = "AWS/SQS" - period = 60 - statistic = "Average" - threshold = 60 * 15 - treat_missing_data = "missing" - alarm_actions = [aws_sns_topic.notification-canada-ca-alert-critical.arn] - insufficient_data_actions = [aws_sns_topic.notification-canada-ca-alert-warning.arn] - ok_actions = [aws_sns_topic.notification-canada-ca-alert-ok.arn] - dimensions = { - QueueName = "${var.celery_queue_prefix}${var.sqs_sms_queue_name}" - } -} - resource "aws_cloudwatch_metric_alarm" "sqs-send-sms-high-queue-delay-warning" { count = var.cloudwatch_enabled ? 
1 : 0 alarm_name = "sqs-send-sms-high-queue-delay-warning" diff --git a/aws/common/dashboards.tf b/aws/common/dashboards.tf index c3db9f591..18810d7b9 100644 --- a/aws/common/dashboards.tf +++ b/aws/common/dashboards.tf @@ -223,7 +223,6 @@ resource "aws_cloudwatch_dashboard" "emails" { "${aws_cloudwatch_metric_alarm.ses-bounce-rate-warning[0].arn}", "${aws_cloudwatch_metric_alarm.ses-complaint-rate-warning[0].arn}", "${aws_cloudwatch_metric_alarm.ses-complaint-rate-critical[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-email-queue-delay-critical[0].arn}", "${aws_cloudwatch_metric_alarm.no-emails-sent-5-minutes-critical[0].arn}", "${aws_cloudwatch_metric_alarm.no-emails-sent-5-minutes-warning[0].arn}" ] @@ -310,7 +309,7 @@ resource "aws_cloudwatch_dashboard" "emails" { "x": 18, "type": "text", "properties": { - "markdown": "\n# Sending emails\n\nEmails are sent with [SES](https://${var.region}.console.aws.amazon.com/ses/home?region=${var.region}#dashboard:).\n\nOur limits are:\n- 1,000,000 emails per 24 hour period\n- 100 emails/second\n\nEmails are sent by Celery through the `deliver_email` task through the [send-email-tasks](https://${var.region}.console.aws.amazon.com/sqs/v2/home?region=${var.region}#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-casend-email-tasks) queue.\n\n## Message flow\n\nAfter a notification has been created in the database, Celery sends the email to the provider using the deliver_email Celery task. This Celery task is assigned to the SQS queue eks-notification-canada-casend-email-tasks, unless a specific queue has been assigned to the queue (for example: eks-notification-canada-capriority-tasks for priority notifications or eks-notification-canada-cabulk-tasks through the API REST service). 
This task calls the AWS SES API to send a text message.\n\n## Delivery receipts\n\nReceipts from SES are dispatched to SNS -> [Lambda](https://${var.region}.console.aws.amazon.com/lambda/home?region=${var.region}#/functions/ses-to-sqs-email-callbacks) -> [SQS](https://${var.region}.console.aws.amazon.com/sqs/v2/home?region=${var.region}#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-cadelivery-receipts) in the `delivery-receipts` queue.\n\nA delay in this queue means that we are slow to process delivery receipts (delivered, bounce, complaints).\n" + "markdown": "\n# Sending emails\n\nEmails are sent with [SES](https://${var.region}.console.aws.amazon.com/ses/home?region=${var.region}#dashboard:).\n\nOur limits are:\n- 1,000,000 emails per 24 hour period\n- 100 emails/second\n\nEmails are sent by Celery through the `deliver_email` task through the [send-email-low](https://${var.region}.console.aws.amazon.com/sqs/v2/home?region=${var.region}#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-casend-email-low), [send-email-medium](https://${var.region}.console.aws.amazon.com/sqs/v2/home?region=${var.region}#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-casend-email-medium), or [send-email-high](https://${var.region}.console.aws.amazon.com/sqs/v2/home?region=${var.region}#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-casend-email-high) queues.\n\n## Message flow\n\nAfter a notification has been created in the database, Celery sends the email to the provider using the deliver_email Celery task. This Celery task is assigned to the send-email-low, send-email-medium, or send-email-high SQS queue depending on the email's priority. 
This task calls the AWS SES API to send an email.\n\n## Delivery receipts\n\nReceipts from SES are dispatched to SNS -> [Lambda](https://${var.region}.console.aws.amazon.com/lambda/home?region=${var.region}#/functions/ses-to-sqs-email-callbacks) -> [SQS](https://${var.region}.console.aws.amazon.com/sqs/v2/home?region=${var.region}#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-cadelivery-receipts) in the `delivery-receipts` queue.\n\nA delay in this queue means that we are slow to process delivery receipts (delivered, bounce, complaints).\n" } }, { @@ -388,9 +387,9 @@ resource "aws_cloudwatch_dashboard" "emails" { }, { "height": 6, - "width": 9, - "y": 42, - "x": 9, + "width": 8, + "y": 36, + "x": 8, "type": "metric", "properties": { "metrics": [ @@ -406,8 +405,8 @@ resource "aws_cloudwatch_dashboard" "emails" { }, { "height": 6, - "width": 9, - "y": 48, + "width": 8, + "y": 36, "x": 0, "type": "metric", "properties": { @@ -424,9 +423,9 @@ resource "aws_cloudwatch_dashboard" "emails" { }, { "height": 6, - "width": 9, - "y": 48, - "x": 9, + "width": 8, + "y": 36, + "x": 16, "type": "metric", "properties": { "metrics": [ @@ -450,60 +449,6 @@ resource "aws_cloudwatch_dashboard" "emails" { "markdown": "# Delivery Queues\n" } }, - { - "height": 6, - "width": 9, - "y": 42, - "x": 0, - "type": "metric", - "properties": { - "metrics": [ - [ "AWS/SQS", "ApproximateAgeOfOldestMessage", "QueueName", "eks-notification-canada-casend-email-tasks", { "color": "#1f77b4" } ] - ], - "view": "timeSeries", - "stacked": true, - "region": "${var.region}", - "stat": "Average", - "period": 60, - "title": "Average approximate age of oldest message in send-email-tasks" - } - }, - { - "height": 6, - "width": 9, - "y": 36, - "x": 0, - "type": "metric", - "properties": { - "metrics": [ - [ "AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", "eks-notification-canada-casend-email-tasks" ] - ], - "view": "timeSeries", - 
"stacked": false, - "region": "${var.region}", - "title": "Approximate number of messages in send-email-tasks", - "period": 60, - "stat": "Average" - } - }, - { - "height": 6, - "width": 9, - "y": 36, - "x": 9, - "type": "metric", - "properties": { - "metrics": [ - [ "AWS/SQS", "ApproximateAgeOfOldestMessage", "QueueName", "eks-notification-canada-casend-email-tasks", { "color": "#1f77b4" } ] - ], - "view": "timeSeries", - "stacked": true, - "region": "${var.region}", - "stat": "Average", - "period": 60, - "title": "Average approximate age of oldest message in send-email-tasks" - } - }, { "height": 6, "width": 8, @@ -703,8 +648,6 @@ resource "aws_cloudwatch_dashboard" "sms" { "alarms": [ "${aws_cloudwatch_metric_alarm.sns-sms-success-rate-canadian-numbers-critical[0].arn}", "${aws_cloudwatch_metric_alarm.sns-sms-success-rate-canadian-numbers-warning[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-sms-stuck-in-queue-warning[0].arn}", - "${aws_cloudwatch_metric_alarm.sqs-sms-stuck-in-queue-critical[0].arn}", "${aws_cloudwatch_metric_alarm.sqs-send-sms-high-queue-delay-warning[0].arn}", "${aws_cloudwatch_metric_alarm.sqs-send-sms-high-queue-delay-critical[0].arn}", "${aws_cloudwatch_metric_alarm.sqs-send-sms-medium-queue-delay-warning[0].arn}", @@ -900,7 +843,7 @@ resource "aws_cloudwatch_dashboard" "sms" { "x": 18, "type": "text", "properties": { - "markdown": "\n## Limits\n- SNS [maximum sending rate](https://docs.aws.amazon.com/general/latest/gr/sns.html#limits_sns): 20 SMS/second\n- [Spending limit](https://${var.region}.console.aws.amazon.com/sns/v3/home?region=${var.region}#/mobile/text-messaging) of 30,000 USD/month\n\n## Message flow\nAfter a notification has been created in the database, Celery sends the SMS to the provider using the `deliver_sms` Celery task. 
This Celery task is assigned to the SQS queue [eks-notification-canada-casend-sms-tasks](#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2Feks-notification-canada-casend-sms-tasks), unless a specific queue has been assigned to the queue (for example priority templates, SMS sent by the Notify service etc.). This task calls the SNS API to send a text message.\n\n## SNS IDs\nSNS keeps track of SMS with a `messageId`, the value of SNS' `messageId` is stored in the `Notification` object in the `reference` column.\n\n## Logging\nCelery tasks output multiple messages when processing tasks/calling the SNS API, take a look at the relevant Celery code to know more.\n\nAfter an SMS has been sent by SNS, the delivery details are stored in CloudWatch Log groups:\n\n- [sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber](#logsV2:log-groups/log-group/sns$252F${var.region}$252F${var.account_id}$252FDirectPublishToPhoneNumber) for successful deliveries\n- [sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure](#logsV2:log-groups/log-group/sns$252F${var.region}$252F${var.account_id}$252FDirectPublishToPhoneNumber$252FFailure) for failures\n\n## Phone numbers\n\nSMS sent in `${var.region}` use random phone numbers managed by AWS.\n\n### ⚠️ SNS in `us-west-2`\nIf a Notify service has an inbound number attached, SMS will be sent with SNS using a long code phone number ordered on Pinpoint in the `us-west-2` region. Statistics for this region and alarms are **not visible on this dashboard**.\n" + "markdown": "\n## Limits\n- SNS [maximum sending rate](https://docs.aws.amazon.com/general/latest/gr/sns.html#limits_sns): 20 SMS/second\n- [Spending limit](https://${var.region}.console.aws.amazon.com/sns/v3/home?region=${var.region}#/mobile/text-messaging) of 30,000 USD/month\n\n## Message flow\nAfter a notification has been created in the database, Celery sends the SMS to the provider using the `deliver_sms` Celery task. 
This Celery task is assigned to the SQS queue [${var.celery_queue_prefix}send-sms-low](#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2F${var.celery_queue_prefix}send-sms-low), [${var.celery_queue_prefix}send-sms-medium](#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2F${var.celery_queue_prefix}send-sms-medium), or [${var.celery_queue_prefix}send-sms-high](#/queues/https%3A%2F%2Fsqs.${var.region}.amazonaws.com%2F${var.account_id}%2F${var.celery_queue_prefix}send-sms-high) depending on the SMS priority. This task calls the SNS API to send a text message.\n\n## SNS IDs\nSNS keeps track of SMS with a `messageId`, the value of SNS' `messageId` is stored in the `Notification` object in the `reference` column.\n\n## Logging\nCelery tasks output multiple messages when processing tasks/calling the SNS API, take a look at the relevant Celery code to know more.\n\nAfter an SMS has been sent by SNS, the delivery details are stored in CloudWatch Log groups:\n\n- [sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber](#logsV2:log-groups/log-group/sns$252F${var.region}$252F${var.account_id}$252FDirectPublishToPhoneNumber) for successful deliveries\n- [sns/${var.region}/${var.account_id}/DirectPublishToPhoneNumber/Failure](#logsV2:log-groups/log-group/sns$252F${var.region}$252F${var.account_id}$252FDirectPublishToPhoneNumber$252FFailure) for failures\n\n## Phone numbers\n\nSMS sent in `${var.region}` use random phone numbers managed by AWS.\n\n### ⚠️ SNS in `us-west-2`\nIf a Notify service has an inbound number attached, SMS will be sent with SNS using a long code phone number ordered on Pinpoint in the `us-west-2` region. 
Statistics for this region and alarms are **not visible on this dashboard**.\n" } }, { diff --git a/aws/common/variables.tf b/aws/common/variables.tf index 278a0f4a4..5a1a373ea 100644 --- a/aws/common/variables.tf +++ b/aws/common/variables.tf @@ -68,14 +68,6 @@ variable "sqs_visibility_timeout_priority_high" { default = 26 } -# TODO: delete this variable once we verify that we've transitioned to the new queues -variable "sqs_email_queue_name" { - type = string - # See QueueNames in - # https://github.com/cds-snc/notification-api/blob/master/app/config.py - default = "send-email-tasks" -} - variable "sqs_send_email_high_queue_name" { type = string # See QueueNames in @@ -97,14 +89,6 @@ variable "sqs_send_email_low_queue_name" { default = "send-email-low" } -# TODO: delete this variable once we verify that we've transitioned to the new queues -variable "sqs_sms_queue_name" { - type = string - # See QueueNames in - # https://github.com/cds-snc/notification-api/blob/master/app/config.py - default = "send-sms-tasks" -} - variable "sqs_send_sms_high_queue_name" { type = string # See QueueNames in @@ -313,4 +297,4 @@ variable "budget_sre_bot_webhook" { description = "Slack webhook used to post budget alerts to the SRE bot" type = string sensitive = true -} \ No newline at end of file +} diff --git a/aws/eks/dashboards.tf b/aws/eks/dashboards.tf index 128c28e52..fbb14b8b2 100644 --- a/aws/eks/dashboards.tf +++ b/aws/eks/dashboards.tf @@ -158,7 +158,5 @@ resource "aws_cloudwatch_dashboard" "notify_system" { "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:logs-10-500-error-5-minutes-critical-ses_to_sqs_email_callbacks-500-errors-api", "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:lambda-ses-delivery-receipts-errors-critical", - "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:sqs-sms-stuck-in-queue-critical", - "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:sqs-email-queue-delay-critical", 
"arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:sns-sms-success-rate-canadian-numbers-critical", "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:ddos-detected-load-balancer-critical", "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:inflights-not-being-processed-critical", diff --git a/aws/eks/variables.tf b/aws/eks/variables.tf index e8dd53dd7..f6e585dfc 100644 --- a/aws/eks/variables.tf +++ b/aws/eks/variables.tf @@ -206,14 +206,6 @@ variable "sqs_send_email_low_queue_name" { default = "send-email-low" } -# TODO: delete this variable once we verify that we've transitioned to the new queues -variable "sqs_sms_queue_name" { - type = string - # See QueueNames in - # https://github.com/cds-snc/notification-api/blob/master/app/config.py - default = "send-sms-tasks" -} - variable "sqs_send_sms_high_queue_name" { type = string # See QueueNames in