Skip to content

Commit

Permalink
Adding alarms to the SRE Bot so that we can get alerting when somethi…
Browse files Browse the repository at this point in the history
…ng is wrong (#333)
  • Loading branch information
sylviamclaughlin authored Dec 11, 2023
1 parent 69f238a commit ca67123
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .github/workflows/tf_apply.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ env:
TERRAFORM_VERSION: 1.3.3
TERRAGRUNT_VERSION: 0.31.1
TF_VAR_google_oauth_pickle_string: "${{ secrets.GOOGLE_PICKLE_STRING }}"
TF_VAR_slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}


permissions:
id-token: write
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/tf_plan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ env:
TERRAFORM_VERSION: 1.3.3
TERRAGRUNT_VERSION: 0.31.1
TF_VAR_google_oauth_pickle_string: "${{ secrets.GOOGLE_PICKLE_STRING }}"
TF_VAR_slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}


permissions:
id-token: write
Expand Down
57 changes: 57 additions & 0 deletions terraform/alarms.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Turns SRE Bot log events that contain "ERROR" or "Error" into a custom
# CloudWatch metric so they can be alarmed on.
resource "aws_cloudwatch_log_metric_filter" "sre_bot_error" {
  log_group_name = local.api_cloudwatch_log_group
  name           = local.error_logged

  # Each term is prefixed with "?", so a log event matches when ANY of the
  # terms is present (OR semantics in CloudWatch filter-pattern syntax).
  pattern = "?ERROR ?Error"

  metric_transformation {
    namespace = local.error_namespace
    name      = local.error_logged
    value     = "1" # every matching log event contributes 1 to the metric
  }
}

# Alarm on the error metric: fires when the number of error log events summed
# over one 60-second period reaches var.error_threshold.
resource "aws_cloudwatch_metric_alarm" "sre_bot_error" {
  alarm_name        = "SRE Bot Errors"
  alarm_description = "Errors logged by the SRE Bot"

  # Metric under watch. Read from the metric filter so the alarm always
  # follows whatever name/namespace the filter publishes under.
  namespace   = aws_cloudwatch_log_metric_filter.sre_bot_error.metric_transformation[0].namespace
  metric_name = aws_cloudwatch_log_metric_filter.sre_bot_error.metric_transformation[0].name

  # Evaluation: Sum of one 60-second period compared against the threshold.
  statistic           = "Sum"
  period              = "60"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.error_threshold

  # Periods without any datapoint are treated as not breaching.
  treat_missing_data = "notBreaching"

  # Transitions to ALARM and back to OK are both published to the same topic.
  alarm_actions = [aws_sns_topic.cloudwatch_warning.arn]
  ok_actions    = [aws_sns_topic.cloudwatch_warning.arn]
}

# Turns SRE Bot log events that contain "WARNING" or "Warning" into a custom
# CloudWatch metric so they can be alarmed on.
resource "aws_cloudwatch_log_metric_filter" "sre_bot_warning" {
  log_group_name = local.api_cloudwatch_log_group
  name           = local.warning_logged

  # Each term is prefixed with "?", so a log event matches when ANY of the
  # terms is present (OR semantics in CloudWatch filter-pattern syntax).
  pattern = "?WARNING ?Warning"

  metric_transformation {
    # The warning metric is published in the same namespace local as the
    # error metric (local.error_namespace).
    namespace = local.error_namespace
    name      = local.warning_logged
    value     = "1" # every matching log event contributes 1 to the metric
  }
}

# Alarm on the warning metric: fires when the number of warning log events
# summed over one 60-second period reaches var.warning_threshold.
resource "aws_cloudwatch_metric_alarm" "sre_bot_warning" {
  alarm_name        = "SRE Bot Warnings"
  alarm_description = "Warnings logged by the SRE Bot"

  # Metric under watch. Read from the metric filter so the alarm always
  # follows whatever name/namespace the filter publishes under.
  namespace   = aws_cloudwatch_log_metric_filter.sre_bot_warning.metric_transformation[0].namespace
  metric_name = aws_cloudwatch_log_metric_filter.sre_bot_warning.metric_transformation[0].name

  # Evaluation: Sum of one 60-second period compared against the threshold.
  statistic           = "Sum"
  period              = "60"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.warning_threshold

  # Periods without any datapoint are treated as not breaching.
  treat_missing_data = "notBreaching"

  # Transitions to ALARM and back to OK are both published to the same topic.
  alarm_actions = [aws_sns_topic.cloudwatch_warning.arn]
  ok_actions    = [aws_sns_topic.cloudwatch_warning.arn]
}
6 changes: 6 additions & 0 deletions terraform/local.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Shared constants for the SRE Bot alarms and saved log queries.
locals {
  # CloudWatch log group read by the metric filters and the saved queries.
  api_cloudwatch_log_group = "/ecs/sre-bot-app"

  # Namespace under which the custom metrics are published.
  error_namespace = "SREBot"

  # Names used for both the metric filters and the metrics they emit.
  error_logged   = "SREBotErrorLogged"
  warning_logged = "SREBotWarningLogged"
}
29 changes: 29 additions & 0 deletions terraform/queries.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Saved Logs Insights query: the 20 most recent SRE Bot log lines whose
# message mentions an error or a failure.
resource "aws_cloudwatch_query_definition" "api_errors" {
  name            = "SRE Bot Errors"
  log_group_names = [local.api_cloudwatch_log_group]

  # The (?i) flag makes the regular expression case-insensitive.
  query_string = <<-QUERY
    fields @timestamp, @message, @logStream
    | filter @message like /(?i)ERROR|FAILED/
    | sort @timestamp desc
    | limit 20
  QUERY
}

# Saved Logs Insights query: the 20 most recent SRE Bot log lines whose
# message mentions a warning.
resource "aws_cloudwatch_query_definition" "api_warnings" {
  name            = "SRE Bot Warnings"
  log_group_names = [local.api_cloudwatch_log_group]

  # Case-insensitive match, like the "SRE Bot Errors" query. The warning
  # metric filter counts both "WARNING" and "Warning", so a case-sensitive
  # /WARNING/ would hide some of the very lines that raised the alarm.
  query_string = <<-QUERY
    fields @timestamp, @message, @logStream
    | filter @message like /(?i)WARNING/
    | sort @timestamp desc
    | limit 20
  QUERY
}
17 changes: 17 additions & 0 deletions terraform/sns.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# SNS: topic & subscription
#
# Topic the SRE Bot CloudWatch alarms publish their ALARM/OK notifications to.
resource "aws_sns_topic" "cloudwatch_warning" {
  name = "sre-bot-cloudwatch-alarms-warning"

  tags = {
    Terraform  = true
    CostCentre = var.billing_code
  }
}

# Forwards every message published on the alarm topic to the webhook URL
# supplied in var.slack_webhook_url, using an HTTPS subscription.
resource "aws_sns_topic_subscription" "alert_warning" {
  protocol  = "https"
  endpoint  = var.slack_webhook_url
  topic_arn = aws_sns_topic.cloudwatch_warning.arn
}
20 changes: 19 additions & 1 deletion terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,22 @@ variable "google_oauth_pickle_string" {
description = "(Required) The Google OAuth pickle string"
type = string
sensitive = true
}
}

# Number of ERROR log events within one alarm period at which the
# "SRE Bot Errors" alarm fires.
# Declared as a number so a non-numeric value is rejected when variables are
# validated; numeric strings such as "1" are still accepted and converted.
variable "error_threshold" {
  description = "CloudWatch alarm threshold for the SRE Bot ERROR logs"
  type        = number
  default     = 1
}

# Number of WARNING log events within one alarm period at which the
# "SRE Bot Warnings" alarm fires.
# Declared as a number so a non-numeric value is rejected when variables are
# validated; numeric strings such as "10" are still accepted and converted.
variable "warning_threshold" {
  description = "CloudWatch alarm threshold for the SRE Bot WARNING logs"
  type        = number
  default     = 10
}

# Webhook endpoint that receives the alarm notifications. It has no default,
# so a value must be supplied; the CI workflows pass it through the
# TF_VAR_slack_webhook_url environment variable.
variable "slack_webhook_url" {
  type        = string
  sensitive   = true # keeps the URL out of plan/apply output
  description = "The URL of the Slack webhook."
}

0 comments on commit ca67123

Please sign in to comment.