Skip to content

Commit

Permalink
fix: opsgenie integration healthcheck (#379)
Browse files Browse the repository at this point in the history
Update to an API endpoint that the bot's Team API key can access.

Move the API call into the try block so that a request failure is
properly identified as a failed healthcheck.

Make the alarms trigger for all `ERROR`, `Exception` and `WARNING` strings.
  • Loading branch information
patheard authored Jan 24, 2024
1 parent 382cad2 commit 50274ae
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 10 deletions.
10 changes: 5 additions & 5 deletions app/integrations/opsgenie.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ def create_alert(description):
def healthcheck():
"""Check if the bot can interact with the Opsgenie API."""
healthy = False
content = api_get_request(
"https://api.opsgenie.com/v2/account",
{"name": "GenieKey", "token": OPSGENIE_KEY},
)
try:
content = api_get_request(
"https://api.opsgenie.com/v1/services",
{"name": "GenieKey", "token": OPSGENIE_KEY},
)
result = json.loads(content)
logging.info(f"OpsGenie healthcheck result: {result}")
healthy = "data" in result
logging.info(f"OpsGenie healthcheck result: {result}")
except Exception as error:
logging.error(f"OpsGenie healthcheck failed: {error}")
return healthy
Expand Down
10 changes: 7 additions & 3 deletions app/jobs/scheduled_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,14 @@ def scheduler_heartbeat():

def integration_healthchecks():
logging.info("Running integration healthchecks ...")
healthchecks = [opsgenie.healthcheck]
for healthcheck in healthchecks:
healthchecks = {
"opsgenie": opsgenie.healthcheck,
}
for key, healthcheck in healthchecks.items():
if not healthcheck():
logging.error(f"Integration {healthcheck.__name__} is unhealthy 💀")
logging.error(f"Integration {key} is unhealthy 💀")
else:
logging.info(f"Integration {key} is healthy 🌈")


def run_continuously(interval=1):
Expand Down
4 changes: 2 additions & 2 deletions terraform/alarms.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
resource "aws_cloudwatch_log_metric_filter" "sre_bot_error" {
name = local.error_logged
pattern = "\"ERROR:slack_bolt.App\""
pattern = "?ERROR ?Exception"
log_group_name = local.api_cloudwatch_log_group

metric_transformation {
Expand Down Expand Up @@ -29,7 +29,7 @@ resource "aws_cloudwatch_metric_alarm" "sre_bot_error" {

resource "aws_cloudwatch_log_metric_filter" "sre_bot_warning" {
name = local.warning_logged
pattern = "\"WARNING:slack_bolt.App\""
pattern = "WARNING"
log_group_name = local.api_cloudwatch_log_group

metric_transformation {
Expand Down

0 comments on commit 50274ae

Please sign in to comment.