Skip to content

Commit

Permalink
add secondary pagerduty variable to Indexer monitors (#71)
Browse files Browse the repository at this point in the history
  • Loading branch information
dydxwill authored Mar 4, 2024
1 parent 35adcf8 commit 175f299
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 11 deletions.
1 change: 1 addition & 0 deletions indexer/indexer_monitors.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ module "indexer_monitors" {
environment = var.environment
slack_channel = var.monitoring_slack_channel
pagerduty_tag = var.monitoring_pagerduty_tag
secondary_pagerduty_tag = var.secondary_monitoring_pagerduty_tag
ecs_cluster_name = var.full_node_name
msk_cluster_name = aws_msk_cluster.main.cluster_name
team = var.monitoring_team
Expand Down
6 changes: 6 additions & 0 deletions indexer/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,12 @@ variable "monitoring_pagerduty_tag" {
default = ""
}

# Optional second PagerDuty handle. The root configuration forwards this value
# to the indexer_monitors module as its secondary_pagerduty_tag input.
variable "secondary_monitoring_pagerduty_tag" {
  description = "PagerDuty tag to add to critical monitors. This will be in addition to monitoring_pagerduty_tag above. Should be prepended with @ such as '@pagerduty-indexer'"
  type        = string
  default     = ""
}

variable "monitoring_team" {
type = string
description = "Team tag to add to all monitors"
Expand Down
18 changes: 10 additions & 8 deletions modules/indexer_monitors/locals.tf
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
locals {
monitor_suffix_literal = "{{#is_alert}}\\n${var.pagerduty_tag}\\n{{/is_alert}}\\n\\n{{#is_recovery}}\\n${var.pagerduty_tag}\\n{{/is_recovery}}\\n\\n${var.slack_channel}"
monitor_suffix = "{{#is_alert}}\n${var.pagerduty_tag}\n{{/is_alert}}\n\n{{#is_recovery}}\n${var.pagerduty_tag}\n{{/is_recovery}}\n\n${var.slack_channel}"
wss_url = "wss://${var.url}/v4/ws"
https_url = "https://${var.url}/v4"
tick_frequency = 300 # 5 minutes
retry_interval = 3000 # 3 seconds in milliseconds
retry_count = 3 # 3 retries
snapshot_bucket_prefix = var.aws_account_id == "" ? var.environment : "${var.aws_account_id}-${var.environment}"
monitor_suffix_literal = "{{#is_alert}}\\n${var.pagerduty_tag}\\n{{/is_alert}}\\n\\n{{#is_recovery}}\\n${var.pagerduty_tag}\\n{{/is_recovery}}\\n\\n${var.slack_channel}"
monitor_suffix = "{{#is_alert}}\n${var.pagerduty_tag}\n{{/is_alert}}\n\n{{#is_recovery}}\n${var.pagerduty_tag}\n{{/is_recovery}}\n\n${var.slack_channel}"
critical_monitor_suffix_literal = "{{#is_alert}}\\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\\n{{/is_alert}}\\n\\n{{#is_recovery}}\\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\\n{{/is_recovery}}\\n\\n${var.slack_channel}"
critical_monitor_suffix = "{{#is_alert}}\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\n{{/is_alert}}\n\n{{#is_recovery}}\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\n{{/is_recovery}}\n\n${var.slack_channel}"
wss_url = "wss://${var.url}/v4/ws"
https_url = "https://${var.url}/v4"
tick_frequency = 300 # 5 minutes
retry_interval = 3000 # 3 seconds in milliseconds
retry_count = 3 # 3 retries
snapshot_bucket_prefix = var.aws_account_id == "" ? var.environment : "${var.aws_account_id}-${var.environment}"

api_http_synthetic_monitor_configurations = {
"height" : {
Expand Down
6 changes: 3 additions & 3 deletions modules/indexer_monitors/monitors.tf
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ resource "datadog_monitor_json" "last_processed_block_last_30min" {
"name": "[${var.environment}] Indexer Last processed block on Indexer is > 10 blocks behind latest block",
"type": "query alert",
"query": "min(last_30m):max:dydxprotocol.cometbft_consensus_height{env:${var.environment}} - max:ender.processing_block_height{env:${var.environment},service:indexer} > 10",
"message": "${local.monitor_suffix_literal}",
"message": "${local.critical_monitor_suffix_literal}",
"tags": [
"team:${var.team}",
"env:${var.env_tag}"
Expand Down Expand Up @@ -106,7 +106,7 @@ resource "datadog_monitor_json" "last_processed_block_last_10min" {
"name": "[${var.environment}] Indexer Last processed block on Indexer is > 100 blocks behind latest block",
"type": "query alert",
"query": "min(last_10m):max:dydxprotocol.cometbft_consensus_height{env:${var.environment}} - max:ender.processing_block_height{env:${var.environment},service:indexer} > 100",
"message": "${local.monitor_suffix_literal}",
"message": "${local.critical_monitor_suffix_literal}",
"tags": [
"team:${var.team}",
"env:${var.env_tag}"
Expand Down Expand Up @@ -137,7 +137,7 @@ resource "datadog_monitor_json" "on_chain_kafka_offset" {
"name": "[${var.environment}] Indexer High Kafka offset lag for on-chain messages",
"type": "query alert",
"query": "min(last_10m):avg:aws.kafka.max_offset_lag{topic:to-ender AND cluster_name IN (${var.msk_cluster_name}) AND consumer_group:ender} by {cluster_name} > 10",
"message": "Max. offset lag for the `to-ender` Kafka topic is > 10 meaning on-chain updates are delayed.\n\nResolution:\n- investigate why `ender` task running in ECS is not consuming from Kafka topic\n\n${local.monitor_suffix_literal}",
"message": "Max. offset lag for the `to-ender` Kafka topic is > 10 meaning on-chain updates are delayed.\n\nResolution:\n- investigate why `ender` task running in ECS is not consuming from Kafka topic\n\n${local.critical_monitor_suffix_literal}",
"tags": [
"team:${var.team}",
"env:${var.env_tag}"
Expand Down
5 changes: 5 additions & 0 deletions modules/indexer_monitors/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ variable "pagerduty_tag" {
description = "PagerDuty tag to add to all monitors. If \"\", then no PagerDuty tag will be used. Should be prepended with @ such as '@pagerduty-indexer'"
}

# Second PagerDuty handle for critical monitors. It is interpolated right after
# pagerduty_tag in the critical_monitor_suffix / critical_monitor_suffix_literal
# locals. Defaults to "" so that callers of this module which do not pass the
# input keep planning successfully instead of failing on a missing required
# variable.
variable "secondary_pagerduty_tag" {
  type        = string
  description = "PagerDuty tag to add to critical monitors. This will be in addition to pagerduty_tag above. If \"\", then no secondary PagerDuty tag will be used. Should be prepended with @ such as '@pagerduty-indexer'"
  default     = ""
}

variable "ecs_cluster_name" {
type = string
description = "ECS cluster name for the full node"
Expand Down

0 comments on commit 175f299

Please sign in to comment.