Skip to content

Commit

Permalink
Consolidated queries in Celery folder for both staging and prod (#1035)
Browse files Browse the repository at this point in the history
* Consolidated queries in Celery folder for both staging and prod

* Removed/renamed duplicates

* Sorted the CW LI celery queries

---------

Co-authored-by: Ben Larabie <[email protected]>
  • Loading branch information
jimleroyer and ben851 authored Nov 16, 2023
1 parent 0133480 commit e98ae72
Showing 1 changed file with 178 additions and 7 deletions.
185 changes: 178 additions & 7 deletions aws/eks/cloudwatch_queries.tf
Original file line number Diff line number Diff line change
@@ -1,23 +1,42 @@
resource "aws_cloudwatch_query_definition" "admin-api-50X-errors" {
################################ CELERY FOLDER ################################

resource "aws_cloudwatch_query_definition" "celery-errors" {
count = var.cloudwatch_enabled ? 1 : 0
name = "ADMIN & API - 50X errors"
name = "Celery errors"

log_group_names = [
local.eks_application_log_group
]

query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /admin|api/
| filter @message like /HTTP\/\d+\.\d+\\" 50\d/
| filter kubernetes.container_name like /^celery/
| filter @message like /ERROR\/.*Worker/
| sort @timestamp desc
| limit 20
QUERY
}

resource "aws_cloudwatch_query_definition" "celery-errors" {
resource "aws_cloudwatch_query_definition" "celery-filter-by-job" {
count = var.cloudwatch_enabled ? 1 : 0
name = "Celery errors"
name = "Filter by job"

log_group_names = [
local.eks_application_log_group
]

query_string = <<QUERY
fields @timestamp, log, kubernetes.labels.app as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.labels.app like /^celery/
| filter @message like /0d58e195-d6ae-4fe3-aa73-064ff106972b/
| sort @timestamp desc
| limit 20
QUERY
}

resource "aws_cloudwatch_query_definition" "celery-filter-by-notification-id" {
count = var.cloudwatch_enabled ? 1 : 0
name = "Filter by notification id"

log_group_names = [
local.eks_application_log_group
Expand All @@ -26,7 +45,159 @@ resource "aws_cloudwatch_query_definition" "celery-errors" {
query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /^celery/
| filter @message like /ERROR\/.*Worker/
| filter @message like /notification_id/
| sort @timestamp desc
| limit 20
QUERY
}

# Saved Logs Insights query "Celery Memory Usage By Pod".
# Averages pod_memory_utilization_over_pod_limit in 30-second bins for the
# pod matched by the pod_name filter. Only created when CloudWatch is enabled.
# NOTE(review): "<pod-name>" looks like a placeholder to be replaced in the
# console before running — confirm.
# NOTE(review): pod_memory_utilization_over_pod_limit looks like a Container
# Insights performance-log field — confirm the application log group carries it.
resource "aws_cloudwatch_query_definition" "celery-memory-usage-by-pod" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Celery Memory Usage By Pod"
  log_group_names = [local.eks_application_log_group]

  query_string = <<QUERY
fields @timestamp, @message, @logStream, @log, PodName, kubernetes.pod_name, pod_memory_usage
| filter kubernetes.pod_name = "<pod-name>"
| sort @timestamp desc
| stats avg(pod_memory_utilization_over_pod_limit) by bin(30s)
QUERY
}

# Saved Logs Insights query "Celery Pods over CPU Limit".
# Averages pod_cpu_utilization_over_pod_limit in 30-second bins for the pod
# matched by the pod_name filter. Only created when CloudWatch is enabled.
# The fields list selects the CPU metric that the stats stage aggregates; the
# stats output itself does not depend on the fields list.
# NOTE(review): "<pod-name>" looks like a placeholder to be replaced in the
# console before running — confirm.
# NOTE(review): pod_cpu_utilization_over_pod_limit looks like a Container
# Insights performance-log field — confirm the application log group carries it.
resource "aws_cloudwatch_query_definition" "celery-pods-over-cpu-limit" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Celery Pods over CPU Limit"
  log_group_names = [local.eks_application_log_group]

  query_string = <<QUERY
fields @timestamp, @message, @logStream, @log, PodName, kubernetes.pod_name, pod_cpu_utilization_over_pod_limit
| filter kubernetes.pod_name = "<pod-name>"
| sort @timestamp desc
| stats avg(pod_cpu_utilization_over_pod_limit) by bin(30s)
QUERY
}

# Saved Logs Insights query "Celery queues".
# Returns the 20 most recent log events from pods whose app label starts with
# "celery" and whose message contains "queue for delivery".
# Only created when CloudWatch is enabled.
resource "aws_cloudwatch_query_definition" "celery-queues" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Celery queues"
  log_group_names = [local.eks_application_log_group]

  query_string = <<QUERY
fields @timestamp, log, kubernetes.labels.app as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.labels.app like /^celery/
| filter @message like /queue for delivery/
| sort @timestamp desc
| limit 20
QUERY
}

# Saved Logs Insights query "Celery Starts".
# Counts log events from containers whose name starts with "celery" and whose
# message contains "Notify config".
# Only created when CloudWatch is enabled.
# NOTE(review): presumably "Notify config" is logged once per worker start —
# confirm against the application's startup logging.
resource "aws_cloudwatch_query_definition" "celery-starts" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Celery Starts"
  log_group_names = [local.eks_application_log_group]

  # count(*) is the documented call form of the aggregation function.
  query_string = <<QUERY
fields @timestamp, log, kubernetes.labels.app as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /^celery/
| filter @message like /Notify config/
| sort @timestamp desc
| stats count(*)
QUERY
}

# Saved Logs Insights query "Worker exited normally".
# Lists, newest first, log events from containers whose name starts with
# "celery" and whose message contains "Warm shutdown". No row limit is applied.
# Only created when CloudWatch is enabled.
resource "aws_cloudwatch_query_definition" "celery-worker-exited-normally" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Worker exited normally"
  log_group_names = [local.eks_application_log_group]

  query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /^celery/
| filter @message like /Warm shutdown/
| sort @timestamp desc
QUERY
}

# Saved Logs Insights query "Worker exited prematurely".
# Lists, newest first, log events from containers whose name starts with
# "celery" and whose message contains the WorkerLostError
# "Worker exited prematurely" task-handler error. No row limit is applied.
# Only created when CloudWatch is enabled.
resource "aws_cloudwatch_query_definition" "celery-worker-exited-prematurely" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Worker exited prematurely"
  log_group_names = [local.eks_application_log_group]

  query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /^celery/
| filter @message like /Task handler raised error: WorkerLostError\('Worker exited prematurely/
| sort @timestamp desc
QUERY
}

# Saved Logs Insights query "Worker exits, cold vs warm".
# For containers whose name starts with "celery", counts per 15-minute bin the
# events containing "Warm shutdown" (warm) and the events containing
# "Worker exited prematurely" (cold).
# Only created when CloudWatch is enabled.
resource "aws_cloudwatch_query_definition" "celery-worker-exits-cold-vs-warm" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Worker exits, cold vs warm"
  log_group_names = [local.eks_application_log_group]

  # Each parse stage uses a regex named capture on @message, so the extracted
  # field exists only on events that contain the phrase. count(field) then
  # counts just those events.
  query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /^celery/
| parse @message /(?<warm_exit>Warm shutdown)/
| parse @message /(?<cold_exit>Worker exited prematurely)/
| filter ispresent(warm_exit) or ispresent(cold_exit)
| stats count(warm_exit) as warm, count(cold_exit) as cold by bin(15m)
QUERY
}

# Saved Logs Insights query "Retry attempts by duration".
# For containers whose name starts with "celery", extracts the "<N>s" value
# following "Retry in" from each message and counts events per distinct value.
# Only created when CloudWatch is enabled.
# NOTE(review): the resource label spells "attemps"; it is left as-is here
# because renaming it changes the Terraform resource address.
resource "aws_cloudwatch_query_definition" "retry-attemps-by-duration" {
  count           = var.cloudwatch_enabled ? 1 : 0
  name            = "Retry attempts by duration"
  log_group_names = [local.eks_application_log_group]

  query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /^celery/
| filter @message like /Retry in/
| parse @message /Retry in (?<retry_duration>\d+s)/
| stats count() by retry_duration
QUERY
}

############################### NOT YET SORTED ################################

resource "aws_cloudwatch_query_definition" "admin-api-50X-errors" {
count = var.cloudwatch_enabled ? 1 : 0
name = "ADMIN & API - 50X errors"

log_group_names = [
local.eks_application_log_group
]

query_string = <<QUERY
fields @timestamp, log, kubernetes.container_name as app, kubernetes.pod_name as pod_name, @logStream
| filter kubernetes.container_name like /admin|api/
| filter @message like /HTTP\/\d+\.\d+\\" 50\d/
| sort @timestamp desc
| limit 20
QUERY
Expand Down

0 comments on commit e98ae72

Please sign in to comment.