diff --git a/aws-observability-terraform/app-modules/alb/app.tf b/aws-observability-terraform/app-modules/alb/app.tf index 73c62ee5..29465094 100644 --- a/aws-observability-terraform/app-modules/alb/app.tf +++ b/aws-observability-terraform/app-modules/alb/app.tf @@ -124,6 +124,74 @@ module "alb_module" { connection_notifications = var.connection_notifications email_notifications = var.email_notifications }, + "AWSApplicationLoadBalancerDeletionAlert" = { + monitor_name = "AWS Application Load Balancer - Deletion Alert" + monitor_description = "This alert fires when we detect greater than or equal to 2 application load balancers are deleted over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"errorCode\" \"2015-12-01\" | json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop | where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" and namespace matches \"aws/applicationelb\" | where event_name matches \"DeleteLoadBalancer\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 2, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 2, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, + "AWSApplicationLoadBalancerTargetsDeregistered" = { + monitor_name = "AWS Application Load Balancer - 
Targets Deregistered" + monitor_description = "This alert fires when we detect greater than or equal to 1 target is de-registered over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"errorCode\" \"2015-12-01\" | json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop | where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" | where namespace matches \"aws/applicationelb\" and event_name=\"DeregisterTargets\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 1, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 1, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, "AWSApplicationLoadBalancerHigh5XXErrors" = { monitor_name = "AWS Application Load Balancer - High 5XX Errors" monitor_description = "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes." 
diff --git a/aws-observability-terraform/app-modules/elb/app.tf b/aws-observability-terraform/app-modules/elb/app.tf index 51b23f08..1a2dd3aa 100644 --- a/aws-observability-terraform/app-modules/elb/app.tf +++ b/aws-observability-terraform/app-modules/elb/app.tf @@ -122,6 +122,74 @@ module "classic_elb_module" { connection_notifications = var.connection_notifications email_notifications = var.email_notifications }, + "AWSClassicLoadBalancerDeletionAlert" = { + monitor_name = "AWS Classic Load Balancer - Deletion Alert" + monitor_description = "This alert fires when we detect greater than or equal to 2 classic load balancers are deleted over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* namespace=aws/elb \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"\"apiVersion\":\"2012-06-01\"\" | json \"eventSource\", \"eventName\" as event_source, event_name nodrop | where event_source = \"elasticloadbalancing.amazonaws.com\" | where event_name matches \"DeleteLoadBalancer\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 2, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 2, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, + "AWSClassicLoadBalancerTargetsDeregistered" = { + monitor_name = "AWS Classic Load Balancer - Targets Deregistered" + monitor_description = "This alert fires when we detect greater than or equal to 1 
target is de-registered over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* namespace=aws/elb \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"\"apiVersion\":\"2012-06-01\"\" \n| json \"eventSource\", \"eventName\" as event_source, event_name nodrop \n| where event_source = \"elasticloadbalancing.amazonaws.com\" \n| where event_name matches \"DeregisterInstancesFromLoadBalancer\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 1, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 1, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, "AWSClassicLoadBalancerHigh5XXErrors" = { monitor_name = "AWS Classic Load Balancer - High 5XX Errors" monitor_description = "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes." 
diff --git a/aws-observability-terraform/app-modules/nlb/app.tf b/aws-observability-terraform/app-modules/nlb/app.tf index d93e9fc3..9faea8f0 100644 --- a/aws-observability-terraform/app-modules/nlb/app.tf +++ b/aws-observability-terraform/app-modules/nlb/app.tf @@ -91,6 +91,74 @@ module "nlb_module" { connection_notifications = var.connection_notifications email_notifications = var.email_notifications }, + "AWSNetworkLoadBalancerDeletionAlert" = { + monitor_name = "AWS Network Load Balancer - Deletion Alert" + monitor_description = "This alert fires when we detect greater than or equal to 2 network load balancers are deleted over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"errorCode\" \"2015-12-01\" | json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop | where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" and namespace matches \"aws/networkelb\" | where event_name matches \"DeleteLoadBalancer\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 2, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 2, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, + "AWSNetworkLoadBalancerTargetsDeregistered" = { + monitor_name = "AWS Network Load Balancer - Targets Deregistered" + 
monitor_description = "This alert fires when we detect greater than or equal to 1 target is de-registered over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"errorCode\" \"2015-12-01\" | json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop | where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" and namespace matches \"aws/networkelb\" | where event_name matches \"DeregisterTargets\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 1, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 1, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, "AWSNetworkLoadBalancerHighUnhealthyHosts" = { monitor_name = "AWS Network Load Balancer - High Unhealthy Hosts" monitor_description = "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer" diff --git a/aws-observability-terraform/app-modules/rds/app.tf b/aws-observability-terraform/app-modules/rds/app.tf index ea738ed5..72e0a62b 100644 --- a/aws-observability-terraform/app-modules/rds/app.tf +++ b/aws-observability-terraform/app-modules/rds/app.tf @@ -588,6 +588,74 @@ module "rds_module" { group_notifications = var.group_notifications 
connection_notifications = var.connection_notifications email_notifications = var.email_notifications + }, + "RDSOracleLogsDBCrash" = { + monitor_name = "Amazon RDS - Oracle Logs - DB Crash" + monitor_description = "This alert fires when we detect greater than or equal to 1 Oracle DB crash over a 5 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*alert ORA-* | json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message | parse regex field=message \"(?<ora_error_code>ORA-\\d{5}): (?<ora_error_message>.*)\" multi | count" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 1, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 1, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, + "RDSOracleLogsFailedConnectionAttempts" = { + monitor_name = "Amazon RDS - Oracle Logs - Failed Connection Attempts" + monitor_description = "This alert fires when we detect greater than or equal to 25 failed connection attempts over a 5 minute time-period."
+ monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*listener establish \"PROGRAM=\" (\"SID=\" or \"SERVICE_NAME=\") and (\"\nTNS-\" or \"TNS-\") | json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message | parse regex field=message \"\\* \\(CONNECT_DATA[\\s\\S]+?\\* establish \\* \\S+ \\* (?<status>\\d+)\" nodrop | parse regex field=message \"CONNECT_DATA[\\s\\S]+?SERVICE_NAME=(?<service_name>[^)]*)\\)[\\s\\S]+establish\" nodrop | parse regex field=message \"CONNECT_DATA[\\s\\S]+?service_name=(?<service_name>[^)]*)\\)[\\s\\S]+establish\" nodrop | parse regex field=message \"CONNECT_DATA[\\s\\S]+?SID=(?<sid>[^)]*)\\)[\\s\\S]+establish\" nodrop | parse regex field=message \"CONNECT_DATA[\\s\\S]+?sid=(?<sid>[^)]*)\\)[\\s\\S]+establish\" nodrop | parse regex field=message \"CONNECT_DATA[\\s\\S]+?PROGRAM=(?<program>[^)]*)\\)[\\s\\S]+?HOST=(?<host>[^)]*)\\)[\\s\\S]+?USER=(?<user>[^)]*)\\)\" nodrop | parse field=message \"(ADDRESS=(PROTOCOL=*)(HOST=*)(PORT=*))\" as clientProtocol, clientHost, clientPort nodrop | parse regex field=message \"(?<tns_error_code>TNS-\\d{5}): (?<tns_error_message>.*)\" nodrop | where status != \"0\"" + } + triggers = [ + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "Critical", + threshold = 25, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "StaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 25, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ], + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + } } } \ No newline at end of file diff --git a/aws-observability/json/Alerts-App.json
b/aws-observability/json/Alerts-App.json index eac10853..5931cba2 100644 --- a/aws-observability/json/Alerts-App.json +++ b/aws-observability/json/Alerts-App.json @@ -1,11 +1,58 @@ { "name": "AWS Observability", - "description": "", + "description": "This folder contains all the monitors for AWS Observability solution.", "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "AWS SNS - Access from Highly Malicious Sources", - "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS API Gateway - High Server-Side Errors", + "description": "This alert fires where there are too many API requests (>5%) with server-side errors within 5 minutes. This can be caused by 5xx errors from your integration, permission issues, or other factors preventing successful invocation of the integration, such as the integration being throttled or deleted.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/apigateway (metric=5XX or metric=5xxError or metric=ExecutionError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 0.05, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 0.05, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + 
"playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "Amazon RDS PostgreSQL - Statement Timeouts", + "description": "This alert fires when we detect Postgres logs show statement timeouts", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -15,7 +62,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if 
(isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name\n" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql \"statement timeout\" | json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message | parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg | count by dbidentifier, database" } ], "triggers": [ @@ -49,8 +96,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Write Throttle", - "description": "This alert fires when we detect that the total write throttle events for a dynamodb table is high (>5) for a time interval of 5 minutes.", + "name": "AWS EC2 CW - High CPU Utilization", + "description": "This alert fires when the average CPU Utilization based on cloud watch metrics, within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -60,7 +107,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb tablename=* metric=WriteThrottleEvents statistic=sum | sum by account, region, namespace, tablename" + "query": "account=* region=* namespace=aws/ec2 metric=CPUUtilization instanceid=* statistic=average | avg by account, region, namespace, instanceid" } ], 
"triggers": [ @@ -69,7 +116,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 @@ -79,7 +126,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 @@ -88,7 +135,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -96,10 +143,10 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Access from highly malicious sources", - "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", + "monitorType": "Metrics", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -107,27 +154,29 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where 
event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 85, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -141,8 +190,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - Statement Timeouts", - "description": "This alert fires when we detect Postgres logs show statement timeouts", + "name": "AWS Network Load Balancer - Deletion Alert", + "description": "This alert 
fires when we detect greater than or equal to 2 network load balancers are deleted over a 5 minute time-period.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -152,7 +201,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql \"statement timeout\" | json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message | parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg | count by dbidentifier, database" + "query": "account=* region=* \"\\\"eventsource\\\":\\\"elasticloadbalancing.amazonaws.com\\\"\" \"errorCode\" \"2015-12-01\"\n| json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop\n| where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" and namespace matches \"aws/networkelb\"\n| where event_name matches \"DeleteLoadBalancer\"" } ], "triggers": [ @@ -161,21 +210,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "threshold": 2, + "thresholdType": "GreaterThanOrEqual", "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "threshold": 2, + "thresholdType": "LessThan", "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -186,44 +235,40 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "Amazon RDS - High Write Latency", + "description": "This
alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": 
"Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, + "timeRange": "-5m", + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, + "timeRange": "-5m", + "threshold": 5, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -237,50 +282,44 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). High read latency will affect the performance of your application.", + "name": "AWS Application Load Balancer - Deletion Alert", + "description": "This alert fires when we detect greater than or equal to 2 application load balancers are deleted over a 5 minute time-period.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* \"\\\"eventsource\\\":\\\"elasticloadbalancing.amazonaws.com\\\"\" \"errorCode\" \"2015-12-01\"\n| json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop\n| where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" 
and namespace matches \"aws/applicationelb\"\n| where event_name matches \"DeleteLoadBalancer\"" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 2, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 2, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -288,40 +327,38 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Notifications", - "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", + "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\"
as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, + "threshold": 0, "thresholdType": "GreaterThan", - "occurrenceType": "Always", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, + "threshold": 0, "thresholdType": "LessThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 2 + "field": null } ], "timeZone": null, @@ -335,8 +372,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Notification to DLQ Failure", - "description": "This alert fires when an SNS topic messages that couldn't be moved to a dead-letter queue.", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute 
interval is low (<= 50%). This indicates that a lower percentage of requests are served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -346,7 +383,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsFailedToRedriveToDlq statistic=sum | sum by account, region, namespace, topicname " + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -355,8 +392,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -365,8 +402,8 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -374,7 +411,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -382,10 +419,10 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS MSSQL - Database observing authentication failures from multiple client IPs", + "description": "This alert fires when we detect more than or equal to 10 client IPs attempting authentication failures on the database over a 15-minute period.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", 
"evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -393,39 +430,33 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(client_ip) as unique_client_ip by dbidentifier\n| 10 as threshold\n| where unique_client_ip >= threshold\n| sort by unique_client_ip\n| fields - threshold" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": "-15m", + "threshold": 1, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": "-15m", + "threshold": 1, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -433,8 +464,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + 
"name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -444,39 +475,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 1.5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 1.5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -484,41 +511,43 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High WAF Latency", - "description": "This alert fires when we detect the high WAF latency for the REST and WebSocket API requests in a stage within 5 minutes.", + "name": "Amazon ECS - High CPU Utilization", + 
"description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "1m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=* apiname=* apiid stage domainname requestId wafLatency\n| json \"wafLatency\", \"apiId\", \"stage\" as wafLatency, apiid, stage \n| pct(wafLatency, 90) as wafLatency90th by apiid,stage" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "GreaterThan", - "field": "wafLatency90th" + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "LessThanOrEqual", - "field": "wafLatency90th" + "threshold": 85, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -529,89 +558,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", + "name": "AWS SNS - Failed 
Notifications", + "description": "This alert fires where there are many failed notifications (>=5) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum \n| sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 2, "thresholdType": "GreaterThan", - "field": null + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": 
"MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 2, "thresholdType": "LessThanOrEqual", - "field": null - } - ], - "timeZone": null, - "notifications": [], - "isDisabled": true, - "groupNotifications": true, - "playbook": "", - "sloId": null, - "monitorTemplateId": null, - "tags": null, - "automatedPlaybookIds": [] - }, - { - "name": "Amazon RDS MySQL - Excessive Slow Query Detected", - "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over last 10 minutes.", - "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", - "alertName": null, - "runAs": null, - "notificationGroupFields": [], - "queries": [ - { - "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*SlowQuery \"User@Host\" \"Query_time\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse regex field=message \"(?# User@Host:[\\S\\s]+?SET timestamp=\\d+;[\\S\\s]+?;)\" multi\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*\\[(?\\S*?)\\]\\s*Id:\\s*(?\\d*)\" nodrop\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*(?\\S+)\\s\\[(?\\S*?)\\]\\s+Id:\\s*(?\\d+)\"\n| parse regex field=query_block \"# Query_time:\\s+(?[\\d.]*)\\s+Lock_time:\\s+(?[\\d.]*)\\s+Rows_sent:\\s+(?[\\d]*)\\s+Rows_examined:\\s+(?[\\d]*)\" nodrop\n| parse regex field=query_block \"SET timestamp=(?\\d*);\\n(?[\\s\\S]*);\" nodrop\n| parse regex field=sql_cmd \"[^a-zA-Z]*(?[a-zA-Z]+)\\s*\"\n| fields -query_block\n| num (query_time)\n| count as frequency, sum(query_time) as total_time, min(query_time) as min_time, max(query_time) as max_time, avg(query_time) as avg_time, avg(rows_examined) as avg_rows_examined, avg(rows_sent) as avg_rows_sent, avg(Lock_Time) as avg_lock_time group by sql_cmd, dbidentifier\n| 5 as threshold // 
customize if need different value. As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where avg_time > threshold\n| sort by avg_time, frequency asc" - } - ], - "triggers": [ - { - "detectionMethod": "StaticCondition", - "triggerType": "Critical", - "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null - }, - { - "detectionMethod": "StaticCondition", - "triggerType": "ResolvedCritical", - "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -625,8 +605,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minutes time interval is high (>=5 seconds). 
High read latency will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -636,39 +616,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -723,7 +699,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -731,8 +707,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . 
High write latencies will affect the performance of your application.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -742,39 +718,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 95, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 95, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -782,8 +754,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 
minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -793,39 +765,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 20, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 20, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -833,8 +801,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS 
MySQL - High Authentication Failure", - "description": "This alert fires when we detect more then 10 authentication failure over a 5 minute time-period", + "name": "AWS Application Load Balancer - Targets Deregistered", + "description": "This alert fires when we detect greater than or equal to 1 target is de-registered over a 5 minute time-period.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -844,7 +812,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error \"Access denied for user\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \" [*] \" as LogLevel\n| parse field=message \" * [Note] Access denied for user '*'@'*' (using *: *)\" as requestid, user, host, authenticationType, flag nodrop\n| parse field=message \"[Warning] Access denied for user '*'@'*' (using *: *)\" as user, host, authenticationType, flag nodrop" + "query": "account=* region=* \"\\\"eventsource\\\":\\\"elasticloadbalancing.amazonaws.com\\\"\" \"errorCode\" \"2015-12-01\" \n| json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop \n| where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" \n| where namespace matches \"aws/applicationelb\" and event_name=\"DeregisterTargets\"\n" } ], "triggers": [ @@ -853,21 +821,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, + "resolutionWindow": "-5m", "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", + "threshold": 1, + "thresholdType": "LessThan", "field": null } ], - "timeZone": null, + "timeZone": "Asia/Kolkata", 
"notifications": [], "isDisabled": true, "groupNotifications": true, @@ -878,44 +846,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as 
Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "threshold": 80, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -929,8 +893,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -940,39 +904,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -980,44 +940,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. 
A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-5m", + "threshold": 50, + "thresholdType": "LessThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - 
"occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-5m", + "threshold": 50, + "thresholdType": "GreaterThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -1031,50 +987,44 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "Amazon RDS MySQL - Excessive Slow Query Detected", + "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over last 10 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*SlowQuery \"User@Host\" \"Query_time\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse regex field=message \"(?# User@Host:[\\S\\s]+?SET timestamp=\\d+;[\\S\\s]+?;)\" multi\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*\\[(?\\S*?)\\]\\s*Id:\\s*(?\\d*)\" nodrop\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*(?\\S+)\\s\\[(?\\S*?)\\]\\s+Id:\\s*(?\\d+)\"\n| parse regex field=query_block \"# 
Query_time:\\s+(?[\\d.]*)\\s+Lock_time:\\s+(?[\\d.]*)\\s+Rows_sent:\\s+(?[\\d]*)\\s+Rows_examined:\\s+(?[\\d]*)\" nodrop\n| parse regex field=query_block \"SET timestamp=(?\\d*);\\n(?[\\s\\S]*);\" nodrop\n| parse regex field=sql_cmd \"[^a-zA-Z]*(?[a-zA-Z]+)\\s*\"\n| fields -query_block\n| num (query_time)\n| count as frequency, sum(query_time) as total_time, min(query_time) as min_time, max(query_time) as max_time, avg(query_time) as avg_time, avg(rows_examined) as avg_rows_examined, avg(rows_sent) as avg_rows_sent, avg(Lock_Time) as avg_lock_time group by sql_cmd, dbidentifier\n| 5 as threshold // customize if need different value. As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where avg_time > threshold\n| sort by avg_time, frequency asc" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 80, + "timeRange": "-10m", + "threshold": 1, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 80, + "timeRange": "-10m", + "threshold": 1, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1082,8 +1032,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Throttling", - "description": "This alert fires when we detect a Lambda running into throttling within an interval of 10 minutes.", + "name": "AWS SQS - Queue has stopped 
receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1093,7 +1043,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Throttles statistic=average account=* region=* functionname=* Resource=* | avg by account, region,namespace, functionname " + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ @@ -1101,9 +1051,9 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 0, - "thresholdType": "GreaterThan", + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -1111,9 +1061,9 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 } @@ -1121,7 +1071,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1129,50 +1079,44 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). 
This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "Amazon RDS PostgreSQL - High Errors", + "description": "This alert fires when we detect high number (>10) of error/fatal logs in Postgres logs over a 5 minutes time period", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* (\"ERROR\" OR \"FATAL\")\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,threadid,user,database,processid,severity,msg \n| where severity IN (\"ERROR\", \"FATAL\") " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 10, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 10, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - 
"groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1180,8 +1124,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "Amazon RDS - Low Free Storage", + "description": "This alert fires when the average free storage space of a RDS instance is low (< 512MB) for an interval of 15 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1191,83 +1135,34 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/rds metric=FreeStorageSpace statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThanOrEqual", - "field": null, + "timeRange": "-15m", + "threshold": 512, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThan", - "field": null, + "timeRange": "-15m", + "threshold": 512, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - 
"triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "", - "sloId": null, - "monitorTemplateId": null, - "tags": null, - "automatedPlaybookIds": [] - }, - { - "name": "AWS API Gateway - High Integration Errors", - "description": "This alert fires where there are too many API requests (>5%) with integration errors within 5 minutes.", - "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "1m", - "alertName": null, - "runAs": null, - "notificationGroupFields": [], - "queries": [ - { - "rowId": "A", - "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId integrationError\n| json \"status\", \"integrationError\", \"apiid\", \"stage\" as status, integrationError, apiid, stage \n| if (!(integrationError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_integrationError\n| sum(is_integrationError) as integrationError_count, count as totalRequests by apiid, stage\n| (integrationError_count*100/totalRequests) as integrationError_percent\n| fields integrationError_percent, apiid, stage" - } - ], - "triggers": [ - { - "detectionMethod": "LogsStaticCondition", - "triggerType": "Critical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", - "field": "integrationError_percent" - }, - { - "detectionMethod": "LogsStaticCondition", - "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", - "field": "integrationError_percent" - } - ], - "timeZone": "Asia/Kolkata", - "notifications": [], - "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, @@ -1276,8 +1171,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory 
fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1287,90 +1182,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" - } - ], - "triggers": [ - { - "detectionMethod": "StaticCondition", - "triggerType": "Critical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 1.5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 - }, - { - "detectionMethod": "StaticCondition", - "triggerType": "ResolvedCritical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 1.5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 - } - ], - "timeZone": null, - "notifications": [], - "isDisabled": true, - "groupNotifications": false, - "playbook": "", - "sloId": null, - "monitorTemplateId": null, - "tags": null, - "automatedPlaybookIds": [] - }, - { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", - "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "0m", - "alertName": null, - "runAs": null, - 
"notificationGroupFields": [], - "queries": [ - { - "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1402,34 +1242,30 @@ ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": 
false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1437,10 +1273,10 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1448,39 +1284,33 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code) \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if 
(isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user, username) as user \n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1488,8 +1318,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Events", - "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", + "name": "Amazon RDS - Oracle Logs - Failed Connection Attempts", + 
"description": "This alert fires when we detect greater than or equal to 25 failed connection attempts over a 5 minute time-period.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -1499,7 +1329,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user, username) as 
user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent\n" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*listener establish \"PROGRAM=\" (\"SID=\" or \"SERVICE_NAME=\") and (\"\\nTNS-\" or \"TNS-\")\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse regex field=message \"\\* \\(CONNECT_DATA[\\s\\S]+?\\* establish \\* \\S+ \\* (?\\d+)\" nodrop\n| parse regex field=message \"CONNECT_DATA[\\s\\S]+?SERVICE_NAME=(?[^)]*)\\)[\\s\\S]+establish\" nodrop\n| parse regex field=message \"CONNECT_DATA[\\s\\S]+?service_name=(?[^)]*)\\)[\\s\\S]+establish\" nodrop\n| parse regex field=message \"CONNECT_DATA[\\s\\S]+?SID=(?[^)]*)\\)[\\s\\S]+establish\" nodrop\n| parse regex field=message \"CONNECT_DATA[\\s\\S]+?sid=(?[^)]*)\\)[\\s\\S]+establish\" nodrop\n| parse regex field=message \"CONNECT_DATA[\\s\\S]+?PROGRAM=(?[^)]*)\\)[\\s\\S]+?HOST=(?[^)]*)\\)[\\s\\S]+?USER=(?[^)]*)\\)\" nodrop\n| parse field=message \"(ADDRESS=(PROTOCOL=*)(HOST=*)(PORT=*))\" as clientProtocol, clientHost, clientPort nodrop\n| parse regex field=message \"(?TNS-\\d{5}): (?.*)\" nodrop\n| where status != \"0\"" } ], "triggers": [ @@ -1508,21 +1338,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", + "threshold": 25, + "thresholdType": "GreaterThanOrEqual", "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, + "resolutionWindow": "-5m", "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", + "threshold": 25, + "thresholdType": "LessThan", "field": null } ], - "timeZone": null, + "timeZone": "Asia/Colombo", "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -1533,8 +1363,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - 
High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1544,47 +1374,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", 
"resolutionWindow": null, "timeRange": "-5m", "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1592,8 +1418,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Notification to DLQ", - "description": "This alert fires when an SNS topic messages are moved to a dead-letter queue.", + "name": "AWS DynamoDB - High Write Throttle", + "description": "This alert fires when we detect that the total write throttle events for a dynamodb table is high (>5) for a time interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1603,7 +1429,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsRedrivenToDlq statistic=sum | sum by account, region, namespace, topicname " + "query": "account=* region=* namespace=aws/dynamodb tablename=* metric=WriteThrottleEvents statistic=sum | sum by account, region, namespace, tablename" } ], "triggers": [ @@ -1612,7 +1438,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 @@ -1622,7 +1448,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 @@ -1631,7 +1457,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1639,8 +1465,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Read 
Capacity", - "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1650,39 +1476,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": 
null, "monitorTemplateId": null, @@ -1690,8 +1512,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect the high integration latency for the API requests in a stage within 5 minutes. This alarm is recommended for WebSocket APIs by AWS, and optional for other APIs because they already have separate alarm recommendations for the Latency metric. You can correlate the IntegrationLatency metric value with the corresponding latency metric of your backend such as the Duration metric for Lambda integrations. This helps you determine whether the API backend is taking more time to process requests from clients due to performance issues or if there is some other overhead from initialization or cold start.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1701,39 +1523,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* Namespace=aws/apigateway metric=IntegrationLatency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 2000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, 
"timeRange": "-5m", - "threshold": 85, + "threshold": 2000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1741,18 +1559,18 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Authorizer Errors", - "description": "This alert fires where there are too many API requests (>5%) with authorizer errors within 5 minutes", + "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", - "evaluationDelay": "1m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId authorizerError\n| json \"status\", \"authorizerError\", \"apiid\", \"stage\" as status, authorizerError, apiid, stage \n| if (!(authorizerError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_authorizerError\n| sum(is_authorizerError) as is_authorizerError_count, count as totalRequests by apiid, stage\n| (is_authorizerError_count*100/totalRequests) as authorizerError_percent\n| fields authorizerError_percent, apiid, stage\n" + "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex 
\"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" } ], "triggers": [ @@ -1761,21 +1579,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 0, "thresholdType": "GreaterThan", - "field": "authorizerError_percent" + "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 0, "thresholdType": "LessThanOrEqual", - "field": "authorizerError_percent" + "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -1786,8 +1604,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 CW - High CPU Utilization", - "description": "This alert fires when the average CPU Utilization based on cloud watch metrics, within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1797,7 +1615,7 @@ "queries": [ { "rowId": "A", - 
"query": "account=* region=* namespace=aws/ec2 metric=CPUUtilization instanceid=* statistic=average | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" } ], "triggers": [ @@ -1806,8 +1624,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThan", + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -1816,8 +1634,8 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThanOrEqual", + "threshold": 3000, + "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -1825,7 +1643,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1833,38 +1651,40 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - High Errors", - "description": "This alert fires when we detect high rate (>10) of error/fatal logs in Postgres logs over a 5 minutes time period", + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* (\"ERROR\" OR \"FATAL\")\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as 
message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,threadid,user,database,processid,severity,msg \n| where severity IN (\"ERROR\", \"FATAL\") " + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", - "field": null + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 3000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -1878,38 +1698,40 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - High Authentication Failure", - "description": "This alert fires when we detect more than 10 authentication failure in Postgres logs over a 5 minute time-period", + "name": "AWS DynamoDB - High Account Provisioned Read Capacity", + "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* \"authentication failed\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| where msg matches \"*authentication failed*\"" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", - "field": null + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 80, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -1923,8 +1745,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Free Storage", - "description": "This alert fires when the average free storage space of a RDS instance is low (< 512MB) for an interval of 15 minutes.", + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert 
fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1934,7 +1756,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds metric=FreeStorageSpace statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ @@ -1942,9 +1764,9 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 512, - "thresholdType": "LessThan", + "timeRange": "-5m", + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -1952,9 +1774,9 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 512, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -1962,7 +1784,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1970,8 +1792,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Freeable Memory", - "description": "This alert fires when the average Freeable memory of an RDS instance is < 128 MB for an interval of 15 minutes. 
If this value is lower you may need to scale up to a larger instance class.", + "name": "AWS SNS - Notification to DLQ Failure", + "description": "This alert fires when an SNS topic has messages that couldn't be moved to a dead-letter queue.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1981,7 +1803,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds metric=FreeableMemory statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsFailedToRedriveToDlq statistic=sum | sum by account, region, namespace, topicname " } ], "triggers": [ @@ -1989,19 +1811,74 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 128, + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "GreaterThan", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount 
Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 10, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 128, - "thresholdType": "GreaterThan", + "timeRange": "-5m", + "threshold": 10, + "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -2009,7 +1886,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2017,8 +1894,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS MSSQL - Database observing authentication failures from multiple client IPs", - "description": "This alert fires when we detect more than or equal to 10 client IPs attempting authentication failures on the database over a 15-minute period.", + "name": "AWS SNS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -2028,7 +1905,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* 
_sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(client_ip) as unique_client_ip by dbidentifier\n| 10 as threshold\n| where unique_client_ip >= threshold\n| sort by unique_client_ip\n| fields - threshold" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), 
recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user_type, username) as user_type \n| count as ip_count by src_ip, event_name, region, accountid,user_type \n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip \n| where type=\"ip_address\" and malicious_confidence = \"high\" \n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name \n| replace(label_name, \"\\\"\",\" \") as label_name \n| if (isEmpty(actor), \"Unassigned\", actor) as actor \n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name" } ], "triggers": [ @@ -2036,22 +1913,22 @@ "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 1, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "GreaterThan", "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-15m", - "timeRange": "-15m", - "threshold": 1, - "thresholdType": "LessThan", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -2062,43 +1939,41 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect the high integration latency for the API requests in a stage within 5 minutes. 
This alarm is recommended for WebSocket APIs by AWS, and optional for other APIs because they already have separate alarm recommendations for the Latency metric.\nYou can correlate the IntegrationLatency metric value with the corresponding latency metric of your backend such as the Duration metric for Lambda integrations. This helps you determine whether the API backend is taking more time to process requests from clients due to performance issues or if there is some other overhead from initialization or cold start.", + "name": "AWS SQS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SQS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* Namespace=aws/apigateway metric=IntegrationLatency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, 
queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?<queuename>[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2000, - "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 5 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2000, - "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 5 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -2109,8 +1984,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS Classic Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx 
within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2120,47 +1995,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2168,8 +2039,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires 
when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "AWS DynamoDB - High Read Throttle", + "description": "This alert fires when we detect that the total read throttle events for a dynamodb table is high (>5) for a time interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2179,7 +2050,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "account=* region=* namespace=aws/dynamodb tablename=* metric=ReadThrottleEvents statistic=sum | sum by account, region, namespace, tablename" } ], "triggers": [ @@ -2188,8 +2059,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, - "thresholdType": "GreaterThanOrEqual", + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -2198,8 +2069,8 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, - "thresholdType": "LessThan", + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 } @@ -2207,7 +2078,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2215,8 +2086,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "Amazon Elasticache - High Engine CPU Utilization", + "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2226,39 +2097,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 90, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2266,18 +2133,18 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Server-Side Errors", - "description": 
"This alert fires where there are too many API requests (>5%) with server-side errors within 5 minutes.\nThis can be caused by 5xx errors from your integration, permission issues, or other factors preventing successful invocation of the integration, such as the integration being throttled or deleted.", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway (metric=5XX or metric=5xxError or metric=ExecutionError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ @@ -2286,26 +2153,26 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, + "threshold": 85, "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2313,8 +2180,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert fires when the average provisioned 
concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2324,39 +2191,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 85, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2364,8 +2227,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Memory Utilization", - "description": "This alert fires when we detect a Lambda execution with memory usage of more than 85% within 
an interval of 10 minutes.", + "name": "Amazon RDS PostgreSQL - Excessive Slow Query Detected", + "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over a 10 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -2375,7 +2238,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* Namespace=aws/lambda Memory Size Used\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| _sourceName as logStream | _sourceHost as logGroup\n| parse regex field=message \"REPORT\\s+RequestId:\\s+(?[^\\s]+)\\s+Duration:\\s+(?[^\\s]+)\\s+ms\\s+Billed Duration:\\s+(?[^\\s]+)\\s+ms\\s+Memory\\s+Size:\\s+(?[^\\s]+)\\s+MB\\s+Max\\s+Memory\\s+Used:\\s+(?[^\\s]+)\\s+MB\" \n| parse field=loggroup \"/aws/lambda/*\" as functionname\n| avg(MemorySize) as MemorySizeAvg, avg(MaxMemoryUsed) as MaxMemoryUsedAvg by functionname\n| (MaxMemoryUsedAvg/MemorySizeAvg)*100 as memoryUtilization\n| where memoryUtilization>85" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql\n| json \"message\" nodrop \n| if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| parse regex field=msg \"duration: (?[\\S]+) ms (?.+)\"\n| 5000 as threshold // customize if need different value. 
As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where execution_time_ms > threshold \n| count by dbidentifier, database" } ], "triggers": [ @@ -2383,7 +2246,7 @@ "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", + "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "field": null @@ -2391,8 +2254,8 @@ { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-10m", - "timeRange": "-10m", + "resolutionWindow": null, + "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "field": null @@ -2409,46 +2272,136 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - Low Traffic API", - "description": "This alert fires where there is low message traffic volume for the API within 5 minutes. \nThis can indicate an issue with the application calling the API such as using incorrect endpoints. It could also indicate an issue with the configuration or permissions of the API making it unreachable for clients. 
This alarm is not recommended for APIs that don't expect constant and consistent traffic.", + "name": "Amazon RDS MySQL - High Authentication Failure", + "description": "This alert fires when we detect more than 10 authentication failures over a 5 minute time-period", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway (metric=ConnectCount OR metric=Count) statistic=SampleCount account=* region=* apiname=* stage=* !(route=*) !(resource=*) | quantize using sum | sum by apiname, namespace, region, account, stage" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error \"Access denied for user\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \" [*] \" as LogLevel\n| parse field=message \" * [Note] Access denied for user '*'@'*' (using *: *)\" as requestid, user, host, authenticationType, flag nodrop\n| parse field=message \"[Warning] Access denied for user '*'@'*' (using *: *)\" as user, host, authenticationType, flag nodrop\n| count as event_count" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, - "thresholdType": "LessThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 10 + "timeRange": "-5m", + "threshold": 10, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, - "thresholdType": "GreaterThan", + "timeRange": "-5m", + "threshold": 10, + "thresholdType": "LessThanOrEqual", + "field": null + } + 
], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS API Gateway - High Client-Side Errors", + "description": "This alert fires where there are too many API requests (>5%) with client-side errors within 5 minutes. This can indicate an issue in the authorisation or client request parameters. It could also mean that a resource was removed or a client is requesting one that doesn't exist. Errors could also be caused by exceeding the configured throttling limit.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/apigateway (metric=4XX or metric=4xxError or metric=ClientError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 0.05, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 0.05, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 10 + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Classic Load Balancer - Deletion Alert", + "description": "This alert fires when we detect greater than or equal to 2 application load balancers are deleted 
over a 5 minute time-period.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* namespace=aws/elb \"\\\"eventsource\\\":\\\"elasticloadbalancing.amazonaws.com\\\"\" \"\\\"apiVersion\\\":\\\"2012-06-01\\\"\"\n| json \"eventSource\", \"eventName\" as event_source, event_name nodrop\n| where event_source = \"elasticloadbalancing.amazonaws.com\" \n| where event_name matches \"DeleteLoadBalancer\"" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 2, + "thresholdType": "GreaterThanOrEqual", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": "-5m", + "timeRange": "-5m", + "threshold": 2, + "thresholdType": "LessThan", + "field": null } ], "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2456,8 +2409,110 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS MSSQL - Authentication failures from the same client IP on multiple databases", - "description": "This alert fires when we detect specific client IP attempting authentication failures on more than or equal to 10 databases over a 15 minute time-period.", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum 
account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along functionname, account, region, namespace" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "Amazon RDS - Low Freeable Memory", + "description": "This alert fires when the average Freeable memory of an RDS instance is < 128 MB for an interval of 15 minutes. 
If this value is lower you may need to scale up to a larger instance class.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* namespace=aws/rds metric=FreeableMemory statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 128, + "thresholdType": "LessThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 128, + "thresholdType": "GreaterThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "Amazon RDS PostgreSQL - High Authentication Failure", + "description": "This alert fires when we detect more than 10 authentication failure in Postgres logs over a 5 minute time-period", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -2467,7 +2522,97 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(dbidentifier) as unique_db by client_ip\n| 10 as threshold\n| where unique_db >= threshold\n| sort by unique_db, client_ip asc\n| fields - threshold" + "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* \"authentication failed\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| where msg matches \"*authentication failed*\"" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 10, + "thresholdType": "GreaterThan", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 10, + "thresholdType": "LessThanOrEqual", + "field": null + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Lambda - High Memory Utilization", + "description": "This alert fires when we detect a Lambda execution with memory usage of more than 85% within an interval of 10 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* Namespace=aws/lambda Memory Size Used\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| _sourceName as logStream | _sourceHost as logGroup\n| parse regex field=message \"REPORT\\s+RequestId:\\s+(?[^\\s]+)\\s+Duration:\\s+(?[^\\s]+)\\s+ms\\s+Billed 
Duration:\\s+(?[^\\s]+)\\s+ms\\s+Memory\\s+Size:\\s+(?[^\\s]+)\\s+MB\\s+Max\\s+Memory\\s+Used:\\s+(?[^\\s]+)\\s+MB\" \n| parse field=loggroup \"/aws/lambda/*\" as functionname\n| avg(MemorySize) as MemorySizeAvg, avg(MaxMemoryUsed) as MaxMemoryUsedAvg by functionname\n| (MaxMemoryUsedAvg/MemorySizeAvg)*100 as memoryUtilization\n| where memoryUtilization>85" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") 
as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" } ], "triggers": [ @@ -2476,21 +2621,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-15m", - "threshold": 1, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-15m", + "resolutionWindow": null, "timeRange": "-15m", - "threshold": 1, + "threshold": 5, "thresholdType": "LessThan", "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -2501,8 +2646,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High WAF Errors", - "description": "This alert fires where there are too many API requests (>5%) with WAF errors within 5 minutes.", + "name": "AWS API Gateway - High WAF Latency", + "description": "This alert fires when we detect the high WAF latency for the REST and WebSocket API requests in a stage within 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "1m", @@ -2512,7 +2657,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId \n| json \"status\", \"apiid\", \"stage\", \"wafResponseCode\" as status, apiid, stage, wafResponseCode\n| if (wafResponseCode==\"WAF_BLOCK\" and !(status matches \"2*\"), 1, 0) as is_wafError\n| sum(is_wafError) as is_wafError_count, count as totalRequests by apiid, stage\n| (is_wafError_count*100/totalRequests) as wafError_percent\n| fields wafError_percent, apiid, stage" + "query": "account=* region=* namespace=* apiname=* apiid stage domainname requestId wafLatency \n| json \"wafLatency\", \"apiId\", \"stage\" as wafLatency, apiid, stage \n| pct(wafLatency, 90) as wafLatency90th by apiid,stage" } ], "triggers": [ @@ -2521,21 +2666,21 @@ "triggerType": 
"Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1000, "thresholdType": "GreaterThan", - "field": "wafError_percent" + "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1000, "thresholdType": "LessThanOrEqual", - "field": "wafError_percent" + "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -2546,50 +2691,44 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity 
\"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2597,8 +2736,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - System Errors", - "description": "This alert fires when we detect system errors for a dynamodb table is high (>10) for a time interval of 5 minutes.", + "name": "Amazon RDS - High CPU Utilization", + 
"description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2608,7 +2747,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb metric=SystemErrors statistic=samplecount | sum " + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -2617,8 +2756,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -2627,8 +2766,8 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", + "threshold": 85, + "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -2636,7 +2775,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2644,8 +2783,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2655,15 +2794,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum 
by loadbalancername, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ @@ -2691,7 +2830,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2699,8 +2838,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -2710,39 +2849,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, 
"thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2750,8 +2885,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Read Throttle", - "description": "This alert fires when we detect that the total read throttle events for a dynamodb table is high (>5) for a time interval of 5 minutes.", + "name": "AWS DynamoDB - System Errors", + "description": "This alert fires when we detect system errors for a dynamodb table is high (>10) for a time interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2761,7 +2896,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb tablename=* metric=ReadThrottleEvents statistic=sum | sum by account, region, namespace, tablename" + "query": "account=* region=* namespace=aws/dynamodb metric=SystemErrors statistic=samplecount | sum" } ], "triggers": [ @@ -2770,7 +2905,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 @@ -2780,16 +2915,151 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 } ], - "timeZone": null, + "timeZone": null, + "notifications": [], + 
"isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Classic Load Balancer - Targets Deregistered", + "description": "This alert fires when we detect greater than or equal to 1 target is de-registered over a 5 minute time-period.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* namespace=aws/elb \"\"eventsource\":\"elasticloadbalancing.amazonaws.com\"\" \"\"apiVersion\":\"2012-06-01\"\" \n| json \"eventSource\", \"eventName\" as event_source, event_name nodrop \n| where event_source = \"elasticloadbalancing.amazonaws.com\" \n| where event_name matches \"DeregisterInstancesFromLoadBalancer\"" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": "-5m", + "timeRange": "-5m", + "threshold": 1, + "thresholdType": "LessThan", + "field": null + } + ], + "timeZone": "Asia/Kolkata", + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS API Gateway - High Integration Errors", + "description": "This alert fires where there are too many API requests (>5%) with integration errors within 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "1m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* 
namespace=aws/apigateway apiname=* apiid stage domainname requestId integrationError\n| json \"status\", \"integrationError\", \"apiid\", \"stage\" as status, integrationError, apiid, stage \n| if (!(integrationError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_integrationError \n| sum(is_integrationError) as integrationError_count, count as totalRequests by apiid, stage \n| (integrationError_count*100/totalRequests) as integrationError_percent \n| fields integrationError_percent, apiid, stage" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Network Load Balancer - Targets Deregistered", + "description": "This alert fires when we detect greater than or equal to 1 target is de-registered over a 5 minute time-period.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* \"\\\"eventsource\\\":\\\"elasticloadbalancing.amazonaws.com\\\"\" \"errorCode\" \"2015-12-01\" | json \"eventSource\", \"eventName\",\"apiVersion\" as event_source, event_name, api_version nodrop \n| where event_source = \"elasticloadbalancing.amazonaws.com\" and api_version matches \"2015-12-01\" and namespace matches \"aws/networkelb\"\n| where event_name matches \"DeregisterTargets\"" + } + ], + 
"triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": "-5m", + "timeRange": "-5m", + "threshold": 1, + "thresholdType": "LessThan", + "field": null + } + ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2797,58 +3067,44 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "Amazon RDS - Oracle Logs - DB Crash", + "description": "This alert fires when we detect greater than or equal to 1 Oracle DB crash over a 5 minute time-period.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*alert ORA-*\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message \n| parse regex field=message \"(?ORA-\\d{5}): (?.*)\" multi\n| count" } ], "triggers": [ { - "detectionMethod": 
"StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": "_count" }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, + "resolutionWindow": "-5m", "timeRange": "-5m", - "threshold": 5, + "threshold": 1, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": "_count" } ], - "timeZone": null, + "timeZone": "Asia/Colombo", "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2856,8 +3112,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Queue has stopped receiving messages", - "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect the high Latency in a stage within 5 minutes for REST and HTTP API. Find the IntegrationLatency metric value to check the API backend latency. If the two metrics are mostly aligned, the API backend is the source of higher latency and you should investigate there for issues. 
View this metric per resource and method and narrow down the source of the latency.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2867,7 +3123,7 @@ "queries": [ { "rowId": "A", - "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "account=* region=* Namespace=aws/apigateway metric=Latency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" } ], "triggers": [ @@ -2875,21 +3131,21 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "LessThan", + "timeRange": "-5m", + "threshold": 2500, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 2500, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], "timeZone": null, @@ -2903,8 +3159,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Message processing not fast enough", - "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. 
We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2914,7 +3170,7 @@ "queries": [ { "rowId": "A", - "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ @@ -2923,20 +3179,20 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", + "threshold": 90, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", + "threshold": 90, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], "timeZone": null, @@ -2950,8 +3206,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 CW - Status Check Failed", - "description": "This alert fires when there is a status check failures within a 5 minute interval for an EC2 instance.", + "name": "AWS API Gateway - Low Traffic API", + "description": "This alert fires where there is low message traffic volume for the API within 5 minutes. This can indicate an issue with the application calling the API such as using incorrect endpoints. It could also indicate an issue with the configuration or permissions of the API making it unreachable for clients. 
This alarm is not recommended for APIs that don't expect constant and consistent traffic.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2961,7 +3217,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/ec2 instanceid=* metric=StatusCheckFailed statistic=maximum | filter latest=1 | count by account, region, namespace,instanceid " + "query": "Namespace=aws/apigateway (metric=ConnectCount OR metric=Count) statistic=SampleCount account=* region=* apiname=* stage=* !(route=*) !(resource=*) | quantize using sum | sum by apiname, namespace, region, account, stage" } ], "triggers": [ @@ -2969,9 +3225,9 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "timeRange": "-10m", + "threshold": 1, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -2979,9 +3235,9 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "timeRange": "-10m", + "threshold": 1, + "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -2989,7 +3245,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2997,8 +3253,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). 
This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3008,47 +3264,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 80, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", - "field": null, + "threshold": 80, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -3056,50 +3300,44 @@ "automatedPlaybookIds": [] }, { - 
"name": "Amazon Elasticache - High Engine CPU Utilization", - "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", + "name": "AWS API Gateway - High Authorizer Errors", + "description": "This alert fires where there are too many API requests (>5%) with authorizer errors within 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "1m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId authorizerError\n| json \"status\", \"authorizerError\", \"apiid\", \"stage\" as status, authorizerError, apiid, stage \n| if (!(authorizerError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_authorizerError \n| sum(is_authorizerError) as is_authorizerError_count, count as totalRequests by apiid, stage \n| (is_authorizerError_count*100/totalRequests) as authorizerError_percent \n| fields authorizerError_percent, apiid, stage" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": 
"ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -3107,8 +3345,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "AWS SNS - Notification to DLQ", + "description": "This alert fires when an SNS topic messages are moved to a dead-letter queue.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3118,39 +3356,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsRedrivenToDlq statistic=sum | sum by account, region, namespace, topicname" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 0, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": 
"GreaterThan", - "field": null, + "threshold": 0, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -3158,8 +3392,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Client-Side Errors", - "description": "This alert fires where there are too many API requests (>5%) with client-side errors within 5 minutes. \nThis can indicate an issue in the authorisation or client request parameters. It could also mean that a resource was removed or a client is requesting one that doesn't exist. Errors could also be caused by exceeding the configured throttling limit.", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3169,7 +3403,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway (metric=4XX or metric=4xxError or metric=ClientError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ @@ -3178,26 +3412,26 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, - "thresholdType": "GreaterThanOrEqual", + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", 
"triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, - "thresholdType": "LessThan", + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -3205,40 +3439,38 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Messages not processed", - "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", + "name": "Amazon RDS MSSQL - Authentication failures from the same client IP on multiple databases", + "description": "This alert fires when we detect specific client IP attempting authentication failures on more than or equal to 10 databases over a 15 minute time-period.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(dbidentifier) as unique_db by client_ip\n| 10 as threshold\n| where unique_db >= threshold\n| sort by unique_db, client_ip asc\n| fields - threshold" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, + "timeRange": "-15m", + "threshold": 1, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 3 + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, + "timeRange": "-15m", + "threshold": 1, "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 3 + "field": null } ], "timeZone": null, @@ -3252,8 +3484,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect the high Latency in a stage within 5 minutes for REST and HTTP API.\nFind the IntegrationLatency metric value to check the API backend latency. If the two metrics are mostly aligned, the API backend is the source of higher latency and you should investigate there for issues. 
View this metric per resource and method and narrow down the source of the latency.", + "name": "AWS EC2 CW - Status Check Failed", + "description": "This alert fires when there is a status check failures within a 5 minute interval for an EC2 instance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3263,7 +3495,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* Namespace=aws/apigateway metric=Latency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "account=* region=* namespace=aws/ec2 instanceid=* metric=StatusCheckFailed statistic=maximum | filter latest=1 | count by account, region, namespace,instanceid " } ], "triggers": [ @@ -3272,23 +3504,23 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2500, - "thresholdType": "GreaterThanOrEqual", + "threshold": 0, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2500, - "thresholdType": "LessThan", + "threshold": 0, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -3299,8 +3531,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS Lambda - Throttling", + "description": "This alert fires when we detect a Lambda running into throttling within an interval of 10 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3310,39 +3542,35 
@@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/lambda metric=Throttles statistic=average account=* region=* functionname=* Resource=* | avg by account, region,namespace, functionname " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -3350,44 +3578,38 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - Excessive Slow Query Detected", - "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over a 10 minutes.", + "name": "AWS API Gateway - High WAF Errors", + "description": "This alert fires where there are too many API requests (>5%) with WAF errors within 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", - "evaluationDelay": "0m", + "evaluationDelay": "1m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { 
"rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql\n| json \"message\" nodrop \n| if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| parse regex field=msg \"duration: (?[\\S]+) ms (?.+)\"\n| 5000 as threshold // customize if need different value. As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where execution_time_ms > threshold \n| count by dbidentifier, database" + "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId \n| json \"status\", \"apiid\", \"stage\", \"wafResponseCode\" as status, apiid, stage, wafResponseCode \n| if (wafResponseCode==\"WAF_BLOCK\" and !(status matches \"2*\"), 1, 0) as is_wafError \n| sum(is_wafError) as is_wafError_count, count as totalRequests by apiid, stage \n| (is_wafError_count*100/totalRequests) as wafError_percent \n| fields wafError_percent, apiid, stage" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "field": null } ], "timeZone": null, @@ -3401,50 +3623,46 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": "This alert fires when the average disk 
queue depth for a database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null,