From 909ca9eeed669a4ad0efdeb53e48b5fabb7d5610 Mon Sep 17 00:00:00 2001 From: Akhil Dangore Date: Thu, 27 Jun 2024 11:43:01 +0530 Subject: [PATCH 1/5] Added new mssql dashboard --- aws-observability/json/Rds-App.json | 817 +++++++++++++++++++++++++++- 1 file changed, 816 insertions(+), 1 deletion(-) diff --git a/aws-observability/json/Rds-App.json b/aws-observability/json/Rds-App.json index 43b00a79..31d202dd 100644 --- a/aws-observability/json/Rds-App.json +++ b/aws-observability/json/Rds-App.json @@ -5215,7 +5215,7 @@ "coloringRules": null, "linkedDashboards": [ { - "id": "Abkssnc1TyrHnPDUrUpid0NA6dsdUjk0Fo5OMHkYWbxkWr6Xcv70xrLyCBT4", + "id": "TSs5j8Fdvs403tVwBZP4ZHPZzLpgzs6Yvjx88xxlzucFSOzQEPWqxhyK9ugj", "relativePath": "../09. Amazon RDS - MySQL Logs - Audit Log Analysis", "includeTimeRange": false, "includeVariables": false @@ -11402,6 +11402,821 @@ } ], "coloringRules": [] + }, + { + "type": "DashboardV2SyncDefinition", + "name": "18. Amazon RDS - MSSQL Logs - Error Logs - Logon Analysis", + "description": "The Amazon RDS - MSSQL Logs - Error Logs - Logon Analysis dashboard provides information about the error logs, including failed authentications and logon errors.", + "title": "18. 
Amazon RDS - MSSQL Logs - Error Logs - Logon Analysis", + "theme": "Light", + "topologyLabelMap": { + "data": { + "ffe0d04967abc0c87d695d4a7f2700e0": [ + "*" + ], + "namespace": [ + "aws/rds" + ], + "region": [ + "*" + ], + "f049c8a107a343b5188930219d3063f5": [ + "*" + ], + "dbidentifier": [ + "*" + ], + "account": [ + "*" + ] + } + }, + "refreshInterval": 0, + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "layout": { + "layoutType": "Grid", + "layoutStructures": [ + { + "key": "panelPANE-5556565F9FF73B4A", + "structure": "{\"height\":5,\"width\":6,\"x\":0,\"y\":0}" + }, + { + "key": "panelPANE-C97F5EDEB22FB84E", + "structure": "{\"height\":8,\"width\":16,\"x\":0,\"y\":10}" + }, + { + "key": "panelPANE-1359B3B4A25FF947", + "structure": "{\"height\":10,\"width\":8,\"x\":16,\"y\":0}" + }, + { + "key": "panel2C0BFEAFBAC60849", + "structure": "{\"height\":8,\"width\":8,\"x\":16,\"y\":10}" + }, + { + "key": "panel1B3270DCA521BA4A", + "structure": "{\"height\":8,\"width\":8,\"x\":16,\"y\":18}" + }, + { + "key": "panelPANE-28DC265CAE0DEB4F", + "structure": "{\"height\":8,\"width\":16,\"x\":0,\"y\":18}" + }, + { + "key": "panelA414E941A506CB41", + "structure": "{\"height\":10,\"width\":10,\"x\":6,\"y\":0}" + }, + { + "key": "panel4F5E4422BA291842", + "structure": "{\"height\":5,\"width\":6,\"x\":0,\"y\":5}" + } + ] + }, + "panels": [ + { + "id": null, + "key": "panelPANE-5556565F9FF73B4A", + "title": "Failed Authentication Attempts", + "visualSettings": 
"{\"general\":{\"mode\":\"singleValueMetrics\",\"type\":\"svp\",\"displayType\":\"default\",\"roundDataPoints\":true},\"title\":{\"fontSize\":14},\"svp\":{\"option\":\"Latest\",\"unitify\":false,\"textColor\":\"\",\"backgroundColor\":\"\",\"label\":\"\",\"useBackgroundColor\":false,\"useNoData\":false,\"noDataString\":\"\",\"hideData\":false,\"hideLabel\":false,\"rounding\":0,\"valueFontSize\":24,\"labelFontSize\":14,\"thresholds\":[{\"from\":null,\"to\":1,\"color\":\"#16943E\"},{\"from\":1,\"to\":5,\"color\":\"#DFBE2E\"},{\"from\":5,\"to\":null,\"color\":\"#BF2121\"}],\"sparkline\":{\"show\":false,\"color\":\"#222D3B\"},\"gauge\":{\"show\":false,\"min\":0,\"max\":100,\"showThreshold\":false,\"showThresholdMarker\":false}},\"series\":{},\"legend\":{\"enabled\":false}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and client_ip matches \"{{client_ip}}\"\n| count as eventCount", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelPANE-C97F5EDEB22FB84E", + "title": "Failed Authentication - Details", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and client_ip matches \"{{client_ip}}\"\n| timeslice 1s\n| count as frequency by _timeslice, user, dbidentifier, reason, client_ip\n| sort by _timeslice\n", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelPANE-1359B3B4A25FF947", + "title": "Failed Authentication - User Location", + "visualSettings": "{\"general\":{\"mode\":\"map\",\"type\":\"map\",\"displayType\":\"default\",\"roundDataPoints\":true},\"title\":{\"fontSize\":14},\"series\":{},\"legend\":{\"enabled\":false}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and !isBlank(client_ip) and client_ip matches \"{{client_ip}}\"\n| count by client_ip\n| lookup latitude, longitude from geo://location on ip = client_ip\n| where !isNull(latitude)", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel2C0BFEAFBAC60849", + "title": "Failed Auth Attempts by User", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and client_ip matches \"{{client_ip}}\"\n| count as frequency by user\n| sort by frequency, user asc", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel1B3270DCA521BA4A", + "title": "Failed Auth Attempts by Client IP", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and client_ip matches \"{{client_ip}}\"\n| count as frequency by client_ip\n| sort by frequency, client_ip asc", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelPANE-28DC265CAE0DEB4F", + "title": "Logon Errors", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Error\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"Error: *, Severity: *, State: *.\" as error_code, severity, state\n| count as frequency by dbidentifier, error_code, severity, state\n| sort by severity, frequency", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + 
"outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelA414E941A506CB41", + "title": "Failed Authentication Attempts - Trend", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false,\"title\":\"Failed Auth Attempts\"}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"column\",\"displayType\":\"stacked\",\"roundDataPoints\":true,\"fillOpacity\":1,\"mode\":\"timeSeries\"},\"overrides\":[]}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and client_ip matches \"{{client_ip}}\"\n| timeslice 15m\n| count as failedLogonAttempts by _timeslice, dbidentifier\n| transpose row _timeslice column dbidentifier", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel4F5E4422BA291842", + "title": "Failed Authentication Attempts by dbidentifier", + "visualSettings": "{\"title\":{\"fontSize\":14},\"general\":{\"type\":\"pie\",\"displayType\":\"default\",\"roundDataPoints\":true,\"fillOpacity\":1,\"startAngle\":270,\"innerRadius\":\"30%\",\"maxNumOfSlices\":10,\"mode\":\"distribution\"},\"legend\":{\"enabled\":false,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12}},\"series\":{}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| where user != \"rdsadmin\" and !isEmpty(user) and user matches \"{{user}}\"\n| where !isEmpty(client_ip) and client_ip matches \"{{client_ip}}\"\n| count as eventCount by dbidentifier\n| sort by eventCount, dbidentifier asc", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + } + ], + "variables": [ + { + "id": null, + "name": "account", + "displayName": "account", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "", + "key": "account" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "region", + "displayName": "region", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "account={{account}} region=*", + "key": "region" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "namespace", + "displayName": "namespace", + "defaultValue": "aws/rds", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "account={{account}} region={{region}} namespace=aws/rds", + "key": "namespace" + }, + "allowMultiSelect": false, + "includeAllOption": false, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "dbidentifier", + "displayName": "dbidentifier", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "account={{account}} region={{region}} namespace={{namespace}}", + "key": "dbidentifier" + }, + 
"allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "user", + "displayName": "user", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "LogQueryVariableSourceDefinition", + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| parse \"* Logon Login failed for user '*'. Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count by user\n| sort by user asc", + "field": "user" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "client_ip", + "displayName": "client_ip", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "LogQueryVariableSourceDefinition", + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| parse \"* Logon Login failed for user '*'. Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count by client_ip\n| sort by client_ip asc", + "field": "client_ip" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + } + ], + "coloringRules": [] + }, + { + "type": "DashboardV2SyncDefinition", + "name": "19. Amazon RDS - MSSQL Logs - Error Logs - Infrastructure Overview", + "description": "The Amazon RDS - MSSQL Logs - Error Logs - Infrastructure Overview dashboard provides details for hardware, authentication mode, collation, process, recent SQL Server terminations, and newly created databases.", + "title": "19. 
Amazon RDS - MSSQL Logs - Error Logs - Infrastructure Overview", + "theme": "Light", + "topologyLabelMap": { + "data": { + "ffe0d04967abc0c87d695d4a7f2700e0": [ + "*" + ], + "namespace": [ + "aws/rds" + ], + "region": [ + "*" + ], + "f049c8a107a343b5188930219d3063f5": [ + "*" + ], + "dbidentifier": [ + "*" + ], + "account": [ + "*" + ] + } + }, + "refreshInterval": 0, + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "layout": { + "layoutType": "Grid", + "layoutStructures": [ + { + "key": "panelPANE-1B625C63AFF01A43", + "structure": "{\"height\":6,\"width\":6,\"x\":18,\"y\":0}" + }, + { + "key": "panelPANE-44D782B2B2738B48", + "structure": "{\"height\":6,\"width\":6,\"x\":5,\"y\":0}" + }, + { + "key": "panel76EEA97B9A431944", + "structure": "{\"height\":6,\"width\":9,\"x\":7,\"y\":6}" + }, + { + "key": "panelAFCBC601BB5F4A4D", + "structure": "{\"height\":6,\"width\":8,\"x\":16,\"y\":6}" + }, + { + "key": "panel9F73C967AECF9A4C", + "structure": "{\"height\":6,\"width\":5,\"x\":0,\"y\":0}" + }, + { + "key": "panel227CB3299FF30948", + "structure": "{\"height\":6,\"width\":14,\"x\":10,\"y\":12}" + }, + { + "key": "panelE0A53C29AD3B3840", + "structure": "{\"height\":6,\"width\":7,\"x\":11,\"y\":0}" + }, + { + "key": "panelPANE-E8CD03B485782844", + "structure": "{\"height\":6,\"width\":7,\"x\":0,\"y\":6}" + }, + { + "key": "panel7B61E36EAF144940", + "structure": "{\"height\":6,\"width\":10,\"x\":0,\"y\":12}" + } + ] + }, + "panels": [ + { + "id": null, + "key": "panelPANE-1B625C63AFF01A43", + "title": "Configured Authentication mode", + "visualSettings": 
"{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Authentication mode\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"Authentication mode is *.\" as mode\n| withtime mode\n| most_recent(mode_withtime) as mode by dbidentifier", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelPANE-44D782B2B2738B48", + "title": "DB Setup Details", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical 
Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error System Manufacturer\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"System Manufacturer: '*', System Model: '*'\" as service, instance_class\n| concat(service, \" - \", instance_class) as db_setup\n| withtime db_setup\n| most_recent(db_setup_withtime) as instance_class by dbidentifier", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel76EEA97B9A431944", + "title": "DB Process IDs", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} 
_sourceHost=/aws/rds/*Error SQL Server has been using a process ID\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"This instance of SQL Server has been using a process ID of * since * (local) * (UTC)\" as process_id, local_created_time, utc_created_time\n| withtime process_id\n| most_recent(process_id_withtime) as process_id by dbidentifier, process_id, utc_created_time", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelAFCBC601BB5F4A4D", + "title": "DB Collation Details", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Default collation\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"Default collation: *\" as collation\n| withtime collation\n| 
most_recent(collation_withtime) as collation by dbidentifier, collation", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel9F73C967AECF9A4C", + "title": "DB Instance Type", + "visualSettings": "{\"title\":{\"fontSize\":14},\"general\":{\"type\":\"pie\",\"displayType\":\"default\",\"roundDataPoints\":true,\"fillOpacity\":1,\"startAngle\":270,\"innerRadius\":\"30%\",\"maxNumOfSlices\":10,\"mode\":\"distribution\"},\"legend\":{\"enabled\":false,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12}},\"series\":{}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error System Manufacturer\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"System Manufacturer: '*', System Model: '*'\" as service, instance_class\n| withtime instance_class\n| most_recent(instance_class_withtime) as instance_class by dbidentifier\n| count by instance_class\n| sort by _count, instance_class asc", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + 
} + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel227CB3299FF30948", + "title": "Recently Created Databases", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Starting up database\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"Starting up database '*'.\" as db_name\n| dedup by db_name, dbidentifier\n| values(db_name) as db_name by dbidentifier", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelE0A53C29AD3B3840", + "title": "SQL Server Versions", + "visualSettings": 
"{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error Microsoft SQL Server\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse regex field=message \"Microsoft SQL Server (?<version>\\d{4})\" \n| parse regex field=message \"\\n(?<edition>[\\w\\s]+ Edition \\(\\d+-bit\\))\"\n| trim(edition) as edition\n| withtime edition \n| most_recent(edition_withtime) as edition by dbidentifier, version", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panelPANE-E8CD03B485782844", + "title": "DBCC CHECK DB", + "visualSettings": 
"{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error DBCC CHECKDB\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"DBCC CHECKDB (rdsadmin) WITH all_errormsgs, no_infomsgs, tableresults executed by NT AUTHORITY\\\\SYSTEM found * errors and repaired * errors.\" as error, repaired_error\n| count as frequency by dbidentifier, error, repaired_error", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": null, + "coloringRules": null, + "linkedDashboards": [] + }, + { + "id": null, + "key": "panel7B61E36EAF144940", + "title": "Recently Terminated SQL Servers", + "visualSettings": "{\"title\":{\"fontSize\":14},\"axes\":{\"axisX\":{\"titleFontSize\":12,\"labelFontSize\":12},\"axisY\":{\"titleFontSize\":12,\"labelFontSize\":12,\"logarithmic\":false}},\"legend\":{\"enabled\":true,\"verticalAlign\":\"bottom\",\"fontSize\":12,\"maxHeight\":50,\"showAsTable\":false,\"wrap\":true},\"color\":{\"family\":\"Categorical 
Default\"},\"series\":{},\"general\":{\"type\":\"table\",\"displayType\":\"default\",\"roundDataPoints\":true,\"paginationPageSize\":100,\"fontSize\":12,\"mode\":\"timeSeries\"}}", + "keepVisualSettingsConsistentWithParent": true, + "panelType": "SumoSearchPanel", + "queries": [ + { + "transient": false, + "queryString": "account={{account}} region={{region}} namespace={{namespace}} dbidentifier={{dbidentifier}} _sourceHost=/aws/rds/*Error SQL Server is terminating\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| withtime dbidentifier \n| most_recent(dbidentifier_withtime) as dbidentifier by dbidentifier", + "queryType": "Logs", + "queryKey": "A", + "metricsQueryMode": null, + "metricsQueryData": null, + "tracesQueryData": null, + "spansQueryData": null, + "parseMode": "Auto", + "timeSource": "Message", + "outputCardinalityLimit": 1000 + } + ], + "description": "", + "timeRange": { + "type": "BeginBoundedTimeRange", + "from": { + "type": "RelativeTimeRangeBoundary", + "relativeTime": "-1d" + }, + "to": null + }, + "coloringRules": null, + "linkedDashboards": [] + } + ], + "variables": [ + { + "id": null, + "name": "account", + "displayName": "account", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "", + "key": "account" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "region", + "displayName": "region", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "account={{account}} region=*", + "key": "region" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "namespace", + "displayName": "namespace", + "defaultValue": "aws/rds", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": 
"account={{account}} region={{region}} namespace=aws/rds", + "key": "namespace" + }, + "allowMultiSelect": false, + "includeAllOption": false, + "hideFromUI": false, + "valueType": "Any" + }, + { + "id": null, + "name": "dbidentifier", + "displayName": "dbidentifier", + "defaultValue": "*", + "sourceDefinition": { + "variableSourceType": "MetadataVariableSourceDefinition", + "filter": "account={{account}} region={{region}} namespace={{namespace}}", + "key": "dbidentifier" + }, + "allowMultiSelect": false, + "includeAllOption": true, + "hideFromUI": false, + "valueType": "Any" + } + ], + "coloringRules": [] } ] } \ No newline at end of file From c0ba991402974bd85a44cd8657ce0b6c3b029635 Mon Sep 17 00:00:00 2001 From: Akhil Dangore Date: Thu, 27 Jun 2024 14:35:06 +0530 Subject: [PATCH 2/5] Added monitors for MSSQL in CF --- aws-observability/json/Alerts-App.json | 1895 +++++++++++++----------- 1 file changed, 1064 insertions(+), 831 deletions(-) diff --git a/aws-observability/json/Alerts-App.json b/aws-observability/json/Alerts-App.json index 51b332ca..9949c477 100644 --- a/aws-observability/json/Alerts-App.json +++ b/aws-observability/json/Alerts-App.json @@ -4,18 +4,18 @@ "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "AWS API Gateway - High Authorizer Errors", - "description": "This alert fires where there are too many API requests (>5%) with authorizer errors within 5 minutes", + "name": "AWS SNS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", - "evaluationDelay": "1m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId authorizerError\n| json \"status\", \"authorizerError\", \"apiid\", 
\"stage\" as status, authorizerError, apiid, stage \n| if (!(authorizerError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_authorizerError\n| sum(is_authorizerError) as is_authorizerError_count, count as totalRequests by apiid, stage\n| (is_authorizerError_count*100/totalRequests) as authorizerError_percent\n| fields authorizerError_percent, apiid, stage\n" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), 
\"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name\n" } ], "triggers": [ @@ -24,21 +24,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 0, "thresholdType": "GreaterThan", - "field": "authorizerError_percent" + "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 0, "thresholdType": "LessThanOrEqual", - "field": "authorizerError_percent" + "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -49,8 +49,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Client-Side Errors", - "description": "This alert fires where there are too many API requests (>5%) with client-side errors within 5 minutes. \nThis can indicate an issue in the authorisation or client request parameters. It could also mean that a resource was removed or a client is requesting one that doesn't exist. 
Errors could also be caused by exceeding the configured throttling limit.", + "name": "AWS DynamoDB - High Write Throttle", + "description": "This alert fires when we detect that the total write throttle events for a dynamodb table is high (>5) for a time interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -60,7 +60,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway (metric=4XX or metric=4xxError or metric=ClientError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "account=* region=* namespace=aws/dynamodb tablename=* metric=WriteThrottleEvents statistic=sum | sum by account, region, namespace, tablename" } ], "triggers": [ @@ -69,23 +69,23 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, - "thresholdType": "GreaterThanOrEqual", + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, - "thresholdType": "LessThan", + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 5 + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -96,18 +96,18 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Errors", - "description": "This alert fires where there are too many API requests (>5%) with integration errors within 5 minutes.", + "name": "AWS SQS - Access from highly malicious sources", + "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", 
"monitorType": "Logs", - "evaluationDelay": "1m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId integrationError\n| json \"status\", \"integrationError\", \"apiid\", \"stage\" as status, integrationError, apiid, stage \n| if (!(integrationError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_integrationError\n| sum(is_integrationError) as integrationError_count, count as totalRequests by apiid, stage\n| (integrationError_count*100/totalRequests) as integrationError_percent\n| fields integrationError_percent, apiid, stage" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, 
\"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" } ], "triggers": [ @@ -116,21 +116,21 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 0, "thresholdType": "GreaterThan", - "field": "integrationError_percent" + "field": null }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 0, "thresholdType": "LessThanOrEqual", - "field": "integrationError_percent" + "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -141,43 +141,41 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect the high integration latency for the API requests in a stage within 5 minutes. This alarm is recommended for WebSocket APIs by AWS, and optional for other APIs because they already have separate alarm recommendations for the Latency metric.\nYou can correlate the IntegrationLatency metric value with the corresponding latency metric of your backend such as the Duration metric for Lambda integrations. 
This helps you determine whether the API backend is taking more time to process requests from clients due to performance issues or if there is some other overhead from initialization or cold start.", + "name": "Amazon RDS PostgreSQL - Statement Timeouts", + "description": "This alert fires when we detect Postgres logs show statement timeouts", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* Namespace=aws/apigateway metric=IntegrationLatency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql \"statement timeout\" | json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message | parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg | count by dbidentifier, database" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2000, - "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 5 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2000, - "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 5 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -188,43 +186,47 @@ 
"automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect the high Latency in a stage within 5 minutes for REST and HTTP API.\nFind the IntegrationLatency metric value to check the API backend latency. If the two metrics are mostly aligned, the API backend is the source of higher latency and you should investigate there for issues. View this metric per resource and method and narrow down the source of the latency.", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* Namespace=aws/apigateway metric=Latency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches 
\"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 2500, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 5 + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 2500, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 5 + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -235,8 +237,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Server-Side Errors", - "description": "This alert fires where there are too many API requests (>5%) with server-side errors within 5 minutes.\nThis can be caused by 5xx errors from your integration, permission issues, or other factors preventing successful invocation of the integration, such as the integration being throttled or deleted.", + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minutes time 
interval is high (>=5 seconds). High read latency will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -246,32 +248,36 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway (metric=5XX or metric=5xxError or metric=ExecutionError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", - "minDataPoints": 5 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0.05, + "threshold": 5, "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", - "minDataPoints": 5 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -282,41 +288,43 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High WAF Errors", - "description": "This alert fires where there are too many API requests (>5%) with WAF errors within 5 minutes.", + "name": "AWS SNS - Failed Notifications", + "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "1m", + "monitorType": "Metrics", + 
"evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId \n| json \"status\", \"apiid\", \"stage\", \"wafResponseCode\" as status, apiid, stage, wafResponseCode\n| if (wafResponseCode==\"WAF_BLOCK\" and !(status matches \"2*\"), 1, 0) as is_wafError\n| sum(is_wafError) as is_wafError_count, count as totalRequests by apiid, stage\n| (is_wafError_count*100/totalRequests) as wafError_percent\n| fields wafError_percent, apiid, stage" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 2, "thresholdType": "GreaterThan", - "field": "wafError_percent" + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 2, "thresholdType": "LessThanOrEqual", - "field": "wafError_percent" + "occurrenceType": "Always", + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -327,44 +335,46 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High WAF Latency", - "description": "This alert fires when we detect the high WAF latency for the REST and WebSocket API requests in a stage within 5 minutes.", + "name": "AWS SNS - Notification to DLQ Failure", + "description": "This alert fires when an SNS topic has messages that couldn't be moved to a dead-letter queue.", "type": 
"MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "1m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=* apiname=* apiid stage domainname requestId wafLatency\n| json \"wafLatency\", \"apiId\", \"stage\" as wafLatency, apiid, stage \n| pct(wafLatency, 90) as wafLatency90th by apiid,stage" + "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsFailedToRedriveToDlq statistic=sum | sum by account, region, namespace, topicname " } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 0, "thresholdType": "GreaterThan", - "field": "wafLatency90th" + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": "-5m", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 0, "thresholdType": "LessThanOrEqual", - "field": "wafLatency90th" + "occurrenceType": "Always", + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -372,43 +382,47 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - Low Traffic API", - "description": "This alert fires where there is low message traffic volume for the API within 5 minutes. \nThis can indicate an issue with the application calling the API such as using incorrect endpoints. It could also indicate an issue with the configuration or permissions of the API making it unreachable for clients. 
This alarm is not recommended for APIs that don't expect constant and consistent traffic.", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway (metric=ConnectCount OR metric=Count) statistic=SampleCount account=* region=* apiname=* stage=* !(route=*) !(resource=*) | quantize using sum | sum by apiname, namespace, region, account, stage" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, - "thresholdType": "LessThanOrEqual", + "timeRange": "-5m", + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", - "minDataPoints": 10 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, - "thresholdType": "GreaterThan", + "timeRange": "-5m", + "threshold": 85, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", - "minDataPoints": 10 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], - "timeZone": "Asia/Kolkata", + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -419,18 +433,18 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when 
the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" } ], "triggers": [ @@ -439,7 +453,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 3000, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -451,7 +465,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 3000, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -459,26 +473,29 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Events", - "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", + "name": "AWS API Gateway - High WAF Latency", + "description": "This alert fires when we detect the high WAF latency for the REST and WebSocket API requests in a stage within 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": 
"Logs", - "evaluationDelay": "0m", + "evaluationDelay": "1m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user, username) as user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, 
request_id, topicname, topic_arn, user_agent\n" + "query": "account=* region=* namespace=* apiname=* apiid stage domainname requestId wafLatency\n| json \"wafLatency\", \"apiId\", \"stage\" as wafLatency, apiid, stage \n| pct(wafLatency, 90) as wafLatency90th by apiid,stage" } ], "triggers": [ @@ -487,78 +504,78 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1000, "thresholdType": "GreaterThan", - "field": null + "field": "wafLatency90th" }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, + "resolutionWindow": "-5m", "timeRange": "-5m", - "threshold": 5, + "threshold": 1000, "thresholdType": "LessThanOrEqual", - "field": null + "field": "wafLatency90th" } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, 
"timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Access from Highly Malicious Sources", - "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", + "name": "Amazon RDS MySQL - Excessive Slow Query Detected", + "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over last 10 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -568,39 +585,48 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where 
event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name\n" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*SlowQuery \"User@Host\" \"Query_time\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse regex field=message \"(?# 
User@Host:[\\S\\s]+?SET timestamp=\\d+;[\\S\\s]+?;)\" multi\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*\\[(?\\S*?)\\]\\s*Id:\\s*(?\\d*)\" nodrop\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*(?\\S+)\\s\\[(?\\S*?)\\]\\s+Id:\\s*(?\\d+)\"\n| parse regex field=query_block \"# Query_time:\\s+(?[\\d.]*)\\s+Lock_time:\\s+(?[\\d.]*)\\s+Rows_sent:\\s+(?[\\d]*)\\s+Rows_examined:\\s+(?[\\d]*)\" nodrop\n| parse regex field=query_block \"SET timestamp=(?\\d*);\\n(?[\\s\\S]*);\" nodrop\n| parse regex field=sql_cmd \"[^a-zA-Z]*(?[a-zA-Z]+)\\s*\"\n| fields -query_block\n| num (query_time)\n| count as frequency, sum(query_time) as total_time, min(query_time) as min_time, max(query_time) as max_time, avg(query_time) as avg_time, avg(rows_examined) as avg_rows_examined, avg(rows_sent) as avg_rows_sent, avg(Lock_Time) as avg_lock_time group by sql_cmd, dbidentifier\n| 5 as threshold // customize if need different value. As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where avg_time > threshold\n| sort by avg_time, frequency asc" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "timeRange": "-10m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "timeRange": "-10m", + "threshold": 1, + "thresholdType": "LessThan", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null } ], 
+ "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -610,15 +636,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -627,7 +645,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -639,7 +657,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "LessThan", "field":
null, "occurrenceType": "Always", @@ -647,16 +665,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "AWS Classic Load Balancer - High 4XX Errors", + "description": "This alert fires when there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -666,45 +687,52 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries",
"minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", + "name": "Amazon RDS - High Write Latency", + "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . 
High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -714,7 +742,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -723,8 +751,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -735,24 +763,27 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", + "threshold": 5, + "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). 
The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -762,7 +793,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ @@ -771,8 +802,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThanOrEqual", + "threshold": 90, + "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -783,70 +814,74 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThan", + "threshold": 90, + "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Message processing not fast enough", - "description": "This alert fires when we detect message processing is not fast enough. 
That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", + "name": "Amazon RDS MySQL - High Authentication Failure", + "description": "This alert fires when we detect more than 10 authentication failures over a 5 minute time-period", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error \"Access denied for user\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \" [*] \" as LogLevel\n| parse field=message \" * [Note] Access denied for user '*'@'*' (using *: *)\" as requestid, user, host, authenticationType, flag nodrop\n| parse field=message \"[Warning] Access denied for user '*'@'*' (using *: *)\" as user, host, authenticationType, flag nodrop" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "GreaterThan", - "occurrenceType": "Always", - "minDataPoints": 3 + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "LessThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 3 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, -
"monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -854,7 +889,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" } ], "triggers": [ @@ -863,36 +898,39 @@ "triggerType": 
"Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", + "threshold": 0, + "thresholdType": "GreaterThan", "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", + "threshold": 0, + "thresholdType": "LessThanOrEqual", "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Notifications", - "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). 
When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -902,41 +940,99 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, - "thresholdType": "GreaterThan", + "threshold": 95, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, - "thresholdType": "LessThanOrEqual", + "threshold": 95, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": false, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": 
"account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + } + ], + "triggers": [ + { + "detectionMethod": "StaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null + }, + { + "detectionMethod": "StaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 5, + "thresholdType": "LessThan", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null + } + ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). 
High read latency will affect the performance of your application.", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -946,7 +1042,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -955,7 +1051,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -967,7 +1063,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -975,16 +1071,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Messages not processed", - "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed).
That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", + "name": "AWS Lambda - Throttling", + "description": "This alert fires when we detect a Lambda running into throttling within an interval of 10 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -994,7 +1093,7 @@ "queries": [ { "rowId": "A", - "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "Namespace=aws/lambda metric=Throttles statistic=average account=* region=* functionname=* Resource=* | avg by account, region,namespace, functionname " } ], "triggers": [ @@ -1002,33 +1101,36 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, - "thresholdType": "LessThan", + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires 
when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). This indicates that a lower percentage of requests were served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1038,15 +1140,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -1055,8 +1149,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", + "threshold": 50, + "thresholdType": "LessThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -1067,24 +1161,27 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", + "threshold": 50, + "thresholdType": "GreaterThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many
TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1094,15 +1191,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ @@ -1111,8 +1200,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", + "threshold": 80, + "thresholdType": "LessThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -1123,78 +1212,127 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", + "threshold": 80, + "thresholdType": "GreaterThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, 
- "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Queue has stopped receiving messages", - "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", + "name": "AWS API Gateway - High Integration Errors", + "description": "This alert fires where there are too many API requests (>5%) with integration errors within 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "1m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId integrationError\n| json \"status\", \"integrationError\", \"apiid\", \"stage\" as status, integrationError, apiid, stage \n| if (!(integrationError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_integrationError\n| sum(is_integrationError) as integrationError_count, count as totalRequests by apiid, stage\n| (integrationError_count*100/totalRequests) as integrationError_percent\n| fields integrationError_percent, apiid, stage" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 3 + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThan", + "field": "integrationError_percent" }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": 
"ResolvedCritical", + "resolutionWindow": "-5m", + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": "integrationError_percent" + } + ], + "timeZone": "Asia/Kolkata", + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "Amazon RDS MSSQL - Authentication failures from the same client IP on multiple databases", + "description": "This alert fires when we detect specific client IP attempting authentication failures on more than or equal to 10 databases over a 15 minute time-period.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(dbidentifier) as unique_db by client_ip\n| 10 as threshold\n| where unique_db >= threshold\n| sort by unique_db, client_ip asc\n| fields - threshold" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-30m", + "timeRange": "-15m", "threshold": 1, - "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 3 + "thresholdType": "GreaterThan", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": "-15m", + "timeRange": "-15m", + "threshold": 1, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory fragmentation ratio within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ @@ -1203,7 +1341,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1.5, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1215,7 +1353,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1.5, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1223,18 +1361,21 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", + "monitorType": "Metrics", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1242,7 +1383,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json 
\"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ @@ -1250,37 +1391,40 @@ "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "LessThan", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 4XX Errors", - "description": "This 
alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS Application Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1290,59 +1434,66 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, 
"notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ @@ -1351,7 +1502,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1363,7 +1514,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1371,64 +1522,64 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + 
"monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, 
req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user, username) as user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent\n" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - 
"name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1438,7 +1589,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" } ], "triggers": [ @@ -1447,7 +1606,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1459,7 +1618,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 10, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1467,16 +1626,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + 
"automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "AWS SNS - Notification to DLQ", + "description": "This alert fires when an SNS topic messages are moved to a dead-letter queue.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1486,45 +1648,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsRedrivenToDlq statistic=sum | sum by account, region, namespace, topicname " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 0, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, - "thresholdType": "LessThan", - "field": null, + "threshold": 0, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon 
Elasticache - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS DynamoDB - High Account Provisioned Read Capacity", + "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1534,7 +1695,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -1543,7 +1704,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1555,7 +1716,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 80, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1563,16 +1724,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - 
High Account Provisioned Read Capacity", - "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "Amazon RDS - High CPU Utilization", + "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1582,7 +1746,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -1591,7 +1755,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1603,7 +1767,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1611,26 +1775,29 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Access from highly malicious sources", - "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS API Gateway - High Authorizer Errors", + 
"description": "This alert fires where there are too many API requests (>5%) with authorizer errors within 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", - "evaluationDelay": "0m", + "evaluationDelay": "1m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" + "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId 
authorizerError\n| json \"status\", \"authorizerError\", \"apiid\", \"stage\" as status, authorizerError, apiid, stage \n| if (!(authorizerError matches \"-\") and !(status matches \"2*\"), 1, 0) as is_authorizerError\n| sum(is_authorizerError) as is_authorizerError_count, count as totalRequests by apiid, stage\n| (is_authorizerError_count*100/totalRequests) as authorizerError_percent\n| fields authorizerError_percent, apiid, stage\n" } ], "triggers": [ @@ -1639,30 +1806,33 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "GreaterThan", - "field": null + "field": "authorizerError_percent" }, { "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, + "resolutionWindow": "-5m", "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "LessThanOrEqual", - "field": null + "field": "authorizerError_percent" } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS EC2 CW - High CPU Utilization", + "description": "This alert fires when the average CPU Utilization based on cloud watch metrics, within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1672,15 +1842,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount 
Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "account=* region=* namespace=aws/ec2 metric=CPUUtilization instanceid=* statistic=average | avg by account, region, namespace, instanceid" } ], "triggers": [ @@ -1689,8 +1851,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", + "threshold": 85, + "thresholdType": "GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -1699,22 +1861,25 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", + "threshold": 85, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", + "name": "Amazon RDS PostgreSQL - High Errors", + "description": "This alert fires when we detect high rate (>10) of error/fatal logs in Postgres logs over a 5 minutes time period", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -1724,7 +1889,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, 
ssl_protocol\n| parse regex \"(?<ClientIp>\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* (\"ERROR\" OR \"FATAL\")\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,threadid,user,database,processid,severity,msg \n| where severity IN (\"ERROR\", \"FATAL\") " } ], "triggers": [ @@ -1733,7 +1898,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 10, "thresholdType": "GreaterThan", "field": null }, @@ -1742,23 +1907,26 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 10, "thresholdType": "LessThanOrEqual", "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS PostgreSQL - High Authentication Failure", +
"description": "This alert fires when we detect more than 10 authentication failures in Postgres logs over a 5 minute time-period", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1766,45 +1934,42 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* \"authentication failed\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| where msg matches \"*authentication failed*\"" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 10, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 10, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires
when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", + "name": "Amazon RDS - Low Free Storage", + "description": "This alert fires when the average free storage space of a RDS instance is low (< 512MB) for an interval of 15 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1814,45 +1979,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/rds metric=FreeStorageSpace statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "timeRange": "-15m", + "threshold": 512, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, + "timeRange": "-15m", + "threshold": 512, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": "This alert fires when the average disk queue depth for a 
database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "Amazon RDS - Low Freeable Memory", + "description": "This alert fires when the average Freeable memory of an RDS instance is < 128 MB for an interval of 15 minutes. If this value is lower you may need to scale up to a larger instance class.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1862,93 +2026,89 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/rds metric=FreeableMemory statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "timeRange": "-15m", + "threshold": 128, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, + "timeRange": "-15m", + "threshold": 128, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, 
{ - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "Amazon RDS MSSQL - Database observing authentication failures from multiple client IPs", + "description": "This alert fires when we detect more than or equal to 10 client IPs attempting authentication failures on the database over a 15-minute period.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(client_ip) as unique_client_ip by dbidentifier\n| 10 as threshold\n| where unique_client_ip >= threshold\n| sort by unique_client_ip\n| fields - threshold" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": "-15m", + "threshold": 1, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "resolutionWindow": "-15m", + "timeRange": "-15m", + "threshold": 1, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect the high integration latency for the API requests in a stage within 5 minutes. 
This alarm is recommended for WebSocket APIs by AWS, and optional for other APIs because they already have separate alarm recommendations for the Latency metric.\nYou can correlate the IntegrationLatency metric value with the corresponding latency metric of your backend such as the Duration metric for Lambda integrations. This helps you determine whether the API backend is taking more time to process requests from clients due to performance issues or if there is some other overhead from initialization or cold start.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1958,45 +2118,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* Namespace=aws/apigateway metric=IntegrationLatency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 2000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "minDataPoints": 5 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 2000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "minDataPoints": 5 } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + 
"tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2006,7 +2165,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "B", @@ -2043,64 +2202,66 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache 
\"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, + "timeRange": "-5m", + "threshold": 3000, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", 
"resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, + "timeRange": "-5m", + "threshold": 3000, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2110,7 +2271,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -2119,7 +2280,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -2131,7 +2292,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, 
"thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -2139,16 +2300,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "AWS API Gateway - High Server-Side Errors", + "description": "This alert fires where there are too many API requests (>5%) with server-side errors within 5 minutes.\nThis can be caused by 5xx errors from your integration, permission issues, or other factors preventing successful invocation of the integration, such as the integration being throttled or deleted.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2158,7 +2322,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "Namespace=aws/apigateway (metric=5XX or metric=5xxError or metric=ExecutionError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" } ], "triggers": [ @@ -2167,32 +2331,35 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 0.05, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 2 + "minDataPoints": 5 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 0.05, "thresholdType": "LessThan", "occurrenceType": 
"Always", - "minDataPoints": 2 + "minDataPoints": 5 } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2202,7 +2369,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ @@ -2231,64 +2398,64 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Engine CPU Utilization", - "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). 
For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", + "name": "AWS Lambda - High Memory Utilization", + "description": "This alert fires when we detect a Lambda execution with memory usage of more than 85% within an interval of 10 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "account=* region=* Namespace=aws/lambda Memory Size Used\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| _sourceName as logStream | _sourceHost as logGroup\n| parse regex field=message \"REPORT\\s+RequestId:\\s+(?<RequestId>[^\\s]+)\\s+Duration:\\s+(?<Duration>[^\\s]+)\\s+ms\\s+Billed Duration:\\s+(?<BilledDuration>[^\\s]+)\\s+ms\\s+Memory\\s+Size:\\s+(?<MemorySize>[^\\s]+)\\s+MB\\s+Max\\s+Memory\\s+Used:\\s+(?<MaxMemoryUsed>[^\\s]+)\\s+MB\" \n| parse field=loggroup \"/aws/lambda/*\" as functionname\n| avg(MemorySize) as MemorySizeAvg, avg(MaxMemoryUsed) as MaxMemoryUsedAvg by functionname\n| (MaxMemoryUsedAvg/MemorySizeAvg)*100 as memoryUtilization\n| where memoryUtilization>85" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 90, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", -
"resolutionWindow": null, - "timeRange": "-5m", - "threshold": 90, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "resolutionWindow": "-10m", + "timeRange": "-10m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS API Gateway - Low Traffic API", + "description": "This alert fires where there is low message traffic volume for the API within 5 minutes. \nThis can indicate an issue with the application calling the API such as using incorrect endpoints. It could also indicate an issue with the configuration or permissions of the API making it unreachable for clients. 
This alarm is not recommended for APIs that don't expect constant and consistent traffic.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2298,101 +2465,89 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/apigateway (metric=ConnectCount OR metric=Count) statistic=SampleCount account=* region=* apiname=* stage=* !(route=*) !(resource=*) | quantize using sum | sum by apiname, namespace, region, account, stage" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "timeRange": "-10m", + "threshold": 1, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "minDataPoints": 10 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", - "field": null, + "timeRange": "-10m", + "threshold": 1, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "minDataPoints": 10 } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": false, 
"playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS API Gateway - High WAF Errors", + "description": "This alert fires where there are too many API requests (>5%) with WAF errors within 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", - "evaluationDelay": "0m", + "evaluationDelay": "1m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?<ClientIp>\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "account=* region=* namespace=aws/apigateway apiname=* apiid stage domainname requestId \n| json \"status\", \"apiid\", \"stage\", \"wafResponseCode\" as status,
apiid, stage, wafResponseCode\n| if (wafResponseCode==\"WAF_BLOCK\" and !(status matches \"2*\"), 1, 0) as is_wafError\n| sum(is_wafError) as is_wafError_count, count as totalRequests by apiid, stage\n| (is_wafError_count*100/totalRequests) as wafError_percent\n| fields wafError_percent, apiid, stage" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "field": "wafError_percent" }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": null, + "resolutionWindow": "-5m", "timeRange": "-5m", - "threshold": 0, + "threshold": 5, "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "field": "wafError_percent" } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2402,7 +2557,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb 
metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ @@ -2411,7 +2566,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -2423,7 +2578,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -2431,16 +2586,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 CW - Status Check Failed", - "description": "This alert fires when there is a status check failures within a 5 minute interval for an EC2 instance.", + "name": "AWS DynamoDB - System Errors", + "description": "This alert fires when we detect system errors for a dynamodb table is high (>10) for a time interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2450,7 +2608,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/ec2 instanceid=* metric=StatusCheckFailed statistic=maximum | filter latest=1 | count by account, region, namespace,instanceid " + "query": "account=* region=* namespace=aws/dynamodb metric=SystemErrors statistic=samplecount | sum " } ], "triggers": [ @@ -2459,7 +2617,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 10, "thresholdType": 
"GreaterThan", "occurrenceType": "Always", "minDataPoints": 2 @@ -2469,12 +2627,13 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, + "threshold": 10, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -2485,8 +2644,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Notification to DLQ", - "description": "This alert fires when an SNS topic messages are moved to a dead-letter queue.", + "name": "AWS Classic Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2496,7 +2655,15 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsRedrivenToDlq statistic=sum | sum by account, region, namespace, topicname " + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ @@ -2505,8 +2672,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -2515,12 +2682,13 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "threshold": 5, + 
"thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -2531,42 +2699,47 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Notification to DLQ Failure", - "description": "This alert fires when an SNS topic messages that couldn't be moved to a dead-letter queue.", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns topicname=* metric=NumberOfNotificationsFailedToRedriveToDlq statistic=sum | sum by account, region, namespace, topicname " + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "threshold": 85, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -2613,6 +2786,7 @@ 
"minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -2623,8 +2797,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Write Throttle", - "description": "This alert fires when we detect that the total write throttle events for a dynamodb table is high (>5) for a time interval of 5 minutes.", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2634,31 +2808,44 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb tablename=* metric=WriteThrottleEvents statistic=sum | sum by account, region, namespace, tablename" + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along functionname, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, - "thresholdType": "GreaterThan", + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, - "thresholdType": "LessThanOrEqual", + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", 
"minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -2669,8 +2856,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - System Errors", - "description": "This alert fires when we detect system errors for a dynamodb table is high (>10) for a time interval of 5 minutes.", + "name": "AWS SQS - Queue has stopped receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2680,7 +2867,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb metric=SystemErrors statistic=samplecount | sum " + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ @@ -2688,26 +2875,27 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 2 + "minDataPoints": 3 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 2 + "minDataPoints": 3 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2715,40 +2903,43 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Memory Utilization", - 
"description": "This alert fires when we detect a Lambda execution with memory usage of more than 85% within an interval of 10 minutes.", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* Namespace=aws/lambda Memory Size Used\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| _sourceName as logStream | _sourceHost as logGroup\n| parse regex field=message \"REPORT\\s+RequestId:\\s+(?[^\\s]+)\\s+Duration:\\s+(?[^\\s]+)\\s+ms\\s+Billed Duration:\\s+(?[^\\s]+)\\s+ms\\s+Memory\\s+Size:\\s+(?[^\\s]+)\\s+MB\\s+Max\\s+Memory\\s+Used:\\s+(?[^\\s]+)\\s+MB\" \n| parse field=loggroup \"/aws/lambda/*\" as functionname\n| avg(MemorySize) as MemorySizeAvg, avg(MaxMemoryUsed) as MaxMemoryUsedAvg by functionname\n| (MaxMemoryUsedAvg/MemorySizeAvg)*100 as memoryUtilization\n| where memoryUtilization>85" + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 0, + "timeRange": "-5m", + "threshold": 5, "thresholdType": "GreaterThan", - "field": null + "occurrenceType": "Always", + "minDataPoints": 3 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "resolutionWindow": 
"-10m", - "timeRange": "-10m", - "threshold": 0, + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, "thresholdType": "LessThanOrEqual", - "field": null + "occurrenceType": "Always", + "minDataPoints": 3 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -2759,8 +2950,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Throttling", - "description": "This alert fires when we detect a Lambda running into throttling within an interval of 10 minutes.", + "name": "AWS EC2 CW - Status Check Failed", + "description": "This alert fires when there is a status check failures within a 5 minute interval for an EC2 instance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2770,7 +2961,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Throttles statistic=average account=* region=* functionname=* Resource=* | avg by account, region,namespace, functionname " + "query": "account=* region=* namespace=aws/ec2 instanceid=* metric=StatusCheckFailed statistic=maximum | filter latest=1 | count by account, region, namespace,instanceid " } ], "triggers": [ @@ -2778,7 +2969,7 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", + "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", @@ -2788,13 +2979,14 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-10m", + "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -2805,18 +2997,26 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS MySQL - Excessive Slow Query Detected", - "description": "This alert fires when we detect the average time to execute a 
query is more than 5 seconds over last 10 minutes.", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*SlowQuery \"User@Host\" \"Query_time\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse regex field=message \"(?# User@Host:[\\S\\s]+?SET timestamp=\\d+;[\\S\\s]+?;)\" multi\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*\\[(?\\S*?)\\]\\s*Id:\\s*(?\\d*)\" nodrop\n| parse regex field=query_block \"# User@Host:\\s*\\S+?\\[(?\\S*?)\\]\\s*@\\s*(?\\S+)\\s\\[(?\\S*?)\\]\\s+Id:\\s*(?\\d+)\"\n| parse regex field=query_block \"# Query_time:\\s+(?[\\d.]*)\\s+Lock_time:\\s+(?[\\d.]*)\\s+Rows_sent:\\s+(?[\\d]*)\\s+Rows_examined:\\s+(?[\\d]*)\" nodrop\n| parse regex field=query_block \"SET timestamp=(?\\d*);\\n(?[\\s\\S]*);\" nodrop\n| parse regex field=sql_cmd \"[^a-zA-Z]*(?[a-zA-Z]+)\\s*\"\n| fields -query_block\n| num (query_time)\n| count as frequency, sum(query_time) as total_time, min(query_time) as min_time, max(query_time) as max_time, avg(query_time) as avg_time, avg(rows_examined) as avg_rows_examined, avg(rows_sent) as avg_rows_sent, avg(Lock_Time) as avg_lock_time group by sql_cmd, dbidentifier\n| 5 as threshold // customize if need different value. 
As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where avg_time > threshold\n| sort by avg_time, frequency asc" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ @@ -2824,30 +3024,31 @@ "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, + "timeRange": "-5m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-10m", - "threshold": 1, + "timeRange": "-5m", + "threshold": 10, "thresholdType": "LessThan", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2855,43 +3056,50 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS MySQL - High Authentication Failure", - "description": "This alert fires when we detect more then 10 authentication failure over a 5 minute time-period", + "name": "Amazon Elasticache - High Engine CPU Utilization", + "description": "This alert fires when the 
average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error \"Access denied for user\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \" [*] \" as LogLevel\n| parse field=message \" * [Note] Access denied for user '*'@'*' (using *: *)\" as requestid, user, host, authenticationType, flag nodrop\n| parse field=message \"[Warning] Access denied for user '*'@'*' (using *: *)\" as user, host, authenticationType, flag nodrop" + "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", - "field": null + "threshold": 90, + "thresholdType": "GreaterThanOrEqual", + "field": null, + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 90, + "thresholdType": "LessThan", + "field": null, + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + 
"timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2899,18 +3107,18 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - Excessive Slow Query Detected", - "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over a 10 minutes.", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql\n| json \"message\" nodrop \n| if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| parse regex field=msg \"duration: (?[\\S]+) ms (?.+)\"\n| 5000 as threshold // customize if need different value. 
As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where execution_time_ms > threshold \n| count by dbidentifier, database" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -2919,29 +3127,30 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "threshold": 50, + "thresholdType": "LessThanOrEqual", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "threshold": 50, + "thresholdType": "GreaterThan", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2949,43 +3158,46 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - High Authentication Failure", - "description": "This alert fires when we detect more than 10 authentication failure in Postgres logs over a 5 minute time-period", + "name": "AWS API Gateway - High Client-Side Errors", + "description": "This alert fires where there are too many API requests (>5%) with client-side errors within 5 minutes. \nThis can indicate an issue in the authorisation or client request parameters. It could also mean that a resource was removed or a client is requesting one that doesn't exist. 
Errors could also be caused by exceeding the configured throttling limit.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* \"authentication failed\"\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| where msg matches \"*authentication failed*\"" + "query": "Namespace=aws/apigateway (metric=4XX or metric=4xxError or metric=ClientError) Statistic=Average account=* region=* apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", - "field": null + "threshold": 0.05, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 5 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 0.05, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 5 } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2993,40 +3205,43 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - High Errors", - "description": "This alert fires when we detect high 
number (>10) of error/fatal logs in Postgres logs over a 5 minutes time period", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds _sourceHost=/aws/rds/*postgresql dbidentifier=* (\"ERROR\" OR \"FATAL\")\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,threadid,user,database,processid,severity,msg \n| where severity IN (\"ERROR\", \"FATAL\") " + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThan", - "field": null + "threshold": 20, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 3 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 20, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 3 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -3037,40 +3252,43 @@ 
"automatedPlaybookIds": [] }, { - "name": "Amazon RDS PostgreSQL - Statement Timeouts", - "description": "This alert fires when we detect Postgres logs show statement timeouts", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect the high Latency in a stage within 5 minutes for REST and HTTP API.\nFind the IntegrationLatency metric value to check the API backend latency. If the two metrics are mostly aligned, the API backend is the source of higher latency and you should investigate there for issues. View this metric per resource and method and narrow down the source of the latency.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql \"statement timeout\" | json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message | parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg | count by dbidentifier, database" + "query": "account=* region=* Namespace=aws/apigateway metric=Latency statistic=p90 apiname=* stage=* !(route=*) !(resource=*) | avg by apiname, namespace, region, account, stage" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 2500, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 5 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": 
"LessThanOrEqual", - "field": null + "threshold": 2500, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 5 } ], + "timeZone": "Asia/Kolkata", "notifications": [], "isDisabled": true, "groupNotifications": true, @@ -3081,8 +3299,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Free Storage", - "description": "This alert fires when the average free storage space of a RDS instance is low (< 512MB) for an interval of 15 minutes.", + "name": "Amazon ECS - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3092,31 +3310,36 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds metric=FreeStorageSpace statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 512, - "thresholdType": "LessThan", + "timeRange": "-5m", + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 512, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 85, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + 
"timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, @@ -3127,45 +3350,50 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Freeable Memory", - "description": "This alert fires when the average Freeable memory of an RDS instance is < 128 MB for an interval of 15 minutes. If this value is lower you may need to scale up to a larger instance class.", + "name": "Amazon RDS PostgreSQL - Excessive Slow Query Detected", + "description": "This alert fires when we detect the average time to execute a query is more than 5 seconds over a 10 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/rds metric=FreeableMemory statistic=average | eval _value/(1024*1024) | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*postgresql\n| json \"message\" nodrop \n| if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* * *:*(*):*@*:[*]:*:*\" as date,time,time_zone,host,thread_id,user,database,processid,severity,msg \n| parse regex field=msg \"duration: (?[\\S]+) ms (?.+)\"\n| 5000 as threshold // customize if need different value. 
As an example, query taking more than 5 Seconds is considered as Excessive Slow.\n| where execution_time_ms > threshold \n| count by dbidentifier, database" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 128, - "thresholdType": "LessThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 2 + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 128, - "thresholdType": "GreaterThan", - "occurrenceType": "Always", - "minDataPoints": 2 + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -3173,8 +3401,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 CW - High CPU Utilization", - "description": "This alert fires when the average CPU Utilization based on cloud watch metrics, within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. 
The higher this value, the higher the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -3184,31 +3412,36 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/ec2 metric=CPUUtilization instanceid=* statistic=average | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThan", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThanOrEqual", + "threshold": 5, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, From a188a8781d107df556912acc54bbe45d71f9a4bc Mon Sep 17 00:00:00 2001 From: Akhil Dangore Date: Thu, 27 Jun 2024 15:35:05 +0530 Subject: [PATCH 3/5] Added new monitors for mssql --- .../app-modules/rds/app.tf | 68 ++++++++++++++ aws-observability/json/Alerts-App.json | 90 +++++++++---------- 2 files changed, 113 insertions(+), 45 deletions(-) diff --git a/aws-observability-terraform/app-modules/rds/app.tf b/aws-observability-terraform/app-modules/rds/app.tf index 1fcb8f5d..15e2d1bb 100644 --- 
a/aws-observability-terraform/app-modules/rds/app.tf +++ b/aws-observability-terraform/app-modules/rds/app.tf @@ -519,6 +519,74 @@ module "rds_module" { group_notifications = var.group_notifications connection_notifications = var.connection_notifications email_notifications = var.email_notifications + }, + "RdsMSSQLHighAuthFailureByClientIPsOnDB" = { + monitor_name = "Amazon RDS MSSQL - Database observing authentication failures from multiple client IPs" + monitor_description = "This alert fires when we detect more than or equal to 10 client IPs attempting authentication failures on the database over a 15-minute period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(client_ip) as unique_client_ip by dbidentifier\n| 10 as threshold\n| where unique_client_ip >= threshold\n| sort by unique_client_ip\n| fields - threshold" + } + triggers = [ + { + detection_method = "LogsStaticCondition", + time_range = "-15m", + trigger_type = "Critical", + threshold = 1, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "LogsStaticCondition", + time_range = "-15m", + trigger_type = "ResolvedCritical", + threshold = 1, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ] + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications + }, + "RdsMSSQLHighAuthFailureByClientIPOnDBs" = { + monitor_name = "Amazon RDS MSSQL - Authentication failures from the same client IP on multiple databases" + monitor_description = "This alert fires when we detect specific client IP attempting authentication failures on more than or equal to 10 databases over a 15 minute time-period." + monitor_monitor_type = "Logs" + monitor_parent_id = var.monitor_folder_id + monitor_is_disabled = var.monitors_disabled + monitor_evaluation_delay = "0m" + queries = { + A = "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(dbidentifier) as unique_db by client_ip\n| 10 as threshold\n| where unique_db >= threshold\n| sort by unique_db, client_ip asc\n| fields - threshold" + } + triggers = [ + { + detection_method = "LogsStaticCondition", + time_range = "-15m", + trigger_type = "Critical", + threshold = 1, + threshold_type = "GreaterThanOrEqual", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + }, + { + detection_method = "LogsStaticCondition", + time_range = "-5m", + trigger_type = "ResolvedCritical", + threshold = 1, + threshold_type = "LessThan", + occurrence_type = "ResultCount", + trigger_source = "AllResults" + } + ] + group_notifications = var.group_notifications + connection_notifications = var.connection_notifications + email_notifications = var.email_notifications } } } \ No newline at end of file diff --git a/aws-observability/json/Alerts-App.json b/aws-observability/json/Alerts-App.json index 9949c477..eac10853 100644 --- a/aws-observability/json/Alerts-App.json +++ b/aws-observability/json/Alerts-App.json @@ -1275,51 +1275,6 @@ "tags": null, "automatedPlaybookIds": [] }, - { - "name": "Amazon RDS MSSQL - Authentication failures from the same client IP on multiple databases", - "description": "This alert fires when we detect specific client IP attempting authentication failures on more than or equal to 10 databases over a 15 minute time-period.", - "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", - "alertName": null, - "runAs": null, - "notificationGroupFields": [], - "queries": [ - { - "rowId": "A", - "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(dbidentifier) as unique_db by client_ip\n| 10 as threshold\n| where unique_db >= threshold\n| sort by unique_db, client_ip asc\n| fields - threshold" - } - ], - "triggers": [ - { - "detectionMethod": "LogsStaticCondition", - "triggerType": "Critical", - "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 1, - "thresholdType": "GreaterThan", - "field": null - }, - { - "detectionMethod": "LogsStaticCondition", - "triggerType": "ResolvedCritical", - "resolutionWindow": "-15m", - "timeRange": "-15m", - "threshold": 1, - "thresholdType": "LessThanOrEqual", - "field": null - } - ], - "timeZone": "Asia/Kolkata", - "notifications": [], - "isDisabled": true, - "groupNotifications": true, - "playbook": "", - "sloId": null, - "monitorTemplateId": null, - "tags": null, - "automatedPlaybookIds": [] - }, { "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", @@ -2500,6 +2455,51 @@ "tags": null, "automatedPlaybookIds": [] }, + { + "name": "Amazon RDS MSSQL - Authentication failures from the same client IP on multiple databases", + "description": "This alert fires when we detect specific client IP attempting authentication failures on more than or equal to 10 databases over a 15 minute time-period.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Logs", + "evaluationDelay": "0m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "account=* region=* namespace=aws/rds dbidentifier=* _sourceHost=/aws/rds/*Error Logon Login failed for user\n| json \"message\" nodrop | if (_raw matches \"{*\", message, _raw) as message\n| parse field=message \"* Logon Login failed for user '*'. 
Reason: * [CLIENT: *]\" as time, user, reason, client_ip\n| count_distinct(dbidentifier) as unique_db by client_ip\n| 10 as threshold\n| where unique_db >= threshold\n| sort by unique_db, client_ip asc\n| fields - threshold" + } + ], + "triggers": [ + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": "-15m", + "timeRange": "-15m", + "threshold": 1, + "thresholdType": "LessThan", + "field": null + } + ], + "timeZone": "Asia/Kolkata", + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, { "name": "AWS API Gateway - High WAF Errors", "description": "This alert fires where there are too many API requests (>5%) with WAF errors within 5 minutes.", From 5d62ccc350cd721bf5081a104d575f6331e7af9d Mon Sep 17 00:00:00 2001 From: Akhil Dangore Date: Thu, 27 Jun 2024 15:39:00 +0530 Subject: [PATCH 4/5] Added new permission --- .../permissionchecker/AWSObservabilityCFTemplatePermissions.json | 1 + 1 file changed, 1 insertion(+) diff --git a/aws-observability/apps/permissionchecker/AWSObservabilityCFTemplatePermissions.json b/aws-observability/apps/permissionchecker/AWSObservabilityCFTemplatePermissions.json index 05b500b0..8d4d93aa 100644 --- a/aws-observability/apps/permissionchecker/AWSObservabilityCFTemplatePermissions.json +++ b/aws-observability/apps/permissionchecker/AWSObservabilityCFTemplatePermissions.json @@ -117,6 +117,7 @@ "s3:ListBucket", "s3:PutBucketNotification", "s3:PutBucketPolicy", + "s3:PutBucketPublicAccessBlock", "secretsmanager:DescribeSecret", "secretsmanager:GetRandomPassword", "secretsmanager:GetResourcePolicy", From dbfae0ff56c50bf658a2c505d100631c7b3142f2 Mon 
Sep 17 00:00:00 2001 From: Akhil Dangore Date: Thu, 27 Jun 2024 15:59:08 +0530 Subject: [PATCH 5/5] corrected time_range value --- aws-observability-terraform/app-modules/rds/app.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws-observability-terraform/app-modules/rds/app.tf b/aws-observability-terraform/app-modules/rds/app.tf index 15e2d1bb..41884afb 100644 --- a/aws-observability-terraform/app-modules/rds/app.tf +++ b/aws-observability-terraform/app-modules/rds/app.tf @@ -576,7 +576,7 @@ module "rds_module" { }, { detection_method = "LogsStaticCondition", - time_range = "-5m", + time_range = "-15m", trigger_type = "ResolvedCritical", threshold = 1, threshold_type = "LessThan",