diff --git a/src/alert_rules/prometheus/percona-mongodb-exporter.yml b/src/alert_rules/prometheus/percona-mongodb-exporter.yml index fe0568dbf..1caaf5dd8 100644 --- a/src/alert_rules/prometheus/percona-mongodb-exporter.yml +++ b/src/alert_rules/prometheus/percona-mongodb-exporter.yml @@ -21,10 +21,64 @@ groups: description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MongodbReplicationLag - expr: 'mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"} > 10' + expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10' for: 0m labels: severity: critical annotations: summary: MongoDB replication lag (instance {{ $labels.instance }}) description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MongodbReplicationHeadroom + expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0' + for: 0m + labels: + severity: critical + annotations: + summary: MongoDB replication headroom (instance {{ $labels.instance }}) + description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MongodbNumberCursorsOpen + expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000' + for: 2m + labels: + severity: warning + annotations: + summary: MongoDB number cursors open (instance {{ $labels.instance }}) + description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MongodbCursorsTimeouts + expr: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100" + for: 2m + labels: + severity: warning + annotations: + summary: MongoDB cursors timeouts (instance {{ $labels.instance }}) + description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MongodbTooManyConnections + expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80' + for: 2m + labels: + severity: warning + annotations: + summary: MongoDB too many connections (instance {{ $labels.instance }}) + description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: MongoDBNotWritable + expr: "sum(mongodb_mongod_replset_my_state == 1) == 0" + for: 2m + labels: + severity: critical + annotations: + summary: MongoDB is not writable, no node is primary + description: "MongoDB is not writable, no node is primary" + + - alert: MongoDBOneNodeLossAwayFromNotWitable + expr: "sum(mongodb_mongod_replset_my_state) == 1" + for: 2m + labels: + severity: warning + annotations: + summary: If MongoDB loses one more node it will not be writable + description: "If MongoDB loses one more node it will not be writable"