forked from 3scale/3scale-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paththreescale-kube-state-metrics.yaml
87 lines (87 loc) · 5.01 KB
/
threescale-kube-state-metrics.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Alerting rules for 3scale workloads, built on kube-state-metrics and
# container (cAdvisor-style) metrics.
#
# Every rule is limited to one namespace and to pods / replication controllers
# whose name starts with "apicast-", "backend-", "system-" or "zync-".
#
# NOTE(review): "__NAMESPACE__" looks like a placeholder that must be replaced
# with the real namespace before the rules are loaded - confirm against the
# tooling that consumes this file.
# NOTE(review): no apiVersion/kind is present in this document; the layout
# (metadata + spec.groups[].rules[]) matches a Prometheus Operator
# PrometheusRule - confirm whether the consumer adds them.
metadata:
  creationTimestamp: null
  labels:
    app: 3scale-api-management
    prometheus: application-monitoring
    role: alert-rules
  name: threescale-kube-state-metrics
spec:
  groups:
  - name: __NAMESPACE__/threescale-kube-state-metrics.rules
    rules:
    # Container restarts: the per-second restart rate over a 15m window is
    # scaled to "restarts per 5 minutes"; fires when it stays above zero for 5m.
    - alert: ThreescalePodCrashLooping
      annotations:
        message: >-
          Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
          }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_crash_looping.adoc
      expr: >-
        rate(kube_pod_container_status_restarts_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[15m])
        * 60 * 5 > 0
      for: 5m
      labels:
        severity: critical
    # Pods stuck in the Pending or Unknown phase for 5m. The join with
    # kube_pod_owner drops pods owned by a Job.
    - alert: ThreescalePodNotReady
      annotations:
        message: >-
          Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
          state for longer than 5 minutes.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc
      expr: >-
        sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)",
        phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max
        by(namespace, pod, owner_kind) (kube_pod_owner{namespace="__NAMESPACE__",owner_kind!="Job"}))
        > 0
      for: 5m
      labels:
        severity: critical
    # Desired replica count differs from the ready replica count for 5m.
    - alert: ThreescaleReplicationControllerReplicasMismatch
      annotations:
        message: >-
          ReplicationController {{ $labels.namespace }}/{{ $labels.replicationcontroller
          }} has not matched the expected number of replicas for longer than 5 minutes.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/replication_controller_replicas_mismatch.adoc
      expr: >-
        kube_replicationcontroller_spec_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}
        != kube_replicationcontroller_status_ready_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}
      for: 5m
      labels:
        severity: critical
    # A container has reported a waiting reason continuously for 1h.
    - alert: ThreescaleContainerWaiting
      annotations:
        message: >-
          Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container
          }} has been in waiting state for longer than 1 hour.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_waiting.adoc
      expr: >-
        sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"})
        > 0
      for: 1h
      labels:
        severity: warning
    # CPU usage as a percentage of the container's CPU limit; fires above 90%
    # for 15m.
    - alert: ThreescaleContainerCPUHigh
      annotations:
        message: >-
          Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container
          }} has High CPU usage for longer than 15 minutes.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_high.adoc
      expr: >-
        sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"})
        by (namespace, container, pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"})
        by (namespace, container, pod) * 100 > 90
      for: 15m
      labels:
        severity: warning
    # Memory usage as a percentage of the container's memory limit; fires above
    # 90% for 15m. container!="" excludes series without a container label.
    - alert: ThreescaleContainerMemoryHigh
      annotations:
        message: >-
          Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container
          }} has High Memory usage for longer than 15 minutes.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_memory_high.adoc
      expr: >-
        sum(container_memory_usage_bytes{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"})
        by(namespace, container, pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"})
        by(namespace, container, pod) * 100 > 90
      for: 15m
      labels:
        severity: warning
    # Share of CFS scheduling periods in which the container was throttled,
    # measured over 5m; fires when the ratio stays above 25% for 15m.
    # Only the numerator filters container!=""; the division matches on
    # (container, pod, namespace), so unmatched denominator series are dropped.
    - alert: ThreescaleContainerCPUThrottlingHigh
      annotations:
        message: >-
          {{ $value | humanizePercentage }} throttling of CPU in namespace
          {{ $labels.namespace }} for container {{ $labels.container }} in pod {{
          $labels.pod }}.
        sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc
      expr: >-
        sum(increase(container_cpu_cfs_throttled_periods_total{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"
        }[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[5m]))
        by (container, pod, namespace) > ( 25 / 100 )
      for: 15m
      labels:
        severity: warning