# apicast.yaml (forked from 3scale/3scale-operator)
---
# PrometheusRule resource (monitoring.coreos.com/v1) that carries the alerting
# rules for the APIcast component.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  # Labels attached to this rule object.
  labels:
    app: 3scale-api-management
    prometheus: application-monitoring
    role: alert-rules
    threescale_component: apicast
  name: apicast
spec:
  groups:
    # NOTE(review): `__NAMESPACE__` looks like a placeholder that is replaced
    # with the real namespace before the rule is applied — confirm.
    - name: __NAMESPACE__/apicast.rules
      rules:
# Critical alert: an `up` series whose job matches the APIcast production or
# staging job in this namespace has been 0 for one minute.
- alert: ThreescaleApicastJobDown
  annotations:
    description: 'Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN'
    sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
    summary: 'Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN'
  # Plain selector comparison, so `job` and `namespace` labels remain available
  # to the annotation templates above.
  expr: >-
    up{job=~".*/apicast-production|.*/apicast-staging",namespace="__NAMESPACE__"}
    == 0
  for: 1m
  labels:
    severity: critical
# Warning alert for production APIcast pods: fires when the summed per-second
# rate of total_response_time_seconds_bucket exceeds the summed rate of
# upstream_response_time_seconds_bucket by more than 1 for two minutes.
# NOTE(review): both sides sum the rate over every histogram bucket (all `le`
# values). Prometheus histogram buckets are cumulative, so this difference is
# neither a count of slow requests nor a latency in seconds — confirm the
# intended expression before relying on the ">1 second" wording.
- alert: ThreescaleApicastRequestTime
  annotations:
    description: High number of requests taking more than a second to be processed
    sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_request_time.adoc
    # The expression aggregates with sum() and no `by` clause, so the result
    # has no `instance` label; templating it would always render empty.
    summary: Requests on production APIcast are taking more than one second to be processed
  expr: >-
    sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__',
    pod=~'apicast-production.*'}[1m]))
    - sum(rate(upstream_response_time_seconds_bucket{namespace='__NAMESPACE__',
    pod=~'apicast-production.*'}[1m])) > 1
  for: 2m
  labels:
    severity: warning
# Warning alert: the share of apicast_status increments whose `status` label
# matches 4xx, relative to all apicast_status increments in the namespace,
# has been above 5% for five minutes. The selector has no pod filter, so every
# apicast_status series in the namespace is counted.
- alert: ThreescaleApicastHttp4xxErrorRate
  annotations:
    description: The number of requests with a 4XX status is greater than 5% of total requests.
    sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_http_4xx_error_rate.adoc
    # Both sides of the ratio are sum() without a `by` clause, so no `instance`
    # label survives; the summary therefore does not template it.
    summary: APICast high HTTP 4XX error rate
  expr: >-
    sum(rate(apicast_status{namespace='__NAMESPACE__', status=~"^4.."}[1m]))
    / sum(rate(apicast_status{namespace='__NAMESPACE__'}[1m])) * 100 > 5
  for: 5m
  labels:
    severity: warning
# Warning alert: the 99th percentile computed from the
# total_response_time_seconds histogram (30-minute rate, aggregated across all
# series in the namespace) has been above 5 for five minutes.
- alert: ThreescaleApicastLatencyHigh
  annotations:
    # The aggregation keeps only `le`, which histogram_quantile then consumes,
    # so `$labels` is expected to be empty here; it is kept for debugging.
    description: |-
      APIcast p99 latency is higher than 5 seconds
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
    sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_apicast_latency.adoc
    # No `instance` label exists on the aggregated result, so it is not
    # templated into the summary.
    summary: APICast p99 latency high
  # The stray trailing comma inside the label matcher has been removed.
  expr: >-
    histogram_quantile(0.99,
    sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__'}[30m]))
    by (le)) > 5
  for: 5m
  labels:
    severity: warning
# Warning alert for production APIcast pods: the worker_process series changed
# value at least once within the trailing five minutes, sustained for five
# minutes.
# NOTE(review): after a single change, `changes(...[5m]) > 0` holds for a bit
# less than five minutes, while `for: 5m` needs it to hold for the full five.
# A lone restart may therefore never fire — confirm whether only repeated
# restarts are meant to alert, or widen the window / shorten `for`.
- alert: ThreescaleApicastWorkerRestart
  annotations:
    # Wording aligned with the summary and the metric name: the event is a new
    # worker process, not a thread.
    description: >-
      A new worker process has been started. This could indicate that a worker
      process has died due to the memory limits being exceeded. Please investigate
      the memory pressure on pod (instance {{ $labels.instance }})
    sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_worker_restart.adoc
    summary: A new worker process in Nginx has been started
  # No aggregation here, so the series labels (including `instance`, if the
  # scrape sets it) stay available to the description template.
  expr: >-
    changes(worker_process{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[5m])
    > 0
  for: 5m
  labels:
    severity: warning