From 09407808d06e44b131f9f8b6df77a03d608b32a5 Mon Sep 17 00:00:00 2001 From: andrh Date: Thu, 7 Dec 2017 13:02:07 +0100 Subject: [PATCH] Adds two shortcuts for monitoring replicas. --- docs/usage.md | 2 ++ server/server.go | 10 ++++++++++ server/server_test.go | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 5974513..06618c6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -52,6 +52,8 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/) |@node_mem_limit_total_above:[PERCENTAGE]|Whether memory usage of all the nodes is over the specified percentage of the total memory.
**Requirements:** `node-exporter` metrics
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@node_mem_limit_total_above:0.8` would be expanded to `(sum(node_memory_MemTotal{job="my-service"}) - sum(node_memory_MemFree{job="my-service"} + node_memory_Buffers{job="my-service"} + node_memory_Cached{job="my-service"})) / sum(node_memory_MemTotal{job="my-service"}) > 0.8`.| |@node_mem_limit_total_below:[PERCENTAGE]|Whether memory usage of all the nodes is below the specified percentage of the total memory.
**Requirements:** `node-exporter` metrics
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@node_mem_limit_total_below:0.4` would be expanded to `(sum(node_memory_MemTotal{job="my-service"}) - sum(node_memory_MemFree{job="my-service"} + node_memory_Buffers{job="my-service"} + node_memory_Cached{job="my-service"})) / sum(node_memory_MemTotal{job="my-service"}) < 0.4`.| |@replicas_running|Whether the number of running replicas is as desired.
**Requirements:** `cAdvisor` metrics and a service running in the replicated mode. The alert uses `container_memory_usage_bytes` metric only as a way to count the number of running containers.
**Example:** `@replicas_running` for a service with the number of desired replicas set to `3` would be expanded to `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) != 3`.| +|@replicas_more_than|Whether the number of running replicas is more than desired.
**Requirements:** `cAdvisor` metrics and a service running in the replicated mode. The alert uses `container_memory_usage_bytes` metric only as a way to count the number of running containers.
**Example:** `@replicas_more_than` for a service with the number of desired replicas set to `3` would be expanded to `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) > 3`.| +|@replicas_less_than|Whether the number of running replicas is less than desired.
**Requirements:** `cAdvisor` metrics and a service running in the replicated mode. The alert uses `container_memory_usage_bytes` metric only as a way to count the number of running containers.
**Example:** `@replicas_less_than` for a service with the number of desired replicas set to `3` would be expanded to `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) < 3`.| |@resp_time_above:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is above the set *percentage*.
**Requirements:** histogram with the name `http_server_resp_time` and with response times expessed in seconds.
[QUANTILE] must be one of the quantiles defined in the metric.
[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@resp_time_above:0.1,5m,0.9999` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.9999`.| |@resp_time_below:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is below the set *percentage*.
**Requirements:** histogram with the name `http_server_resp_time` and with response times expessed in seconds.
[QUANTILE] must be one of the quantiles defined in the metric.
[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@resp_time_below:0.025,5m,0.75` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75`.| |@resp_time_server_error:[RATE_DURATION],[PERCENTAGE]|Whether error rate over the specified *rate duration* is below the set *percentage*.
**Requirements:** histogram with the name `http_server_resp_time` and with label `code` set to value of the HTTP response code.
[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@resp_time_server_error:5m,0.001` would be expanded to `sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`.| diff --git a/server/server.go b/server/server.go index 9ea37d6..1aafda4 100644 --- a/server/server.go +++ b/server/server.go @@ -292,6 +292,16 @@ var alertIfShortcutData = []alertIfShortcut{ shortcut: `@replicas_running`, annotations: map[string]string{"summary": "The number of running replicas of the service [SERVICE_NAME] is not [REPLICAS]"}, labels: map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up", "type": "node"}, + }, { + expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="[SERVICE_NAME]"}) < [REPLICAS]`, + shortcut: `@replicas_less_than`, + annotations: map[string]string{"summary": "The number of running replicas of the service [SERVICE_NAME] is less than [REPLICAS]"}, + labels: map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up", "type": "node"}, + }, { + expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="[SERVICE_NAME]"}) > [REPLICAS]`, + shortcut: `@replicas_more_than`, + annotations: map[string]string{"summary": "The number of running replicas of the service [SERVICE_NAME] is more than [REPLICAS]"}, + labels: map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up", "type": "node"}, }, { expanded: `sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]", code=~"^5..$$"}[[VALUE_0]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_0]])) > [VALUE_1]`, shortcut: `@resp_time_server_error:`, diff --git a/server/server_test.go b/server/server_test.go index 7e9d448..cd10a4c 100644 --- a/server/server_test.go +++ b/server/server_test.go @@ -258,6 +258,16 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts() { `@replicas_running`, 
map[string]string{"summary": "The number of running replicas of the service my-service is not 3"}, map[string]string{"receiver": "system", "service": "my-service", "scale": "up", "type": "node"}, + }, { + `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) > 3`, + `@replicas_more_than`, + map[string]string{"summary": "The number of running replicas of the service my-service is more than 3"}, + map[string]string{"receiver": "system", "service": "my-service", "scale": "up", "type": "node"}, + }, { + `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) < 3`, + `@replicas_less_than`, + map[string]string{"summary": "The number of running replicas of the service my-service is less than 3"}, + map[string]string{"receiver": "system", "service": "my-service", "scale": "up", "type": "node"}, }, { `sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`, `@resp_time_server_error:5m,0.001`,