From 09407808d06e44b131f9f8b6df77a03d608b32a5 Mon Sep 17 00:00:00 2001
From: andrh <andrha@knowit.no>
Date: Thu, 7 Dec 2017 13:02:07 +0100
Subject: [PATCH] Adds two shortcuts for monitoring replicas.

---
 docs/usage.md         |  2 ++
 server/server.go      | 10 ++++++++++
 server/server_test.go | 10 ++++++++++
 3 files changed, 22 insertions(+)
diff --git a/docs/usage.md b/docs/usage.md
index 5974513..06618c6 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -52,6 +52,8 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
 |@node_mem_limit_total_above:[PERCENTAGE]|Whether memory usage of all the nodes is over the specified percentage of the total memory.<br>**Requirements:** `node-exporter` metrics<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@node_mem_limit_total_above:0.8` would be expanded to `(sum(node_memory_MemTotal{job="my-service"}) - sum(node_memory_MemFree{job="my-service"} + node_memory_Buffers{job="my-service"} + node_memory_Cached{job="my-service"})) / sum(node_memory_MemTotal{job="my-service"}) > 0.8`.|
 |@node_mem_limit_total_below:[PERCENTAGE]|Whether memory usage of all the nodes is below the specified percentage of the total memory.<br>**Requirements:** `node-exporter` metrics<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@node_mem_limit_total_below:0.4` would be expanded to `(sum(node_memory_MemTotal{job="my-service"}) - sum(node_memory_MemFree{job="my-service"} + node_memory_Buffers{job="my-service"} + node_memory_Cached{job="my-service"})) / sum(node_memory_MemTotal{job="my-service"}) < 0.4`.|
 |@replicas_running|Whether the number of running replicas is as desired.<br>**Requirements:** `cAdvisor` metrics and a service running in the replicated mode. The alert uses `container_memory_usage_bytes` metric only as a way to count the number of running containers.<br>**Example:** `@replicas_running` for a service with the number of desired replicas set to `3` would be expanded to `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) != 3`.|
+|@replicas_more_than|Whether the number of running replicas is more than desired.<br>**Requirements:** `cAdvisor` metrics and a service running in the replicated mode. The alert uses `container_memory_usage_bytes` metric only as a way to count the number of running containers.<br>**Example:** `@replicas_more_than` for a service with the number of desired replicas set to `3` would be expanded to `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) > 3`.|
+|@replicas_less_than|Whether the number of running replicas is less than desired.<br>**Requirements:** `cAdvisor` metrics and a service running in the replicated mode. The alert uses `container_memory_usage_bytes` metric only as a way to count the number of running containers.<br>**Example:** `@replicas_less_than` for a service with the number of desired replicas set to `3` would be expanded to `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) < 3`.|
 |@resp_time_above:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is above the set *percentage*.<br>**Requirements:** histogram with the name `http_server_resp_time` and with response times expessed in seconds.<br>[QUANTILE] must be one of the quantiles defined in the metric.<br>[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@resp_time_above:0.1,5m,0.9999` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.9999`.|
 |@resp_time_below:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is below the set *percentage*.<br>**Requirements:** histogram with the name `http_server_resp_time` and with response times expessed in seconds.<br>[QUANTILE] must be one of the quantiles defined in the metric.<br>[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@resp_time_below:0.025,5m,0.75` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75`.|
 |@resp_time_server_error:[RATE_DURATION],[PERCENTAGE]|Whether error rate over the specified *rate duration* is below the set *percentage*.<br>**Requirements:** histogram with the name `http_server_resp_time` and with label `code` set to value of the HTTP response code.<br>[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@resp_time_server_error:5m,0.001` would be expanded to `sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`.|
diff --git a/server/server.go b/server/server.go
index 9ea37d6..1aafda4 100644
--- a/server/server.go
+++ b/server/server.go
@@ -292,6 +292,16 @@ var alertIfShortcutData = []alertIfShortcut{
 		shortcut:    `@replicas_running`,
 		annotations: map[string]string{"summary": "The number of running replicas of the service [SERVICE_NAME] is not [REPLICAS]"},
 		labels:      map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up", "type": "node"},
+	}, {
+		expanded:    `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="[SERVICE_NAME]"}) < [REPLICAS]`,
+		shortcut:    `@replicas_less_than`,
+		annotations: map[string]string{"summary": "The number of running replicas of the service [SERVICE_NAME] is less than [REPLICAS]"},
+		labels:      map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up", "type": "node"},
+	}, {
+		expanded:    `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="[SERVICE_NAME]"}) > [REPLICAS]`,
+		shortcut:    `@replicas_more_than`,
+		annotations: map[string]string{"summary": "The number of running replicas of the service [SERVICE_NAME] is more than [REPLICAS]"},
+		labels:      map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up", "type": "node"},
 	}, {
 		expanded:    `sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]", code=~"^5..$$"}[[VALUE_0]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_0]])) > [VALUE_1]`,
 		shortcut:    `@resp_time_server_error:`,
diff --git a/server/server_test.go b/server/server_test.go
index 7e9d448..cd10a4c 100644
--- a/server/server_test.go
+++ b/server/server_test.go
@@ -258,6 +258,16 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts() {
 			`@replicas_running`,
 			map[string]string{"summary": "The number of running replicas of the service my-service is not 3"},
 			map[string]string{"receiver": "system", "service": "my-service", "scale": "up", "type": "node"},
+		}, {
+			`count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) > 3`,
+			`@replicas_more_than`,
+			map[string]string{"summary": "The number of running replicas of the service my-service is more than 3"},
+			map[string]string{"receiver": "system", "service": "my-service", "scale": "up", "type": "node"},
+		}, {
+			`count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}) < 3`,
+			`@replicas_less_than`,
+			map[string]string{"summary": "The number of running replicas of the service my-service is less than 3"},
+			map[string]string{"receiver": "system", "service": "my-service", "scale": "up", "type": "node"},
 		}, {
 			`sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`,
 			`@resp_time_server_error:5m,0.001`,