Docs

docker-flow · Aug 8, 2017 · 5c51ebc · 5c51ebc
1 parent fce5584
commit 5c51ebc
Show file tree

Hide file tree

Showing 11 changed files with 216 additions and 18 deletions.
diff --git a/docs/config.md b/docs/config.md
@@ -2,14 +2,7 @@
 
 *Docker Flow Monitor* can be configured through Docker environment variables and/or by creating a new image based on `vfarcic/docker-flow-monitor`.
 
-## Environment Variables
-
-!!! tip
-	The *Docker Flow Monitor* container can be configured through environment variables
-
-The environment variables that can be used to configure *Docker Flow Monitor* are divided into two groups distinguished by variable name prefixes.
-
-### ARG Variables
+## Startup Arguments
 
 Environment variables prefixed with `ARG_` are used instead of Prometheus startup arguments.
 
@@ -34,7 +27,7 @@ ARG_WEB_CONSOLE_LIBRARIES=/usr/share/prometheus/console_libraries
 ARG_WEB_CONSOLE_TEMPLATES=/usr/share/prometheus/consoles
 ```
 
-### GLOBAL Variables
+## Global Configuration
 
 Environment variables prefixed with `GLOBAL_` are used instead of Prometheus global entries in the configuration.
 
@@ -76,6 +69,6 @@ curl `[IP_OF_ONE_OF_SWARM_NODES]:8080/v1/docker-flow-monitor/reconfigure?scrapeP
 
 Please consult [Prometheus Configuration](https://prometheus.io/docs/operating/configuration/) for more information about the available options.
 
-## Secrets
+## Scrapes
 
 Content of Docker secrets prefixed with the name `scrape_` is automatically added to the `scrape_configs` section of the configuration.
diff --git a/docs/usage.md b/docs/usage.md
@@ -48,6 +48,9 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
 |----------------------------------|---------------------------------------------------------------|
 |@node_fs_limit:[PERCENTAGE]       |Whether node file system usage is over specified percentage of the total available file system size.<br>**Requirements:** `node-exporter` metrics<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@node_fs_limit:0.8` would be expanded to `(node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} > 0.8`.|
 |@node_mem_limit:[PERCENTAGE]      |Whether node memory usage is over specified percentage of the total node memory.<br>**Requirements:** `node-exporter` metrics<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@node_mem_limit:0.8` would be expanded to `(sum by (instance) (node_memory_MemTotal) - sum by (instance) (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum by (instance) (node_memory_MemTotal) > 0.8`.|
+|@resp_time_above:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is above the set *percentage*.<br>**Requirements:** histogram with the name `http_server_resp_time` and with response times expressed in seconds.<br>[QUANTILE] must be one of the quantiles defined in the metric.<br>[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@resp_time_above:0.1,5m,0.9999` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.9999`.|
+|@resp_time_below:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is below the set *percentage*.<br>**Requirements:** histogram with the name `http_server_resp_time` and with response times expressed in seconds.<br>[QUANTILE] must be one of the quantiles defined in the metric.<br>[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@resp_time_below:0.025,5m,0.75` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75`.|
+|@resp_time_server_error:[RATE_DURATION],[PERCENTAGE]|Whether error rate over the specified *rate duration* is above the set *percentage*.<br>**Requirements:** histogram with the name `http_server_resp_time` and with label `code` set to the value of the HTTP response code.<br>[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** `@resp_time_server_error:5m,0.001` would be expanded to `sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`.|
 |@service_mem_limit:[PERCENTAGE]   |Whether service memory usage is over specified percentage of the service memory limit.<br>**Requirements:** `cAdvisor` metrics and service memory limit specified as service resource.<br>[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).<br>**Example:** If `serviceName` is set to `my-service`, `@service_mem_limit:0.8` would be expanded to `container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`.|
 
 !!! note

diff --git a/scripts/dm-swarm-13.sh b/scripts/dm-swarm-13.sh
@@ -13,9 +13,18 @@ docker stack deploy \
 docker network create -d overlay monitor
 
 echo "route:
-  group_by: [service]
+  group_by: [service,scale]
   repeat_interval: 1h
   receiver: 'slack'
+  routes:
+  - match:
+      service: 'go-demo_main'
+      scale: 'up'
+    receiver: 'jenkins-go-demo_main-up'
+  - match:
+      service: 'go-demo_main'
+      scale: 'down'
+    receiver: 'jenkins-go-demo_main-down'
 
 receivers:
   - name: 'slack'
@@ -25,6 +34,14 @@ receivers:
         title_link: 'http://$(docker-machine ip swarm-1)/monitor/alerts'
         text: '{{ .CommonAnnotations.summary}}'
         api_url: 'https://hooks.slack.com/services/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'
+  - name: 'jenkins-go-demo_main-up'
+    webhook_configs:
+      - send_resolved: false
+        url: 'http://$(docker-machine ip swarm-1)/jenkins/job/service-scale/buildWithParameters?token=DevOps22&service=go-demo_main&scale=1'
+  - name: 'jenkins-go-demo_main-down'
+    webhook_configs:
+      - send_resolved: false
+        url: 'http://$(docker-machine ip swarm-1)/jenkins/job/service-scale/buildWithParameters?token=DevOps22&service=go-demo_main&scale=-1'
 " | docker secret create alert_manager_config -
 
 DOMAIN=$(docker-machine ip swarm-1) \
@@ -43,4 +60,8 @@ export SLACK_IP=$(ping \
     | awk -F'[()]' '/PING/{print $2}')
 
 docker stack deploy \
-    -c stacks/jenkins-scale.yml jenkins
+    -c stacks/jenkins-scale.yml jenkins
+
+docker stack deploy \
+    -c stacks/go-demo-instrument-alert-short.yml \
+    go-demo
diff --git a/server/server.go b/server/server.go
@@ -258,13 +258,18 @@ var alertIfShortcutData = []alertIfShortcut{
 	}, {
 		expanded:    `sum(rate(http_server_resp_time_bucket{job="[SERVICE_NAME]", le="[VALUE_0]"}[[VALUE_1]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_1]])) < [VALUE_2]`,
 		shortcut:    `@resp_time_above:`,
-		annotations: map[string]string{"summary": "Response time of a service [SERVICE_NAME] is above [VALUE_0]"},
+		annotations: map[string]string{"summary": "Response time of the service [SERVICE_NAME] is above [VALUE_0]"},
 		labels:      map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up"},
 	}, {
 		expanded:    `sum(rate(http_server_resp_time_bucket{job="[SERVICE_NAME]", le="[VALUE_0]"}[[VALUE_1]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_1]])) > [VALUE_2]`,
 		shortcut:    `@resp_time_below:`,
-		annotations: map[string]string{"summary": "Response time of a service [SERVICE_NAME] is below [VALUE_0]"},
+		annotations: map[string]string{"summary": "Response time of the service [SERVICE_NAME] is below [VALUE_0]"},
 		labels:      map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "down"},
+	}, {
+		expanded:    `sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]", code=~"^5..$$"}[[VALUE_0]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_0]])) > [VALUE_1]`,
+		shortcut:    `@resp_time_server_error:`,
+		annotations: map[string]string{"summary": "Error rate of the service [SERVICE_NAME] is above [VALUE_1]"},
+		labels:      map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "type": "errors"},
 	},
 }
 

diff --git a/server/server_test.go b/server/server_test.go
@@ -234,13 +234,18 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_AddsFormattedAlert() {
 		}, {
 			`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.9999`,
 			`@resp_time_above:0.1,5m,0.9999`,
-			map[string]string{"summary": "Response time of a service my-service is above 0.1"},
+			map[string]string{"summary": "Response time of the service my-service is above 0.1"},
 			map[string]string{"receiver": "system", "service": "my-service", "scale": "up"},
 		}, {
 			`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75`,
 			`@resp_time_below:0.025,5m,0.75`,
-			map[string]string{"summary": "Response time of a service my-service is below 0.025"},
+			map[string]string{"summary": "Response time of the service my-service is below 0.025"},
 			map[string]string{"receiver": "system", "service": "my-service", "scale": "down"},
+		}, {
+			`sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`,
+			`@resp_time_server_error:5m,0.001`,
+			map[string]string{"summary": "Error rate of the service my-service is above 0.001"},
+			map[string]string{"receiver": "system", "service": "my-service", "type": "errors"},
 		},
 	}
 	for _, data := range testData {

diff --git a/stacks/go-demo-instrument-alert-2.yml b/stacks/go-demo-instrument-alert-2.yml
@@ -0,0 +1,55 @@
+version: '3'  # go-demo stack with hand-written (non-shortcut) response-time alert expressions
+
+services:
+
+  main:  # application service; the deploy labels below declare its alerts
+    image: vfarcic/go-demo
+    environment:
+      - DB=db
+    networks:
+      - proxy
+      - monitor
+    deploy:
+      replicas: 3
+      update_config:
+        parallelism: 1
+        delay: 10s
+      labels:  # alertName/alertIf/alertFor/alertLabels entries sharing a numeric suffix describe one alert
+        - com.df.notify=true
+        - com.df.distribute=true
+        - com.df.servicePath=/demo
+        - com.df.port=8080
+        - com.df.alertName.1=memlimit
+        - com.df.alertIf.1=@service_mem_limit:0.8
+        - com.df.alertFor.1=5m
+        - com.df.alertName.2=resptimeabove  # fires when fewer than 99% of requests finish within 0.1s
+        - com.df.alertIf.2=sum(rate(http_server_resp_time_bucket{job="go-demo_main", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="go-demo_main"}[5m])) < 0.99
+        - com.df.alertLabels.2=scale=up,service=go-demo_main
+        - com.df.alertName.3=resptimebelow  # scale-down counterpart of alert 2; must query the same job (go-demo_main)
+        - com.df.alertIf.3=sum(rate(http_server_resp_time_bucket{job="go-demo_main", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="go-demo_main"}[5m])) > 0.75
+        - com.df.alertLabels.3=scale=down,service=go-demo_main
+        - com.df.scrapePort=8080
+        - com.df.scaleMin=2
+        - com.df.scaleMax=4
+      resources:
+        reservations:
+          memory: 5M
+        limits:
+          memory: 10M
+
+  db:  # MongoDB backing store, reached by `main` through DB=db
+    image: mongo
+    networks:
+      - proxy
+    deploy:
+      resources:
+        reservations:
+          memory: 40M
+        limits:
+          memory: 80M
+
+networks:  # both networks are external: they must exist before the stack is deployed
+  proxy:
+    external: true
+  monitor:
+    external: true
diff --git a/stacks/go-demo-instrument-alert-error.yml b/stacks/go-demo-instrument-alert-error.yml
@@ -0,0 +1,57 @@
+version: '3'  # go-demo stack that adds a hand-written 5xx error-rate alert
+
+services:
+
+  main:  # application service; the deploy labels below declare its alerts
+    image: vfarcic/go-demo
+    environment:
+      - DB=db
+    networks:
+      - proxy
+      - monitor
+    deploy:
+      replicas: 3
+      update_config:
+        parallelism: 1
+        delay: 10s
+      labels:  # alertName/alertIf/... entries sharing a numeric suffix describe one alert
+        - com.df.notify=true
+        - com.df.distribute=true
+        - com.df.servicePath=/demo
+        - com.df.port=8080
+        - com.df.scaleMin=2
+        - com.df.scaleMax=4
+        - com.df.scrapePort=8080
+        - com.df.alertName.1=memlimit
+        - com.df.alertIf.1=@service_mem_limit:0.8
+        - com.df.alertFor.1=5m
+        - com.df.alertName.2=resptimeabove
+        - com.df.alertIf.2=@resp_time_above:0.1,5m,0.99
+#        - com.df.alertName.3=resptimebelow
+#        - com.df.alertIf.3=@resp_time_below:0.025,5m,0.75
+        - com.df.alertName.3=errorrate  # slot 3 is the error-rate alert; the resptimebelow variant above is commented out
+        - com.df.alertIf.3=sum(rate(http_server_resp_time_count{job="go-demo_main", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="go-demo_main"}[5m])) > 0.001
+        - com.df.alertLabels.3=service=go-demo_main,type=errors  # NOTE(review): `$$` in alertIf.3 is presumably Compose's escape for a literal `$` - confirm
+        - com.df.alertAnnotations.3=summary=Error rate is too high,description=Do something or start panicking
+      resources:
+        reservations:
+          memory: 5M
+        limits:
+          memory: 10M
+
+  db:  # MongoDB backing store, reached by `main` through DB=db
+    image: mongo
+    networks:
+      - proxy
+    deploy:
+      resources:
+        reservations:
+          memory: 40M
+        limits:
+          memory: 80M
+
+networks:  # both networks are external: they must exist before the stack is deployed
+  proxy:
+    external: true
+  monitor:
+    external: true
diff --git a/stacks/go-demo-instrument-alert-short-2.yml b/stacks/go-demo-instrument-alert-short-2.yml
@@ -0,0 +1,55 @@
+version: '3'  # go-demo stack using the @-shortcut form for every alert expression
+
+services:
+
+  main:  # application service; the deploy labels below declare its alerts
+    image: vfarcic/go-demo
+    environment:
+      - DB=db
+    networks:
+      - proxy
+      - monitor
+    deploy:
+      replicas: 3
+      update_config:
+        parallelism: 1
+        delay: 10s
+      labels:  # alertName/alertIf/alertFor entries sharing a numeric suffix describe one alert
+        - com.df.notify=true
+        - com.df.distribute=true
+        - com.df.servicePath=/demo
+        - com.df.port=8080
+        - com.df.scaleMin=2
+        - com.df.scaleMax=4
+        - com.df.scrapePort=8080
+        - com.df.alertName.1=memlimit
+        - com.df.alertIf.1=@service_mem_limit:0.8
+        - com.df.alertFor.1=5m
+        - com.df.alertName.2=resptimeabove
+        - com.df.alertIf.2=@resp_time_above:0.1,5m,0.99
+#        - com.df.alertName.3=resptimebelow
+#        - com.df.alertIf.3=@resp_time_below:0.025,5m,0.75
+        - com.df.alertName.3=errorrate  # slot 3 is the error-rate alert; the resptimebelow variant above is commented out
+        - com.df.alertIf.3=@resp_time_server_error:5m,0.001
+      resources:
+        reservations:
+          memory: 5M
+        limits:
+          memory: 10M
+
+  db:  # MongoDB backing store, reached by `main` through DB=db
+    image: mongo
+    networks:
+      - proxy
+    deploy:
+      resources:
+        reservations:
+          memory: 40M
+        limits:
+          memory: 80M
+
+networks:  # both networks are external: they must exist before the stack is deployed
+  proxy:
+    external: true
+  monitor:
+    external: true
diff --git a/stacks/go-demo-instrument-alert-short.yml b/stacks/go-demo-instrument-alert-short.yml
@@ -27,10 +27,8 @@ services:
         - com.df.alertFor.1=5m
         - com.df.alertName.2=resptimeabove
         - com.df.alertIf.2=@resp_time_above:0.1,5m,0.99
-        - com.df.alertFor.2=1m
         - com.df.alertName.3=resptimebelow
         - com.df.alertIf.3=@resp_time_below:0.025,5m,0.75
-        - com.df.alertFor.3=1m
       resources:
         reservations:
           memory: 5M
@@ -50,4 +48,6 @@ services:
 
 networks:
   proxy:
+    external: true
+  monitor:
     external: true
diff --git a/stacks/go-demo-instrument-alert.yml b/stacks/go-demo-instrument-alert.yml
@@ -47,4 +47,6 @@ services:
 
 networks:
   proxy:
+    external: true
+  monitor:
     external: true
diff --git a/stacks/go-demo-instrument.yml b/stacks/go-demo-instrument.yml
@@ -44,4 +44,6 @@ services:
 
 networks:
   proxy:
+    external: true
+  monitor:
     external: true