From 5c51ebc0a5e006c93de9c82999e82118bd615648 Mon Sep 17 00:00:00 2001 From: Viktor Date: Tue, 8 Aug 2017 17:07:17 +0200 Subject: [PATCH] Docs --- docs/config.md | 13 ++--- docs/usage.md | 3 ++ scripts/dm-swarm-13.sh | 25 ++++++++- server/server.go | 9 +++- server/server_test.go | 9 +++- stacks/go-demo-instrument-alert-2.yml | 55 ++++++++++++++++++++ stacks/go-demo-instrument-alert-error.yml | 57 +++++++++++++++++++++ stacks/go-demo-instrument-alert-short-2.yml | 55 ++++++++++++++++++++ stacks/go-demo-instrument-alert-short.yml | 4 +- stacks/go-demo-instrument-alert.yml | 2 + stacks/go-demo-instrument.yml | 2 + 11 files changed, 216 insertions(+), 18 deletions(-) create mode 100644 stacks/go-demo-instrument-alert-2.yml create mode 100644 stacks/go-demo-instrument-alert-error.yml create mode 100644 stacks/go-demo-instrument-alert-short-2.yml diff --git a/docs/config.md b/docs/config.md index 2aef001..9d6aad4 100644 --- a/docs/config.md +++ b/docs/config.md @@ -2,14 +2,7 @@ *Docker Flow Monitor* can be configured through Docker environment variables and/or by creating a new image based on `vfarcic/docker-flow-monitor`. -## Environment Variables - -!!! tip - The *Docker Flow Monitor* container can be configured through environment variables - -The environment variables that can be used to configure *Docker Flow Monitor* are divided into two groups distinguished by variable name prefixes. - -### ARG Variables +## Startup Arguments Environment variables prefixed will `ARG_` are used instead Prometheus startup arguments. @@ -34,7 +27,7 @@ ARG_WEB_CONSOLE_LIBRARIES=/usr/share/prometheus/console_libraries ARG_WEB_CONSOLE_TEMPLATES=/usr/share/prometheus/consoles ``` -### GLOBAL Variables +## Global Configuration Environment variables prefixed with `GLOBAL_` are used instead Prometheus global entries in the configuration. 
@@ -76,6 +69,6 @@ curl `[IP_OF_ONE_OF_SWARM_NODES]:8080/v1/docker-flow-monitor/reconfigure?scrapeP Please consult [Prometheus Configuration](https://prometheus.io/docs/operating/configuration/) for more information about the available options. -## Secrets +## Scrapes Content of Docker secrets prefixed with the name `scrape_` is automatically added to the `scrape_configs` section of the configuration. \ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md index f93668e..2a6d3d9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -48,6 +48,9 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/) |----------------------------------|---------------------------------------------------------------| |@node_fs_limit:[PERCENTAGE] |Whether node file system usage is over specified percentage of the total available file system size.
**Requirements:** `node-exporter` metrics
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@node_fs_limit:0.8` would be expanded to `(node_filesystem_size{fstype="aufs"} - node_filesystem_free{fstype="aufs"}) / node_filesystem_size{fstype="aufs"} > 0.8`.| |@node_mem_limit:[PERCENTAGE] |Whether node memory usage is over specified percentage of the total node memory.
**Requirements:** `node-exporter` metrics
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@node_mem_limit:0.8` would be expanded to `(sum by (instance) (node_memory_MemTotal) - sum by (instance) (node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum by (instance) (node_memory_MemTotal) > 0.8`.| +|@resp_time_above:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is above the set *percentage*.
**Requirements:** histogram with the name `http_server_resp_time` and with response times expressed in seconds.
[QUANTILE] must be one of the quantiles defined in the metric.
[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@resp_time_above:0.1,5m,0.9999` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.9999`.| +|@resp_time_below:[QUANTILE],[RATE_DURATION],[PERCENTAGE]|Whether response time of a given *quantile* over the specified *rate duration* is below the set *percentage*.
**Requirements:** histogram with the name `http_server_resp_time` and with response times expressed in seconds.
[QUANTILE] must be one of the quantiles defined in the metric.
[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@resp_time_below:0.025,5m,0.75` would be expanded to `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75`.| +|@resp_time_server_error:[RATE_DURATION],[PERCENTAGE]|Whether error rate over the specified *rate duration* is above the set *percentage*.
**Requirements:** histogram with the name `http_server_resp_time` and with label `code` set to value of the HTTP response code.
[RATE_DURATION] can be in any format supported by Prometheus (e.g. `5m`).
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** `@resp_time_server_error:5m,0.001` would be expanded to `sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`.| |@service_mem_limit:[PERCENTAGE] |Whether service memory usage is over specified percentage of the service memory limit.
**Requirements:** `cAdvisor` metrics and service memory limit specified as service resource.
[PERCENTAGE] must be specified as a decimal value (e.g. `0.8` equals `80%`).
**Example:** If `serviceName` is set to `my-service`, `@service_mem_limit:0.8` would be expanded to `container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`.| !!! note diff --git a/scripts/dm-swarm-13.sh b/scripts/dm-swarm-13.sh index 81043b8..3172a2d 100755 --- a/scripts/dm-swarm-13.sh +++ b/scripts/dm-swarm-13.sh @@ -13,9 +13,18 @@ docker stack deploy \ docker network create -d overlay monitor echo "route: - group_by: [service] + group_by: [service,scale] repeat_interval: 1h receiver: 'slack' + routes: + - match: + service: 'go-demo_main' + scale: 'up' + receiver: 'jenkins-go-demo_main-up' + - match: + service: 'go-demo_main' + scale: 'down' + receiver: 'jenkins-go-demo_main-down' receivers: - name: 'slack' @@ -25,6 +34,14 @@ receivers: title_link: 'http://$(docker-machine ip swarm-1)/monitor/alerts' text: '{{ .CommonAnnotations.summary}}' api_url: 'https://hooks.slack.com/services/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' + - name: 'jenkins-go-demo_main-up' + webhook_configs: + - send_resolved: false + url: 'http://$(docker-machine ip swarm-1)/jenkins/job/service-scale/buildWithParameters?token=DevOps22&service=go-demo_main&scale=1' + - name: 'jenkins-go-demo_main-down' + webhook_configs: + - send_resolved: false + url: 'http://$(docker-machine ip swarm-1)/jenkins/job/service-scale/buildWithParameters?token=DevOps22&service=go-demo_main&scale=-1' " | docker secret create alert_manager_config - DOMAIN=$(docker-machine ip swarm-1) \ @@ -43,4 +60,8 @@ export SLACK_IP=$(ping \ | awk -F'[()]' '/PING/{print $2}') docker stack deploy \ - -c stacks/jenkins-scale.yml jenkins \ No newline at end of file + -c stacks/jenkins-scale.yml jenkins + +docker stack deploy \ + -c stacks/go-demo-instrument-alert-short.yml \ + go-demo \ No newline at end of file diff --git a/server/server.go b/server/server.go index d5593b5..2d674fe 100644 --- 
a/server/server.go +++ b/server/server.go @@ -258,13 +258,18 @@ var alertIfShortcutData = []alertIfShortcut{ }, { expanded: `sum(rate(http_server_resp_time_bucket{job="[SERVICE_NAME]", le="[VALUE_0]"}[[VALUE_1]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_1]])) < [VALUE_2]`, shortcut: `@resp_time_above:`, - annotations: map[string]string{"summary": "Response time of a service [SERVICE_NAME] is above [VALUE_0]"}, + annotations: map[string]string{"summary": "Response time of the service [SERVICE_NAME] is above [VALUE_0]"}, labels: map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "up"}, }, { expanded: `sum(rate(http_server_resp_time_bucket{job="[SERVICE_NAME]", le="[VALUE_0]"}[[VALUE_1]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_1]])) > [VALUE_2]`, shortcut: `@resp_time_below:`, - annotations: map[string]string{"summary": "Response time of a service [SERVICE_NAME] is below [VALUE_0]"}, + annotations: map[string]string{"summary": "Response time of the service [SERVICE_NAME] is below [VALUE_0]"}, labels: map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "scale": "down"}, + }, { + expanded: `sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]", code=~"^5..$$"}[[VALUE_0]])) / sum(rate(http_server_resp_time_count{job="[SERVICE_NAME]"}[[VALUE_0]])) > [VALUE_1]`, + shortcut: `@resp_time_server_error:`, + annotations: map[string]string{"summary": "Error rate of the service [SERVICE_NAME] is above [VALUE_1]"}, + labels: map[string]string{"receiver": "system", "service": "[SERVICE_NAME]", "type": "errors"}, }, } diff --git a/server/server_test.go b/server/server_test.go index d4ead73..bf303bc 100644 --- a/server/server_test.go +++ b/server/server_test.go @@ -234,13 +234,18 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_AddsFormattedAlert() { }, { `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / 
sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.9999`, `@resp_time_above:0.1,5m,0.9999`, - map[string]string{"summary": "Response time of a service my-service is above 0.1"}, + map[string]string{"summary": "Response time of the service my-service is above 0.1"}, map[string]string{"receiver": "system", "service": "my-service", "scale": "up"}, }, { `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75`, `@resp_time_below:0.025,5m,0.75`, - map[string]string{"summary": "Response time of a service my-service is below 0.025"}, + map[string]string{"summary": "Response time of the service my-service is below 0.025"}, map[string]string{"receiver": "system", "service": "my-service", "scale": "down"}, + }, { + `sum(rate(http_server_resp_time_count{job="my-service", code=~"^5..$$"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.001`, + `@resp_time_server_error:5m,0.001`, + map[string]string{"summary": "Error rate of the service my-service is above 0.001"}, + map[string]string{"receiver": "system", "service": "my-service", "type": "errors"}, }, } for _, data := range testData { diff --git a/stacks/go-demo-instrument-alert-2.yml b/stacks/go-demo-instrument-alert-2.yml new file mode 100644 index 0000000..4ffbf7b --- /dev/null +++ b/stacks/go-demo-instrument-alert-2.yml @@ -0,0 +1,55 @@ +version: '3' + +services: + + main: + image: vfarcic/go-demo + environment: + - DB=db + networks: + - proxy + - monitor + deploy: + replicas: 3 + update_config: + parallelism: 1 + delay: 10s + labels: + - com.df.notify=true + - com.df.distribute=true + - com.df.servicePath=/demo + - com.df.port=8080 + - com.df.alertName.1=memlimit + - com.df.alertIf.1=@service_mem_limit:0.8 + - com.df.alertFor.1=5m + - com.df.alertName.2=resptimeabove + - com.df.alertIf.2=sum(rate(http_server_resp_time_bucket{job="go-demo_main", le="0.1"}[5m])) / 
sum(rate(http_server_resp_time_count{job="go-demo_main"}[5m])) < 0.99 + - com.df.alertLabels.2=scale=up,service=go-demo_main + - com.df.alertName.3=resptimebelow + - com.df.alertIf.3=sum(rate(http_server_resp_time_bucket{job="go-demo_main", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="go-demo_main"}[5m])) > 0.75 + - com.df.alertLabels.3=scale=down,service=go-demo_main + - com.df.scrapePort=8080 + - com.df.scaleMin=2 + - com.df.scaleMax=4 + resources: +
reservations: + memory: 5M + limits: + memory: 10M + + db: + image: mongo + networks: + - proxy + deploy: + resources: + reservations: + memory: 40M + limits: + memory: 80M + +networks: + proxy: + external: true + monitor: + external: true \ No newline at end of file diff --git a/stacks/go-demo-instrument-alert-short-2.yml b/stacks/go-demo-instrument-alert-short-2.yml new file mode 100644 index 0000000..8f6c14e --- /dev/null +++ b/stacks/go-demo-instrument-alert-short-2.yml @@ -0,0 +1,55 @@ +version: '3' + +services: + + main: + image: vfarcic/go-demo + environment: + - DB=db + networks: + - proxy + - monitor + deploy: + replicas: 3 + update_config: + parallelism: 1 + delay: 10s + labels: + - com.df.notify=true + - com.df.distribute=true + - com.df.servicePath=/demo + - com.df.port=8080 + - com.df.scaleMin=2 + - com.df.scaleMax=4 + - com.df.scrapePort=8080 + - com.df.alertName.1=memlimit + - com.df.alertIf.1=@service_mem_limit:0.8 + - com.df.alertFor.1=5m + - com.df.alertName.2=resptimeabove + - com.df.alertIf.2=@resp_time_above:0.1,5m,0.99 +# - com.df.alertName.3=resptimebelow +# - com.df.alertIf.3=@resp_time_below:0.025,5m,0.75 + - com.df.alertName.3=errorate + - com.df.alertIf.3=@resp_time_server_error:5m,0.001 + resources: + reservations: + memory: 5M + limits: + memory: 10M + + db: + image: mongo + networks: + - proxy + deploy: + resources: + reservations: + memory: 40M + limits: + memory: 80M + +networks: + proxy: + external: true + monitor: + external: true \ No newline at end of file diff --git a/stacks/go-demo-instrument-alert-short.yml b/stacks/go-demo-instrument-alert-short.yml index e0bc54a..a0e1527 100644 --- a/stacks/go-demo-instrument-alert-short.yml +++ b/stacks/go-demo-instrument-alert-short.yml @@ -27,10 +27,8 @@ services: - com.df.alertFor.1=5m - com.df.alertName.2=resptimeabove - com.df.alertIf.2=@resp_time_above:0.1,5m,0.99 - - com.df.alertFor.2=1m - com.df.alertName.3=resptimebelow - com.df.alertIf.3=@resp_time_below:0.025,5m,0.75 - - 
com.df.alertFor.3=1m resources: reservations: memory: 5M @@ -50,4 +48,6 @@ services: networks: proxy: + external: true + monitor: external: true \ No newline at end of file diff --git a/stacks/go-demo-instrument-alert.yml b/stacks/go-demo-instrument-alert.yml index 35a3e76..70fecd5 100644 --- a/stacks/go-demo-instrument-alert.yml +++ b/stacks/go-demo-instrument-alert.yml @@ -47,4 +47,6 @@ services: networks: proxy: + external: true + monitor: external: true \ No newline at end of file diff --git a/stacks/go-demo-instrument.yml b/stacks/go-demo-instrument.yml index 386a3ca..842c5d2 100644 --- a/stacks/go-demo-instrument.yml +++ b/stacks/go-demo-instrument.yml @@ -44,4 +44,6 @@ services: networks: proxy: + external: true + monitor: external: true \ No newline at end of file