Merge pull request #34 from thomasjpfan/issue-28
Moves alertIf shortcuts to a YAML file and adds an alertIf secrets option
vfarcic authored Feb 17, 2018
2 parents f18893a + c91ea02 commit c13d11a
Showing 5 changed files with 279 additions and 65 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -25,6 +25,7 @@ HEALTHCHECK --interval=5s CMD /bin/check.sh

COPY --from=build /src/docker-flow-monitor /bin/docker-flow-monitor
COPY check.sh /bin/check.sh
COPY conf/shortcuts.yaml /etc/dfm/shortcuts.yaml

USER root
RUN chmod +x /bin/check.sh
92 changes: 92 additions & 0 deletions conf/shortcuts.yaml
@@ -0,0 +1,92 @@
"@service_mem_limit":
expanded: container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}
annotations:
summary: Memory of the service {{ .Alert.ServiceName }} is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
"@node_mem_limit":
expanded: (sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum by (instance) (node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}
annotations:
summary: Memory of a node is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
"@node_mem_limit_total_above":
expanded: (sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}
annotations:
summary: Total memory of the nodes is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@node_mem_limit_total_below":
expanded: (sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) < {{ index .Values 0 }}
annotations:
summary: Total memory of the nodes is below {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: "down"
type: node
"@node_fs_limit":
expanded: (node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} - node_filesystem_free{fstype="aufs", job="{{ .Alert.ServiceName }}"}) / node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}
annotations:
summary: Disk usage of a node is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
"@resp_time_above":
expanded: sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) < {{ index .Values 2 }}
annotations:
summary: Response time of the service {{ .Alert.ServiceName }} is above {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: service
"@resp_time_below":
expanded: sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) > {{ index .Values 2 }}
annotations:
summary: Response time of the service {{ .Alert.ServiceName }} is below {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: down
type: service
"@replicas_running":
expanded: count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) != {{ .Alert.Replicas }}
annotations:
summary: The number of running replicas of the service {{ .Alert.ServiceName }} is not {{ .Alert.Replicas }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@replicas_less_than":
expanded: count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) < {{ .Alert.Replicas }}
annotations:
summary: The number of running replicas of the service {{ .Alert.ServiceName }} is less than {{ .Alert.Replicas }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@replicas_more_than":
expanded: count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) > {{ .Alert.Replicas }}
annotations:
summary: The number of running replicas of the service {{ .Alert.ServiceName }} is more than {{ .Alert.Replicas }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@resp_time_server_error":
expanded: sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}", code=~"^5..$$"}[{{ index .Values 0 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 0 }}])) > {{ index .Values 1 }}
annotations:
summary: Error rate of the service {{ .Alert.ServiceName }} is above {{ index .Values 1 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
type: errors
17 changes: 17 additions & 0 deletions docs/usage.md
@@ -64,6 +64,23 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
!!! note
    I hope that the number of shortcuts will grow with time thanks to community contributions. Please create [an issue](https://github.com/vfarcic/docker-flow-monitor/issues) with the `alertIf` statement and the suggested shortcut and I'll add it to the code as soon as possible.

### AlertIf Secrets Configuration

*Docker Flow Monitor* supports [Docker Secrets](https://docs.docker.com/engine/swarm/secrets/) for adding custom alertIf shortcuts. Only secrets whose names start with `alertif-` or `alertif_` are considered. `alertIf` shortcuts are configured as a YAML file containing a mapping of dictionaries. Each top-level key is a custom `alertIf` shortcut, which must begin with the `@` character. The value of each entry consists of three keys: `expanded`, `annotations`, and `labels`. `expanded` contains the expanded alert expression written with Go [templates](https://golang.org/pkg/text/template/). `annotations` and `labels` contain dictionaries with the alert's annotations and labels. For example, `@service_mem_limit` is defined by the following YAML:

```yaml
"@service_mem_limit":
expanded: container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}
annotations:
summary: Memory of the service {{ .Alert.ServiceName }} is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
```

!!! tip
    AlertIf shortcuts defined in secrets will take priority over default shortcuts.
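
For example, a custom shortcuts file can be mounted into the monitor service as a secret in a stack file. The snippet below is only a sketch: the secret name `alertif-custom`, the local file `my-shortcuts.yaml`, and the rest of the service definition are placeholders.

```yaml
version: "3.3"

services:

  monitor:
    image: vfarcic/docker-flow-monitor
    secrets:
      - alertif-custom
    # The rest of the monitor configuration (environment, networks, ports)
    # is omitted from this sketch.

secrets:

  alertif-custom:
    file: ./my-shortcuts.yaml
```

Because the secret name starts with `alertif-`, the file is available inside the container as `/run/secrets/alertif-custom`, and the shortcuts it defines are merged with (and override) the defaults.
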
### AlertIf Logical Operators
The logical operators `and`, `unless`, and `or` can be used in combination with AlertIf Parameter Shortcuts. For example, to create an alert that triggers when response time is low unless response time is high, set `alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`. This alert prevents `@resp_time_below` from triggering while `@resp_time_above` is triggering. The `summary` annotations of the combined shortcuts are merged using the logical operator: "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1". When logical operators are used, there are no default alert labels; they have to be set manually through the `alertLabels` query parameter.
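
Such a compound expression is attached to a service in the same way as any other shortcut. The snippet below is an illustrative sketch using the `com.df.*` label convention; the service name, image, and label values are placeholders.

```yaml
services:

  my-service:
    image: my-service-image
    deploy:
      labels:
        - com.df.alertName=resp_time
        - com.df.alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99
        # No default labels are applied to compound alerts, so set them explicitly.
        - com.df.alertLabels=receiver=system,scale=down
```
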
139 changes: 74 additions & 65 deletions server/server.go
@@ -17,12 +17,18 @@ import (
"../prometheus"
"github.com/gorilla/mux"
"github.com/gorilla/schema"
"github.com/spf13/afero"
yaml "gopkg.in/yaml.v2"
)

// FS defines file system used to read and write configuration files
var FS = afero.NewOsFs()
var decoder = schema.NewDecoder()
var mu = &sync.Mutex{}
var logPrintf = log.Printf
var listenerTimeout = 30 * time.Second
var shortcutsPath = "/etc/dfm/shortcuts.yaml"
var alertIfShortcutData map[string]AlertIfShortcut

type serve struct {
scrapes map[string]prometheus.Scrape
@@ -48,6 +54,7 @@ var New = func() *serve {
if len(promConfig) == 0 {
promConfig = "/etc/prometheus/prometheus.yml"
}
alertIfShortcutData = GetShortcuts()
return &serve{
alerts: make(map[string]prometheus.Alert),
scrapes: make(map[string]prometheus.Scrape),
@@ -254,73 +261,75 @@ func (s *serve) getAlerts(req *http.Request) []prometheus.Alert {
return alerts
}

type alertIfShortcut struct {
expanded string
annotations map[string]string
labels map[string]string
// AlertIfShortcut defines how to expand an alertIf shortcut
type AlertIfShortcut struct {
Expanded string `yaml:"expanded"`
Annotations map[string]string `yaml:"annotations"`
Labels map[string]string `yaml:"labels"`
}

type alertTemplateInput struct {
Alert *prometheus.Alert
Values []string
}

var alertIfShortcutData = map[string]alertIfShortcut{
"@service_mem_limit": {
expanded: `container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Memory of the service {{ .Alert.ServiceName }} is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}"},
},
"@node_mem_limit": {
expanded: `(sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum by (instance) (node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Memory of a node is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}"},
},
"@node_mem_limit_total_above": {
expanded: `(sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Total memory of the nodes is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@node_mem_limit_total_below": {
expanded: `(sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) < {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Total memory of the nodes is below {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "down", "type": "node"},
},
"@node_fs_limit": {
expanded: `(node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} - node_filesystem_free{fstype="aufs", job="{{ .Alert.ServiceName }}"}) / node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Disk usage of a node is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}"},
},
"@resp_time_above": {
expanded: `sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) < {{ index .Values 2 }}`,
annotations: map[string]string{"summary": "Response time of the service {{ .Alert.ServiceName }} is above {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "service"},
},
"@resp_time_below": {
expanded: `sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) > {{ index .Values 2 }}`,
annotations: map[string]string{"summary": "Response time of the service {{ .Alert.ServiceName }} is below {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "down", "type": "service"},
},
"@replicas_running": {
expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) != {{ .Alert.Replicas }}`,
annotations: map[string]string{"summary": "The number of running replicas of the service {{ .Alert.ServiceName }} is not {{ .Alert.Replicas }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@replicas_less_than": {
expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) < {{ .Alert.Replicas }}`,
annotations: map[string]string{"summary": "The number of running replicas of the service {{ .Alert.ServiceName }} is less than {{ .Alert.Replicas }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@replicas_more_than": {
expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) > {{ .Alert.Replicas }}`,
annotations: map[string]string{"summary": "The number of running replicas of the service {{ .Alert.ServiceName }} is more than {{ .Alert.Replicas }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@resp_time_server_error": {
expanded: `sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}", code=~"^5..$$"}[{{ index .Values 0 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 0 }}])) > {{ index .Values 1 }}`,
annotations: map[string]string{"summary": "Error rate of the service {{ .Alert.ServiceName }} is above {{ index .Values 1 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "type": "errors"},
},
// GetShortcuts returns shortcuts from a YAML file
func GetShortcuts() map[string]AlertIfShortcut {
yamlData, err := afero.ReadFile(FS, shortcutsPath)
if err != nil {
logPrintf(err.Error())
return map[string]AlertIfShortcut{}
}
shortcuts := map[string]AlertIfShortcut{}
err = yaml.Unmarshal(yamlData, &shortcuts)

if err != nil {
logPrintf(err.Error())
return map[string]AlertIfShortcut{}
}

if isDir, err := afero.IsDir(FS, "/run/secrets"); err != nil || !isDir {
return shortcuts
}

// Load alertIf shortcuts from secrets
files, err := afero.ReadDir(FS, "/run/secrets")
if err != nil {
logPrintf(err.Error())
return shortcuts
}

for _, file := range files {
if file.IsDir() {
continue
}
lName := strings.ToLower(file.Name())
if !strings.HasPrefix(lName, "alertif-") &&
!strings.HasPrefix(lName, "alertif_") {
continue
}

path := fmt.Sprintf("/run/secrets/%s", file.Name())
yamlData, err = afero.ReadFile(FS, path)
if err != nil {
logPrintf("Unable to read %s, error: %v", path, err)
continue
}

secretShortcuts := map[string]AlertIfShortcut{}
err = yaml.Unmarshal(yamlData, &secretShortcuts)
if err != nil {
logPrintf("YAML decoding reading %s, error: %v", path, err)
continue
}

for k, v := range secretShortcuts {
shortcuts[k] = v
}
}

return shortcuts
}

func (s *serve) formatAlert(alert *prometheus.Alert) {
@@ -353,12 +362,12 @@ func formatSingleAlert(alert *prometheus.Alert) {
return
}

alert.AlertIf = replaceTags(data.expanded, alert, value)
alert.AlertIf = replaceTags(data.Expanded, alert, value)

if alert.AlertAnnotations == nil {
alert.AlertAnnotations = map[string]string{}
}
for k, v := range data.annotations {
for k, v := range data.Annotations {
if _, ok := alert.AlertAnnotations[k]; !ok {
alert.AlertAnnotations[k] = replaceTags(v, alert, value)
}
@@ -367,7 +376,7 @@ func formatSingleAlert(alert *prometheus.Alert) {
if alert.AlertLabels == nil {
alert.AlertLabels = map[string]string{}
}
for k, v := range data.labels {
for k, v := range data.Labels {
if _, ok := alert.AlertLabels[k]; !ok {
alert.AlertLabels[k] = replaceTags(v, alert, value)
}
@@ -403,12 +412,12 @@ func formatCompoundAlert(alert *prometheus.Alert) {
return
}

alertIfFormattedBuffer.WriteString(replaceTags(data.expanded, alert, value))
alertIfFormattedBuffer.WriteString(replaceTags(data.Expanded, alert, value))
if len(bOp) > 0 {
alertIfFormattedBuffer.WriteString(fmt.Sprintf(" %s ", bOp))
}

for k, v := range data.annotations {
for k, v := range data.Annotations {
if _, ok := immutableAnnotations[k]; ok {
continue
}