Merge pull request #34 from thomasjpfan/issue-28
Moves alertIf shortcuts to a YAML file and adds an alertIf secrets option
vfarcic authored Feb 17, 2018
2 parents f18893a + c91ea02 commit c13d11a
Showing 5 changed files with 279 additions and 65 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -25,6 +25,7 @@ HEALTHCHECK --interval=5s CMD /bin/check.sh

COPY --from=build /src/docker-flow-monitor /bin/docker-flow-monitor
COPY check.sh /bin/check.sh
COPY conf/shortcuts.yaml /etc/dfm/shortcuts.yaml

USER root
RUN chmod +x /bin/check.sh
92 changes: 92 additions & 0 deletions conf/shortcuts.yaml
@@ -0,0 +1,92 @@
"@service_mem_limit":
expanded: container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}
annotations:
summary: Memory of the service {{ .Alert.ServiceName }} is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
"@node_mem_limit":
expanded: (sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum by (instance) (node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}
annotations:
summary: Memory of a node is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
"@node_mem_limit_total_above":
expanded: (sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}
annotations:
summary: Total memory of the nodes is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@node_mem_limit_total_below":
expanded: (sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) < {{ index .Values 0 }}
annotations:
summary: Total memory of the nodes is below {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: "down"
type: node
"@node_fs_limit":
expanded: (node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} - node_filesystem_free{fstype="aufs", job="{{ .Alert.ServiceName }}"}) / node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}
annotations:
summary: Disk usage of a node is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
"@resp_time_above":
expanded: sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) < {{ index .Values 2 }}
annotations:
summary: Response time of the service {{ .Alert.ServiceName }} is above {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: service
"@resp_time_below":
expanded: sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) > {{ index .Values 2 }}
annotations:
summary: Response time of the service {{ .Alert.ServiceName }} is below {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: down
type: service
"@replicas_running":
expanded: count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) != {{ .Alert.Replicas }}
annotations:
summary: The number of running replicas of the service {{ .Alert.ServiceName }} is not {{ .Alert.Replicas }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@replicas_less_than":
expanded: count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) < {{ .Alert.Replicas }}
annotations:
summary: The number of running replicas of the service {{ .Alert.ServiceName }} is less than {{ .Alert.Replicas }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@replicas_more_than":
expanded: count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) > {{ .Alert.Replicas }}
annotations:
summary: The number of running replicas of the service {{ .Alert.ServiceName }} is more than {{ .Alert.Replicas }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
scale: up
type: node
"@resp_time_server_error":
expanded: sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}", code=~"^5..$$"}[{{ index .Values 0 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 0 }}])) > {{ index .Values 1 }}
annotations:
summary: Error rate of the service {{ .Alert.ServiceName }} is above {{ index .Values 1 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
type: errors
17 changes: 17 additions & 0 deletions docs/usage.md
@@ -64,6 +64,23 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
!!! note
    I hope that the number of shortcuts will grow with time thanks to community contributions. Please create [an issue](https://github.com/vfarcic/docker-flow-monitor/issues) with the `alertIf` statement and the suggested shortcut and I'll add it to the code as soon as possible.

### AlertIf Secrets Configuration

*Docker Flow Monitor* supports [Docker Secrets](https://docs.docker.com/engine/swarm/secrets/) for adding custom alertIf shortcuts. Only secrets whose names start with `alertif-` or `alertif_` are considered. `alertIf` shortcuts are configured as a YAML file containing a mapping of dictionaries. Each top-level key is a custom `alertIf` shortcut, which must begin with the `@` character. The value of each entry consists of three keys: `expanded`, `annotations`, and `labels`. `expanded` contains the expanded alert expression written with Go [templates](https://golang.org/pkg/text/template/). `annotations` and `labels` contain dictionaries with the alert's annotations and labels. For example, `@service_mem_limit` is defined by the following YAML:

```yaml
"@service_mem_limit":
expanded: container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}
annotations:
summary: Memory of the service {{ .Alert.ServiceName }} is over {{ index .Values 0 }}
labels:
receiver: system
service: "{{ .Alert.ServiceName }}"
```

!!! tip
    AlertIf shortcuts defined in secrets will take priority over default shortcuts.
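
For example, a custom shortcuts file can be mounted into the monitor service as a secret in a stack file. The snippet below is only a sketch: the secret name `alertif-custom`, the local file `my-shortcuts.yaml`, and the rest of the service definition are placeholders.

```yaml
version: "3.3"

services:

  monitor:
    image: vfarcic/docker-flow-monitor
    secrets:
      - alertif-custom
    # The rest of the monitor configuration (environment, networks, ports)
    # is omitted from this sketch.

secrets:

  alertif-custom:
    file: ./my-shortcuts.yaml
```

Because the secret name starts with `alertif-`, the file is available inside the container as `/run/secrets/alertif-custom`, and the shortcuts it defines are merged with (and override) the defaults.
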
### AlertIf Logical Operators
The logical operators `and`, `unless`, and `or` can be used in combination with AlertIf Parameter Shortcuts. For example, to create an alert that triggers when response time is low unless response time is high, set `alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`. This alert prevents `@resp_time_below` from triggering while `@resp_time_above` is triggering. The `summary` annotations of the combined shortcuts are merged using the logical operator: "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1". When logical operators are used, there are no default alert labels; they have to be set manually through the `alertLabels` query parameter.
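
Such a compound expression is attached to a service in the same way as any other shortcut. The snippet below is an illustrative sketch using the `com.df.*` label convention; the service name, image, and label values are placeholders.

```yaml
services:

  my-service:
    image: my-service-image
    deploy:
      labels:
        - com.df.alertName=resp_time
        - com.df.alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99
        # No default labels are applied to compound alerts, so set them explicitly.
        - com.df.alertLabels=receiver=system,scale=down
```
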
139 changes: 74 additions & 65 deletions server/server.go
@@ -17,12 +17,18 @@ import (
"../prometheus"
"github.com/gorilla/mux"
"github.com/gorilla/schema"
"github.com/spf13/afero"
yaml "gopkg.in/yaml.v2"
)

// FS defines file system used to read and write configuration files
var FS = afero.NewOsFs()
var decoder = schema.NewDecoder()
var mu = &sync.Mutex{}
var logPrintf = log.Printf
var listenerTimeout = 30 * time.Second
var shortcutsPath = "/etc/dfm/shortcuts.yaml"
var alertIfShortcutData map[string]AlertIfShortcut

type serve struct {
scrapes map[string]prometheus.Scrape
@@ -48,6 +54,7 @@ var New = func() *serve {
if len(promConfig) == 0 {
promConfig = "/etc/prometheus/prometheus.yml"
}
alertIfShortcutData = GetShortcuts()
return &serve{
alerts: make(map[string]prometheus.Alert),
scrapes: make(map[string]prometheus.Scrape),
@@ -254,73 +261,75 @@ func (s *serve) getAlerts(req *http.Request) []prometheus.Alert {
return alerts
}

type alertIfShortcut struct {
expanded string
annotations map[string]string
labels map[string]string
// AlertIfShortcut defines how to expand an alertIf shortcut
type AlertIfShortcut struct {
Expanded string `yaml:"expanded"`
Annotations map[string]string `yaml:"annotations"`
Labels map[string]string `yaml:"labels"`
}

type alertTemplateInput struct {
Alert *prometheus.Alert
Values []string
}

var alertIfShortcutData = map[string]alertIfShortcut{
"@service_mem_limit": {
expanded: `container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Memory of the service {{ .Alert.ServiceName }} is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}"},
},
"@node_mem_limit": {
expanded: `(sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum by (instance) (node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum by (instance) (node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Memory of a node is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}"},
},
"@node_mem_limit_total_above": {
expanded: `(sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Total memory of the nodes is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@node_mem_limit_total_below": {
expanded: `(sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) - sum(node_memory_MemFree{job="{{ .Alert.ServiceName }}"} + node_memory_Buffers{job="{{ .Alert.ServiceName }}"} + node_memory_Cached{job="{{ .Alert.ServiceName }}"})) / sum(node_memory_MemTotal{job="{{ .Alert.ServiceName }}"}) < {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Total memory of the nodes is below {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "down", "type": "node"},
},
"@node_fs_limit": {
expanded: `(node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} - node_filesystem_free{fstype="aufs", job="{{ .Alert.ServiceName }}"}) / node_filesystem_size{fstype="aufs", job="{{ .Alert.ServiceName }}"} > {{ index .Values 0 }}`,
annotations: map[string]string{"summary": "Disk usage of a node is over {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}"},
},
"@resp_time_above": {
expanded: `sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) < {{ index .Values 2 }}`,
annotations: map[string]string{"summary": "Response time of the service {{ .Alert.ServiceName }} is above {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "service"},
},
"@resp_time_below": {
expanded: `sum(rate(http_server_resp_time_bucket{job="{{ .Alert.ServiceName }}", le="{{ index .Values 0 }}"}[{{ index .Values 1 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 1 }}])) > {{ index .Values 2 }}`,
annotations: map[string]string{"summary": "Response time of the service {{ .Alert.ServiceName }} is below {{ index .Values 0 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "down", "type": "service"},
},
"@replicas_running": {
expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) != {{ .Alert.Replicas }}`,
annotations: map[string]string{"summary": "The number of running replicas of the service {{ .Alert.ServiceName }} is not {{ .Alert.Replicas }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@replicas_less_than": {
expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) < {{ .Alert.Replicas }}`,
annotations: map[string]string{"summary": "The number of running replicas of the service {{ .Alert.ServiceName }} is less than {{ .Alert.Replicas }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@replicas_more_than": {
expanded: `count(container_memory_usage_bytes{container_label_com_docker_swarm_service_name="{{ .Alert.ServiceName }}"}) > {{ .Alert.Replicas }}`,
annotations: map[string]string{"summary": "The number of running replicas of the service {{ .Alert.ServiceName }} is more than {{ .Alert.Replicas }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "scale": "up", "type": "node"},
},
"@resp_time_server_error": {
expanded: `sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}", code=~"^5..$$"}[{{ index .Values 0 }}])) / sum(rate(http_server_resp_time_count{job="{{ .Alert.ServiceName }}"}[{{ index .Values 0 }}])) > {{ index .Values 1 }}`,
annotations: map[string]string{"summary": "Error rate of the service {{ .Alert.ServiceName }} is above {{ index .Values 1 }}"},
labels: map[string]string{"receiver": "system", "service": "{{ .Alert.ServiceName }}", "type": "errors"},
},
// GetShortcuts returns shortcuts from a YAML file
func GetShortcuts() map[string]AlertIfShortcut {
yamlData, err := afero.ReadFile(FS, shortcutsPath)
if err != nil {
logPrintf(err.Error())
return map[string]AlertIfShortcut{}
}
shortcuts := map[string]AlertIfShortcut{}
err = yaml.Unmarshal(yamlData, &shortcuts)

if err != nil {
logPrintf(err.Error())
return map[string]AlertIfShortcut{}
}

if isDir, err := afero.IsDir(FS, "/run/secrets"); err != nil || !isDir {
return shortcuts
}

// Load alertIf shortcuts from secrets
files, err := afero.ReadDir(FS, "/run/secrets")
if err != nil {
logPrintf(err.Error())
return shortcuts
}

for _, file := range files {
if file.IsDir() {
continue
}
lName := strings.ToLower(file.Name())
if !strings.HasPrefix(lName, "alertif-") &&
!strings.HasPrefix(lName, "alertif_") {
continue
}

path := fmt.Sprintf("/run/secrets/%s", file.Name())
yamlData, err = afero.ReadFile(FS, path)
if err != nil {
logPrintf("Unable to read %s, error: %v", path, err)
continue
}

secretShortcuts := map[string]AlertIfShortcut{}
err = yaml.Unmarshal(yamlData, &secretShortcuts)
if err != nil {
logPrintf("YAML decoding reading %s, error: %v", path, err)
continue
}

for k, v := range secretShortcuts {
shortcuts[k] = v
}
}

return shortcuts
}

func (s *serve) formatAlert(alert *prometheus.Alert) {
@@ -353,12 +362,12 @@ func formatSingleAlert(alert *prometheus.Alert) {
return
}

alert.AlertIf = replaceTags(data.expanded, alert, value)
alert.AlertIf = replaceTags(data.Expanded, alert, value)

if alert.AlertAnnotations == nil {
alert.AlertAnnotations = map[string]string{}
}
for k, v := range data.annotations {
for k, v := range data.Annotations {
if _, ok := alert.AlertAnnotations[k]; !ok {
alert.AlertAnnotations[k] = replaceTags(v, alert, value)
}
@@ -367,7 +376,7 @@ func formatSingleAlert(alert *prometheus.Alert) {
if alert.AlertLabels == nil {
alert.AlertLabels = map[string]string{}
}
for k, v := range data.labels {
for k, v := range data.Labels {
if _, ok := alert.AlertLabels[k]; !ok {
alert.AlertLabels[k] = replaceTags(v, alert, value)
}
@@ -403,12 +412,12 @@ func formatCompoundAlert(alert *prometheus.Alert) {
return
}

alertIfFormattedBuffer.WriteString(replaceTags(data.expanded, alert, value))
alertIfFormattedBuffer.WriteString(replaceTags(data.Expanded, alert, value))
if len(bOp) > 0 {
alertIfFormattedBuffer.WriteString(fmt.Sprintf(" %s ", bOp))
}

for k, v := range data.annotations {
for k, v := range data.Annotations {
if _, ok := immutableAnnotations[k]; ok {
continue
}