diff --git a/roles/velero/defaults/main.yml b/roles/velero/defaults/main.yml index 01256c10..49be8ddf 100644 --- a/roles/velero/defaults/main.yml +++ b/roles/velero/defaults/main.yml @@ -94,6 +94,34 @@ velero_release_namespace: velero velero_release_name: velero velero_wait_timeout: 10m velero_release_defaults: + metrics: + enabled: true + serviceMonitor: + enabled: true + prometheusRule: + enabled: true + spec: + - alert: VeleroBackupPartialFailures + annotations: + # Use the unsafe tag to prevent Ansible trying to render it as a template + message: !unsafe >- + Velero schedule '{{ $labels.schedule }}' has partially failed backups in the last 24 hours. + expr: |- + sum(increase(velero_backup_partial_failure_total{schedule!=""}[24h])) by(schedule) > 0 + for: 15m + labels: + severity: warning + + - alert: VeleroBackupFailures + annotations: + # Use the unsafe tag to prevent Ansible trying to render it as a template + message: !unsafe >- + Velero schedule '{{ $labels.schedule }}' has failed backups in the last 24 hours. 
+ expr: |- + sum(increase(velero_backup_failure_total{schedule!=""}[24h])) by(schedule) > 0 + for: 15m + labels: + severity: warning configuration: features: EnableCSI backupStorageLocation: diff --git a/roles/velero/files/grafana_dashboard.json b/roles/velero/files/grafana_dashboard.json new file mode 100644 index 00000000..63871127 --- /dev/null +++ b/roles/velero/files/grafana_dashboard.json @@ -0,0 +1,1153 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 57, + "links": [], + "panels": [ + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number of backups currently held.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 10, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.2+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(velero_backup_total)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Backups Held", + "type": "stat" + }, + { + "datasource": { + "default": false, + "type": 
"prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #D" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "Failure" + }, + "1": { + "color": "green", + "index": 0, + "text": "Success" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "applyToRow": false, + "mode": "basic", + "type": "color-background" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #C" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "applyToRow": false, + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #E" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeFromNow" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #F" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #B" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 14, + "x": 10, + "y": 0 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.2.2+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + 
}, + "editorMode": "code", + "exemplar": false, + "expr": "velero_backup_last_status{schedule!=\"\"}", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "velero_backup_last_successful_timestamp{schedule!=\"\"} * 1000", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "title": "Schedules", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "schedule", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "container 1": true, + "container 2": true, + "container 3": true, + "container 4": true, + "container 5": true, + "container 6": true, + "endpoint 1": true, + "endpoint 2": true, + "endpoint 3": true, + "endpoint 4": true, + "endpoint 5": true, + "endpoint 6": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true, + "namespace 5": true, + "namespace 6": true, + "pod 1": true, + "pod 2": true, + "pod 3": true, + "pod 4": true, + "pod 5": true, + "pod 6": true, + "service 1": true, + "service 2": true, + "service 3": true, + "service 4": true, + "service 5": true, + "service 6": true + }, + "includeByName": {}, + "indexByName": { + "Time 1": 6, + "Time 2": 14, + "Time 3": 22, + "Time 4": 30, + "Time 5": 41, + "Value #A": 4, + "Value #B": 5, + "Value #D": 1, + "Value 
#E": 2, + "Value #F": 3, + "__name__ 1": 38, + "__name__ 2": 39, + "__name__ 3": 40, + "container 1": 7, + "container 2": 15, + "container 3": 23, + "container 4": 31, + "container 5": 42, + "endpoint 1": 8, + "endpoint 2": 16, + "endpoint 3": 24, + "endpoint 4": 32, + "endpoint 5": 43, + "instance 1": 9, + "instance 2": 17, + "instance 3": 25, + "instance 4": 33, + "instance 5": 44, + "job 1": 10, + "job 2": 18, + "job 3": 26, + "job 4": 34, + "job 5": 45, + "namespace 1": 11, + "namespace 2": 19, + "namespace 3": 27, + "namespace 4": 35, + "namespace 5": 46, + "pod 1": 12, + "pod 2": 20, + "pod 3": 28, + "pod 4": 36, + "pod 5": 47, + "schedule": 0, + "service 1": 13, + "service 2": 21, + "service 3": 29, + "service 4": 37, + "service 5": 48 + }, + "renameByName": { + "Value #A": "Last Backup Items", + "Value #B": "Last Backup Duration", + "Value #C": "# Failed", + "Value #D": "Last Status", + "Value #E": "Last Succeeded", + "Value #F": "Last Backup Size", + "schedule": "Name" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The percentage of backups that have succeeded in the last 7 days.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.6 + }, + { + "color": "green", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 0, + "y": 4 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.2.2+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(velero_backup_success_total[$__range])) / sum(increase(velero_backup_attempt_total[$__range]))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "timeFrom": "7d", + "title": "Backup Success Rate", + "type": "gauge" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The percentage of CSI snapshots that succeeded in the last 7 days.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.6 + }, + { + "color": "green", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 5, + "y": 4 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.2.2+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "increase(sum(velero_csi_snapshot_success_total)[$__range:]) / increase(sum(velero_csi_snapshot_attempt_total)[$__range:])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "timeFrom": "7d", + "title": "CSI Snapshot Success Rate", + "type": "gauge" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number of backups by schedule.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "round(sum(increase(velero_backup_success_total[$__rate_interval])) by(schedule))", + "instant": false, + "legendFormat": "{{schedule}} - success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "round(sum(increase(velero_backup_failure_total[$__rate_interval])) by(schedule))", + "hide": false, + "instant": false, + "legendFormat": "{{schedule}} - failure", + "range": true, + "refId": "B" + } + ], + "title": "Backup Count", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": " - (.+)$", + "renamePattern": "none - $1" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number of CSI snapshots by schedule.", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "round(increase(sum(velero_csi_snapshot_success_total) by(schedule)[$__rate_interval:]))", + "instant": false, + "legendFormat": "{{schedule}} - success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "round(increase(sum(velero_csi_snapshot_failure_total) by(schedule)[$__rate_interval:]))", + "hide": false, + "instant": false, + "legendFormat": "{{schedule}} - failure", + "range": true, + "refId": "B" + } + ], + "title": "CSI Snapshot Count", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": " - (.+)$", + "renamePattern": "none - $1" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + 
"uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 17 + }, + "id": 10, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xField": "Time", + "xTickLabelRotation": 0, + "xTickLabelSpacing": 100 + }, + "pluginVersion": "11.2.2+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(sum(velero_backup_duration_seconds_sum) by(schedule)[$__rate_interval:]) / rate(sum(velero_backup_duration_seconds_count) by(schedule)[$__rate_interval:])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Backup Duration", + "type": "barchart" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The size of the most recent backup for each schedule.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 5, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 17 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(velero_backup_tarball_size_bytes) by(schedule)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Backup Size", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "The number of items in the last backup for each schedule.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, 
+ "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 17 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(velero_backup_items_total) by(schedule) > 0", + "instant": false, + "legendFormat": "{{schedule}} - total", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(velero_backup_items_errors) by(schedule) > 0", + "hide": false, + "instant": false, + "legendFormat": "{{schedule}} - errors", + "range": true, + "refId": "B" + } + ], + "title": "Backup Items", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": " - (.+)", + "renamePattern": "none - $1" + } + } + ], + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Velero Overview", + "uid": "ee1x4slai5nggc", + "version": 13, + "weekStart": "" + } diff --git 
a/roles/velero/tasks/install.yml b/roles/velero/tasks/install.yml index d29c0135..9ede412d 100644 --- a/roles/velero/tasks/install.yml +++ b/roles/velero/tasks/install.yml @@ -67,6 +67,23 @@ wait: yes wait_timeout: "{{ velero_wait_timeout }}" +- name: Install Grafana dashboard for Velero metrics + command: kubectl apply --server-side --force-conflicts -f - + args: + stdin: "{{ velero_dashboard_definition | to_nice_yaml }}" + vars: + velero_dashboard_definition: + apiVersion: v1 + kind: ConfigMap + metadata: + name: velero-grafana-dashboard + namespace: "{{ velero_release_namespace }}" + labels: + grafana_dashboard: "1" + data: + velero_dashboard.json: |- + {{ lookup('file', 'grafana_dashboard.json' ) | from_json | to_nice_json }} + - block: - name: Ensure Velero CLI unpack directory exists file: