diff --git a/src/charm.py b/src/charm.py index fba4b5b..315fad2 100755 --- a/src/charm.py +++ b/src/charm.py @@ -77,7 +77,8 @@ def __init__(self, *args): self._grafana_agent = COSAgentProvider( self, metrics_endpoints=[{"path": "/metrics", "port": 9092}], - metrics_rules_dir="./src/alert_rules/prometheus", + metrics_rules_dir="./src/cos/alert_rules/prometheus", + dashboard_dirs="./src/cos/grafana_dashboards", recurse_rules_dirs=True, ) diff --git a/src/alert_rules/prometheus/high_cpu_usage.rule b/src/cos/alert_rules/prometheus/high_cpu_usage.rule similarity index 100% rename from src/alert_rules/prometheus/high_cpu_usage.rule rename to src/cos/alert_rules/prometheus/high_cpu_usage.rule diff --git a/src/alert_rules/prometheus/unreachable_slurmdbd.rule b/src/cos/alert_rules/prometheus/unreachable_slurmdbd.rule similarity index 100% rename from src/alert_rules/prometheus/unreachable_slurmdbd.rule rename to src/cos/alert_rules/prometheus/unreachable_slurmdbd.rule diff --git a/src/cos/grafana_dashboards/Slurm-overview.json b/src/cos/grafana_dashboards/Slurm-overview.json new file mode 100644 index 0000000..e9643e9 --- /dev/null +++ b/src/cos/grafana_dashboards/Slurm-overview.json @@ -0,0 +1,590 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 8, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "panels": [], + "title": "Jobs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CANCELLED" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 1 + }, + "id": 6, + "interval": "5s", + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "max_over_time(slurm_partition_job_state_total{state!~\"RUNNING|PENDING\"}[24h])", + "format": "time_series", + "instant": false, + "legendFormat": "{{state}}", + "range": true, + "refId": "A" + } + ], + "title": "Statistics", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "None", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 5, + "y": 1 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "builder", + "expr": "slurm_partition_job_state_total{state!~\"COMPLETED|CANCELLED\"}", + "legendFormat": "{{state}}", + "range": true, + "refId": "A" + } + ], + "title": "Active jobs", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 4, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 8 + }, + "id": 3, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "code", + "expr": "slurm_cpu_load / slurm_cpus_total", + "legendFormat": "Global CPU load", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "code", + "expr": "slurm_mem_alloc / slurm_mem_real", + "hide": false, + "legendFormat": "Global memory usage", + "range": true, + "refId": "B" + } + ], + "title": "System metrics", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 5, + "y": 8 + }, + "id": 2, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "avg_over_time(slurm_job_scrape_duration[10m])", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg. job scrape duration", + "range": true, + "refId": "Average scrape duration" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "code", + "expr": "avg_over_time(slurm_node_scrape_duration[10m])", + "hide": false, + "legendFormat": "Avg. node scrape duration", + "range": true, + "refId": "A" + } + ], + "title": "Scrape info", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "gridPos": { + "h": 14, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 5, + "options": { + "alertInstanceLabelFilter": "", + "alertName": "Slurm", + "dashboardAlerts": false, + "folder": "", + "groupBy": [], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": true, + "normal": true, + "pending": true + }, + "viewMode": "list" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "builder", + "expr": "slurm_job", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active alerts", + "transparent": true, + "type": "alertlist" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 14 + }, + "id": 1, + "options": { + "displayLabels": [], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "code", + "expr": "slurm_cpus_idle{}", + "legendFormat": "Idle CPUs", + "range": true, + "refId": "Idle CPUs" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "editorMode": "code", + "expr": "slurm_cpus_total - slurm_cpus_idle", + "hide": false, + "legendFormat": "Allocated CPUs", + "range": true, + "refId": "Allocated CPUs" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "hide": false, + "refId": "A" + } + ], + "title": "CPU count", + "type": "piechart" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Slurm", + "uid": "f1de14a1-f45a-4d46-a640-ecf56ec6c687", + "version": 6, + "weekStart": "" + } \ No newline at end of file