From a42b1751e993476ddc92f7f318ee62c1c7897788 Mon Sep 17 00:00:00 2001 From: Vikram Venkataraman Date: Mon, 24 Oct 2022 11:40:44 -0400 Subject: [PATCH] Node exporter dashboards (#43) * Added dashboards for node exporter * Drop use cluster and node * Add nodename to ne metrics * Add account ID and region Co-authored-by: Venkataraman Co-authored-by: Rodrigue Koffi --- .github/workflows/plan-examples.yml | 2 +- .github/workflows/pre-commit.yaml | 2 +- .github/workflows/stale_issue_pr.yaml | 2 +- modules/workloads/infra/README.md | 1 + modules/workloads/infra/dashboards.tf | 6 + .../infra/dashboards/nodeexporter-nodes.json | 1282 +++++++++++++++++ modules/workloads/infra/main.tf | 6 +- .../templates/opentelemetrycollector.yaml | 10 + 8 files changed, 1307 insertions(+), 4 deletions(-) create mode 100644 modules/workloads/infra/dashboards/nodeexporter-nodes.json diff --git a/.github/workflows/plan-examples.yml b/.github/workflows/plan-examples.yml index 480ac997..a1570cc8 100644 --- a/.github/workflows/plan-examples.yml +++ b/.github/workflows/plan-examples.yml @@ -8,7 +8,7 @@ on: workflow_dispatch: concurrency: - group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}' + group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}" cancel-in-progress: true jobs: diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index 63bdf4c5..f39fe6d4 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -137,4 +137,4 @@ jobs: with: terraform-version: ${{ steps.minMax.outputs.maxVersion }} terraform-docs-version: ${{ env.TERRAFORM_DOCS_VERSION }} - tflint-version: ${{ env.TFLINT_VERSION }} \ No newline at end of file + tflint-version: ${{ env.TFLINT_VERSION }} diff --git a/.github/workflows/stale_issue_pr.yaml b/.github/workflows/stale_issue_pr.yaml index 9d3a86e0..035a69fb 100644 --- 
a/.github/workflows/stale_issue_pr.yaml +++ b/.github/workflows/stale_issue_pr.yaml @@ -30,4 +30,4 @@ jobs: with no activity. Remove stale label or comment or this issue will be closed in 10 days stale-pr-message: | This PR has been automatically marked as stale because it has been open 30 days - with no activity. Remove stale label or comment or this PR will be closed in 10 days \ No newline at end of file + with no activity. Remove stale label or comment or this PR will be closed in 10 days diff --git a/modules/workloads/infra/README.md b/modules/workloads/infra/README.md index 3361d861..bfdea8a6 100644 --- a/modules/workloads/infra/README.md +++ b/modules/workloads/infra/README.md @@ -42,6 +42,7 @@ This module is inspired from the open source [kube-prometheus-stack](https://git | [aws_prometheus_rule_group_namespace.recording_rules](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_rule_group_namespace) | resource | | [grafana_dashboard.cluster](https://registry.terraform.io/providers/grafana/grafana/latest/docs/resources/dashboard) | resource | | [grafana_dashboard.kubelet](https://registry.terraform.io/providers/grafana/grafana/latest/docs/resources/dashboard) | resource | +| [grafana_dashboard.nodeexp_nodes](https://registry.terraform.io/providers/grafana/grafana/latest/docs/resources/dashboard) | resource | | [grafana_dashboard.nodes](https://registry.terraform.io/providers/grafana/grafana/latest/docs/resources/dashboard) | resource | | [grafana_dashboard.nsworkload](https://registry.terraform.io/providers/grafana/grafana/latest/docs/resources/dashboard) | resource | | [grafana_dashboard.workloads](https://registry.terraform.io/providers/grafana/grafana/latest/docs/resources/dashboard) | resource | diff --git a/modules/workloads/infra/dashboards.tf b/modules/workloads/infra/dashboards.tf index e45bf6a9..e9a80d70 100644 --- a/modules/workloads/infra/dashboards.tf +++ b/modules/workloads/infra/dashboards.tf @@ -28,3 +28,9 
@@ resource "grafana_dashboard" "cluster" { folder = var.dashboards_folder_id config_json = file("${path.module}/dashboards/cluster.json") } + +resource "grafana_dashboard" "nodeexp_nodes" { + count = var.enable_dashboards ? 1 : 0 + folder = var.dashboards_folder_id + config_json = file("${path.module}/dashboards/nodeexporter-nodes.json") +} diff --git a/modules/workloads/infra/dashboards/nodeexporter-nodes.json b/modules/workloads/infra/dashboards/nodeexporter-nodes.json new file mode 100644 index 00000000..e05b7b25 --- /dev/null +++ b/modules/workloads/infra/dashboards/nodeexporter-nodes.json @@ -0,0 +1,1282 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 265, + "iteration": 1666555993222, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "CPU", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", 
+ "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum (\n (1 - sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\", nodename=~\"$instance\"}[5m])))\n/ ignoring(cpu) group_left\n count without (cpu, mode) (node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", nodename=~\"$instance\"})\n) by (nodename)\n", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Aggregated CPU load averages (average system load over a period of time)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_load1{job=\"node-exporter\", nodename=~\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "1m load average", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_load5{job=\"node-exporter\", nodename=~\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "5m load average", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_load15{job=\"node-exporter\", nodename=~\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "15m load average", + "refId": "C" + } + ], + "title": "Load Average", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 11, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "Memory", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Aggregated values of memory used, buffered, cached and free", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 18, + "x": 0, + "y": 9 + }, + "id": 4, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "(\n sum(node_memory_MemTotal_bytes{job=\"node-exporter\", nodename=~\"$instance\"})\n-\n sum(node_memory_MemFree_bytes{job=\"node-exporter\", nodename=~\"$instance\"})\n-\n sum(node_memory_Buffers_bytes{job=\"node-exporter\", nodename=~\"$instance\"})\n-\n sum(node_memory_Cached_bytes{job=\"node-exporter\", nodename=~\"$instance\"})\n)\n", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_memory_Buffers_bytes{job=\"node-exporter\", nodename=~\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"exemplar": false, + "expr": "sum(node_memory_Cached_bytes{job=\"node-exporter\", nodename=~\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_memory_MemFree_bytes{job=\"node-exporter\", nodename=~\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Overall memory consumption", + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", nodename=~\"$instance\"}) /\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", nodename=~\"$instance\"})\n* 100\n)\n", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Usage", + "type": "gauge" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, 
+ "w": 24, + "x": 0, + "y": 16 + }, + "id": 12, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Disk read bytes per EC2 instance", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/ io time/" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 6, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(node_disk_read_bytes_total{job=\"node-exporter\", nodename=~\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])) by (nodename, device)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}} {{device}}", + "refId": "A" + } + ], + "title": "Disk I/O 
- Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/ io time/" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 14, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(node_disk_written_bytes_total{job=\"node-exporter\", nodename=~\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])) by (nodename, device)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}} {{device}}", + "refId": "B" + } + ], + "title": "Disk I/O - Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", 
+ "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/ io time/" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 15, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", nodename=~\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[5m])) by (nodename, device)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}} {{device}}", + "refId": "C" + } + ], + "title": "Disk I/O time", + "type": "timeseries" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Aggregated disk space available / used", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, 
+ "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.4.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "used", + "color": "#E0B400" + }, + { + "alias": "available", + "color": "#73BF69" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", nodename=~\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", nodename=~\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", nodename=~\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": 0, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 13, + "panels": [], + "targets": [ + { 
+ "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 8, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\", nodename=~\"$instance\"}[5m])) by (nodename)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}}", + "refId": "A" + } + ], + "title": "Network Received", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 9, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": true, + "expr": "sum(rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\", nodename=~\"$instance\"}[5m])) by (nodename)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}}", + "refId": "A" + } + ], + "title": "Network Transmitted", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 35, + "style": "dark", + "tags": [ + "infrastructure" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_uname_info{job=\"node-exporter\", sysname!=\"Darwin\"}, nodename)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"node-exporter\", sysname!=\"Darwin\"}, nodename)", + "refId": "StandardVariableQuery" + }, + "refresh": 
2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Node Exporter / Nodes", + "uid": "v8yDYJqnz", + "version": 41, + "weekStart": "" +} diff --git a/modules/workloads/infra/main.tf b/modules/workloads/infra/main.tf index 8bb38453..c1f1ab01 100644 --- a/modules/workloads/infra/main.tf +++ b/modules/workloads/infra/main.tf @@ -41,7 +41,7 @@ module "helm_addon" { { name = local.name chart = "${path.module}/otel-config" - version = "0.3.0" + version = "0.3.1" namespace = local.namespace description = "ADOT helm Chart deployment configuration" }, @@ -69,6 +69,10 @@ module "helm_addon" { name = "globalScrapeTimeout" value = var.prometheus_config.global_scrape_timeout }, + { + name = "accountId" + value = local.context.aws_caller_identity_account_id + }, ] irsa_config = { diff --git a/modules/workloads/infra/otel-config/templates/opentelemetrycollector.yaml b/modules/workloads/infra/otel-config/templates/opentelemetrycollector.yaml index ee3b9235..10088ad5 100644 --- a/modules/workloads/infra/otel-config/templates/opentelemetrycollector.yaml +++ b/modules/workloads/infra/otel-config/templates/opentelemetrycollector.yaml @@ -15,6 +15,8 @@ spec: scrape_timeout: {{ .Values.globalScrapeTimeout }} external_labels: cluster: {{ 
.Values.ekscluster }} + account_id: {{ .Values.accountId }} + region: {{ .Values.region }} scrape_configs: - job_name: 'kubernetes-kubelet' scrape_interval: {{ .Values.globalScrapeInterval }} @@ -1552,6 +1554,14 @@ spec: - job_name: 'node-exporter' kubernetes_sd_configs: - role: endpoints + ec2_sd_configs: + relabel_configs: + - source_labels: [ __address__ ] + action: keep + regex: '.*:9100$' + - action: replace + source_labels: [__meta_kubernetes_endpoint_node_name] + target_label: nodename exporters: prometheusremotewrite: endpoint: {{ .Values.ampurl }}