From f327dbc5b4c2162a51fcf30e49ebc7df1995e7b2 Mon Sep 17 00:00:00 2001 From: Weifeng Wang Date: Tue, 9 Jul 2024 11:58:45 +0800 Subject: [PATCH] bump to grafana/mimir:2.13.0 Signed-off-by: Weifeng Wang update mimir-mixin Signed-off-by: Weifeng Wang Update .env --- .../common/compose-include/mimir.yaml | 4 +- docker-compose/common/config/.env | 4 +- .../common/config/mimir/gateway_mimir.conf | 15 + .../microservices-mode/metrics/compose.yaml | 4 +- .../monolithic-mode/all-in-one/compose.yaml | 4 +- .../read-write-mode/metrics/compose.yaml | 4 +- .../common/gateway/nginx/gateway_mimir.conf | 15 + .../traces/configs/gateway_mimir.conf | 15 + .../traces/configs/gateway_mimir.conf | 15 + monitoring-mixins/k8s-all-in-one.yaml | 3631 +++++++++++----- .../mimir-mixin/deploy.libsonnet | 1 + .../mimir-alertmanager-resources.json | 2 +- .../mimir-compactor-resources.json | 2 +- .../mimir-overview-networking.json | 9 +- .../mimir-overview-resources.json | 4 +- .../deploy/dashboards_out/mimir-overview.json | 439 +- .../deploy/dashboards_out/mimir-queries.json | 12 +- .../mimir-reads-networking.json | 18 +- .../dashboards_out/mimir-reads-resources.json | 2 +- .../deploy/dashboards_out/mimir-reads.json | 10 +- .../mimir-remote-ruler-reads-networking.json | 1052 +++++ .../mimir-remote-ruler-reads-resources.json | 6 +- .../mimir-remote-ruler-reads.json | 8 +- .../mimir-rollout-progress.json | 2 +- .../deploy/dashboards_out/mimir-ruler.json | 2 +- .../deploy/dashboards_out/mimir-scaling.json | 249 +- .../dashboards_out/mimir-slow-queries.json | 76 +- .../deploy/dashboards_out/mimir-tenants.json | 170 +- .../dashboards_out/mimir-top-tenants.json | 854 ++-- .../mimir-writes-networking.json | 9 +- .../mimir-writes-resources.json | 4 +- .../deploy/dashboards_out/mimir-writes.json | 20 +- .../mimir-mixin/deploy/kustomization.yaml | 3 + .../deploy/manifests/k8s-all-in-one.yaml | 3809 ++++++++++++----- .../deploy/mimir-mixin-alerts.yaml | 159 +- .../mimir-mixin/deploy/prometheus-alerts.yaml 
| 159 +- .../mimir-mixin/jsonnetfile.json | 2 +- .../mimir-mixin/jsonnetfile.lock.json | 12 +- .../grafana-builder/grafana.libsonnet | 135 +- .../jsonnet-libs/mixin-utils/utils.libsonnet | 198 +- .../operations/mimir-mixin/alerts.libsonnet | 1 + .../mimir-mixin/alerts/alertmanager.libsonnet | 24 +- .../mimir-mixin/alerts/alerts-utils.libsonnet | 2 + .../mimir-mixin/alerts/alerts.libsonnet | 141 +- .../mimir-mixin/alerts/autoscaling.libsonnet | 3 +- .../mimir-mixin/alerts/blocks.libsonnet | 55 +- .../mimir-mixin/alerts/compactor.libsonnet | 8 +- .../alerts/continuous-test.libsonnet | 12 +- .../operations/mimir-mixin/config.libsonnet | 39 +- .../mimir-mixin/dashboards.libsonnet | 1 + .../dashboards/dashboard-queries.libsonnet | 176 +- .../dashboards/dashboard-utils.libsonnet | 748 +++- .../mimir-mixin/dashboards/overview.libsonnet | 113 +- .../mimir-mixin/dashboards/queries.libsonnet | 187 +- .../mimir-mixin/dashboards/reads.libsonnet | 185 +- .../remote-ruler-reads-resources.libsonnet | 6 +- .../dashboards/remote-ruler-reads.libsonnet | 141 +- .../dashboards/rollout-progress.libsonnet | 4 +- .../mimir-mixin/dashboards/ruler.libsonnet | 4 +- .../mimir-mixin/dashboards/scaling.libsonnet | 2 +- .../dashboards/slow-queries.libsonnet | 76 +- .../mimir-mixin/dashboards/tenants.libsonnet | 146 +- .../dashboards/top-tenants.libsonnet | 124 +- .../dashboards/writes-resources.libsonnet | 4 +- .../mimir-mixin/dashboards/writes.libsonnet | 306 +- .../mimir-mixin/jsonnetfile.lock.json | 8 +- .../mimir-mixin/recording_rules.libsonnet | 4 +- 67 files changed, 9880 insertions(+), 3779 deletions(-) create mode 100644 monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-networking.json diff --git a/docker-compose/common/compose-include/mimir.yaml b/docker-compose/common/compose-include/mimir.yaml index 16d297f9..833031d8 100644 --- a/docker-compose/common/compose-include/mimir.yaml +++ b/docker-compose/common/compose-include/mimir.yaml @@ -4,7 +4,7 @@ 
services: labels: metrics.grafana.com/scrape: false depends_on: { minio: { condition: service_healthy } } - image: ${MIMIR_IMAGE:-docker.io/grafana/mimir:2.12.0} + image: ${MIMIR_IMAGE:-docker.io/grafana/mimir-alpine:2.13.0} configs: - source: mimir_config_file target: /etc/mimir/config.yaml @@ -30,7 +30,7 @@ services: mimirtool: labels: metrics.grafana.com/scrape: false - image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.12.0} + image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.13.0} volumes: - ../../../monitoring-mixins/crontab:/etc/crontabs/root - ../../../monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml:/rules/alloy-mixin-alerts.yaml diff --git a/docker-compose/common/config/.env b/docker-compose/common/config/.env index 8fbc4d8c..f17f6a3a 100644 --- a/docker-compose/common/config/.env +++ b/docker-compose/common/config/.env @@ -2,8 +2,8 @@ ALLOY_IMAGE=grafana/alloy:v1.2.0 LOKI_IMAGE=grafana/loki:3.1.0 GRAFANA_IMAGE=grafana/grafana:11.1.0 TEMPO_IMAGE=grafana/tempo:2.5.0 -MIMIR_IMAGE=grafana/mimir:2.12.0 -MIMIRTOOL_IMAGE=grafana/mimirtool:2.12.0 +MIMIR_IMAGE=grafana/mimir-alpine:2.13.0 +MIMIRTOOL_IMAGE=grafana/mimirtool:2.13.0 PYROSCOPE_IMAGE=grafana/pyroscope:1.6.1 NGINX_IMAGE=nginxinc/nginx-unprivileged:1.25-alpine MINIO_IMAGE=minio/minio:RELEASE.2024-06-22T05-26-45Z diff --git a/docker-compose/common/config/mimir/gateway_mimir.conf b/docker-compose/common/config/mimir/gateway_mimir.conf index 9e616b02..b8d13f77 100644 --- a/docker-compose/common/config/mimir/gateway_mimir.conf +++ b/docker-compose/common/config/mimir/gateway_mimir.conf @@ -26,6 +26,21 @@ server { location = /multitenant_alertmanager/status { proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; } + location = /api/v1/alerts { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/config { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/state { + proxy_pass 
http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/receivers { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/templates/test { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } # https://github.com/grafana/mimir/releases/tag/mimir-2.12.0 # Alertmanager deprecated the v1 API. All endpoints have a v2 equivalent. location = /api/v2/alerts { diff --git a/docker-compose/microservices-mode/metrics/compose.yaml b/docker-compose/microservices-mode/metrics/compose.yaml index 288e9ee9..db1f3e83 100644 --- a/docker-compose/microservices-mode/metrics/compose.yaml +++ b/docker-compose/microservices-mode/metrics/compose.yaml @@ -50,7 +50,7 @@ services: distributor: depends_on: { ingester: { condition: service_healthy } } - image: &mimirImage ${MIMIR_IMAGE:-docker.io/grafana/mimir:2.12.0} + image: &mimirImage ${MIMIR_IMAGE:-docker.io/grafana/mimir-alpine:2.13.0} configs: - source: mimir_config_file target: /etc/mimir/config.yaml @@ -227,7 +227,7 @@ services: mimirtool: labels: metrics.grafana.com/scrape: false - image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.12.0} + image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.13.0} volumes: - ../../../monitoring-mixins/crontab:/etc/crontabs/root - ../../../monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml:/rules/alloy-mixin-alerts.yaml diff --git a/docker-compose/monolithic-mode/all-in-one/compose.yaml b/docker-compose/monolithic-mode/all-in-one/compose.yaml index ea51475d..97390fdf 100644 --- a/docker-compose/monolithic-mode/all-in-one/compose.yaml +++ b/docker-compose/monolithic-mode/all-in-one/compose.yaml @@ -125,7 +125,7 @@ services: metrics.grafana.com/scrape: false profiles.grafana.com/service_name: mimir depends_on: { minio: { condition: service_healthy } } - image: ${MIMIR_IMAGE:-docker.io/grafana/mimir:2.12.0} + image: ${MIMIR_IMAGE:-docker.io/grafana/mimir-alpine:2.13.0} configs: - source: 
mimir_config_file target: /etc/mimir/config.yaml @@ -153,7 +153,7 @@ services: mimirtool: labels: metrics.grafana.com/scrape: false - image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.12.0} + image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.13.0} volumes: - ../../../monitoring-mixins/crontab:/etc/crontabs/root - ../../../monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml:/rules/alloy-mixin-alerts.yaml diff --git a/docker-compose/read-write-mode/metrics/compose.yaml b/docker-compose/read-write-mode/metrics/compose.yaml index f3331920..cc1feb92 100644 --- a/docker-compose/read-write-mode/metrics/compose.yaml +++ b/docker-compose/read-write-mode/metrics/compose.yaml @@ -50,7 +50,7 @@ services: mimir-backend: depends_on: { minio: { condition: service_healthy } } - image: &mimirImage ${MIMIR_IMAGE:-docker.io/grafana/mimir:2.12.0} + image: &mimirImage ${MIMIR_IMAGE:-docker.io/grafana/mimir-alpine:2.13.0} configs: - source: mimir_config_file target: /etc/mimir/config.yaml @@ -107,7 +107,7 @@ services: mimirtool: labels: metrics.grafana.com/scrape: false - image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.12.0} + image: ${MIMIRTOOL_IMAGE:-docker.io/grafana/mimirtool:2.13.0} volumes: - ../../../monitoring-mixins/crontab:/etc/crontabs/root - ../../../monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml:/rules/alloy-mixin-alerts.yaml diff --git a/kubernetes/common/gateway/nginx/gateway_mimir.conf b/kubernetes/common/gateway/nginx/gateway_mimir.conf index 9e616b02..b8d13f77 100644 --- a/kubernetes/common/gateway/nginx/gateway_mimir.conf +++ b/kubernetes/common/gateway/nginx/gateway_mimir.conf @@ -26,6 +26,21 @@ server { location = /multitenant_alertmanager/status { proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; } + location = /api/v1/alerts { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/config { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + 
location = /api/v1/grafana/state { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/receivers { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/templates/test { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } # https://github.com/grafana/mimir/releases/tag/mimir-2.12.0 # Alertmanager deprecated the v1 API. All endpoints have a v2 equivalent. location = /api/v2/alerts { diff --git a/kubernetes/microservices-mode/traces/configs/gateway_mimir.conf b/kubernetes/microservices-mode/traces/configs/gateway_mimir.conf index 9e616b02..b8d13f77 100644 --- a/kubernetes/microservices-mode/traces/configs/gateway_mimir.conf +++ b/kubernetes/microservices-mode/traces/configs/gateway_mimir.conf @@ -26,6 +26,21 @@ server { location = /multitenant_alertmanager/status { proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; } + location = /api/v1/alerts { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/config { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/state { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/receivers { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/templates/test { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } # https://github.com/grafana/mimir/releases/tag/mimir-2.12.0 # Alertmanager deprecated the v1 API. All endpoints have a v2 equivalent. 
location = /api/v2/alerts { diff --git a/kubernetes/monolithic-mode/traces/configs/gateway_mimir.conf b/kubernetes/monolithic-mode/traces/configs/gateway_mimir.conf index 9e616b02..b8d13f77 100644 --- a/kubernetes/monolithic-mode/traces/configs/gateway_mimir.conf +++ b/kubernetes/monolithic-mode/traces/configs/gateway_mimir.conf @@ -26,6 +26,21 @@ server { location = /multitenant_alertmanager/status { proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; } + location = /api/v1/alerts { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/config { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/state { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/receivers { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } + location = /api/v1/grafana/templates/test { + proxy_pass http://${MIMIR_ALERT_MANAGER_HOST}:8080$request_uri; + } # https://github.com/grafana/mimir/releases/tag/mimir-2.12.0 # Alertmanager deprecated the v1 API. All endpoints have a v2 equivalent. 
location = /api/v2/alerts { diff --git a/monitoring-mixins/k8s-all-in-one.yaml b/monitoring-mixins/k8s-all-in-one.yaml index d48e9b2e..af787502 100644 --- a/monitoring-mixins/k8s-all-in-one.yaml +++ b/monitoring-mixins/k8s-all-in-one.yaml @@ -14197,7 +14197,7 @@ data: "span": 12, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|alertmanager)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|alertmanager)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|alertmanager)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -17473,7 +17473,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|compactor|mimir-backend)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|compactor|mimir-backend)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|compactor|mimir-backend)).*\"}\n)\n", "format": "time_series", "legendFormat": 
"{{persistentvolumeclaim}}", "legendLink": null @@ -21354,6 +21354,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -21412,7 +21413,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -21582,6 +21583,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -21640,7 +21642,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -21810,6 +21812,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -21868,7 +21871,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -22325,7 +22328,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|mimir-write|distributor|ingester)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-write|distributor|ingester)).*\"} /\n 
kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-write|distributor|ingester)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -22796,7 +22799,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -23012,7 +23015,7 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n 
vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -23022,7 +23025,27 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # 
Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -23279,7 +23302,13 @@ data: "span": 3, "targets": [ { - "expr": 
"sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -23328,22 +23357,40 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": 
"histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Write latency", @@ -23601,7 +23648,13 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -23650,22 +23703,40 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) 
(cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Read latency", @@ -23694,7 +23765,217 @@ data: }, "unit": "reqps" }, - "overrides": [ ] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "instant queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#429D48", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_range($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "range queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#F1C731", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_labels($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label names\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#2A66CF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_label_name_values($|[^_])/" + }, + "properties": [ + { + "id": 
"displayName", + "value": "\"label values\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#9E44C1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "series queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#FFAB57", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_read($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "remote read queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C79424", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_metadata($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "metadata queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#84D586", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_exemplars($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplar queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#A1C4FC", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_active_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"active series\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C788DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_names($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label name cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#3F6833", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label value cardinality\" queries" + 
}, + { + "id": "color", + "value": { + "fixedColor": "#447EBC", + "mode": "fixed" + } + } + ] + } + ] }, "id": 11, "links": [ ], @@ -23707,114 +23988,26 @@ data: "sort": "none" } }, - "seriesOverrides": [ - { - "alias": "instant queries", - "color": "#429D48" - }, - { - "alias": "range queries", - "color": "#F1C731" - }, - { - "alias": "\"label names\" queries", - "color": "#2A66CF" - }, - { - "alias": "\"label values\" queries", - "color": "#9E44C1" - }, - { - "alias": "series queries", - "color": "#FFAB57" - }, - { - "alias": "remote read queries", - "color": "#C79424" - }, - { - "alias": "metadata queries", - "color": "#84D586" - }, - { - "alias": "exemplar queries", - "color": "#A1C4FC" - }, - { - "alias": "\"active series\" queries", - "color": "#C788DE" - } - ], "span": 3, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))", + "expr": "sum by (route) (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval])) < ($latency_metrics * +Inf)", "format": "time_series", - "legendFormat": "instant queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "range queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", + "expr": "sum by (route) (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", - "legendFormat": "\"label names\" queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval])) < ($latency_metrics * +Inf)", "format": "time_series", - "legendFormat": "\"label values\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "series queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_read\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "remote read queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_metadata\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "metadata queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_exemplars\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "exemplar queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_active_series\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"active series\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_names\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label name cardinality\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_values\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label value cardinality\" queries", + "legendFormat": "other", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)\"}[$__rate_interval]))", + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "other", "legendLink": null @@ -24381,6 +24574,35 @@ data: "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, @@ -25074,7 +25296,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend - query splitting and results cache", + "title": "Query-frontend – query splitting and results cache", "titleSize": "h6" }, { @@ -25216,7 +25438,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend - query sharding", + "title": "Query-frontend – query sharding", "titleSize": "h6" }, { @@ -26128,8 +26350,8 @@ data: "showLegend": 
true }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, "span": 4, @@ -26176,8 +26398,8 @@ data: "showLegend": true }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, "span": 4, @@ -27168,6 +27390,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -27226,7 +27449,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -27396,6 +27619,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -27454,7 +27678,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -27624,6 +27848,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -27682,7 +27907,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -27852,6 +28077,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -27910,7 +28136,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -28080,6 +28306,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per 
pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -28138,7 +28365,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -28308,6 +28535,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -28366,7 +28594,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -30821,7 +31049,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|store-gateway|mimir-backend)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|store-gateway|mimir-backend)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|store-gateway|mimir-backend)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -32808,7 +33036,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -32857,19 +33085,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", 
route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -32919,7 +33147,7 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null @@ -34811,11 +35039,1198 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "reqps" + "unit": "reqps" + }, + "overrides": [ ] + }, + "id": 49, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{operation}}", + "legendLink": null + } + ], + "title": "Operations / sec", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "max": 1, + "min": 0, + "noValue": "0", + "unit": "percentunit" + } + }, + "id": 50, + "links": [ ], + "options": { + "legend": { + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) >= 0", + "format": "time_series", + "legendFormat": "{{operation}}", + "legendLink": null + } + ], + "title": "Error rate", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 51, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Attributes", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 52, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Exists", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Blocks object store (querier accesses)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 53, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Get", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 54, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: GetRange", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 55, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Upload", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 56, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Delete", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "mimir" + ], + "templating": { + "list": [ + { + "current": { + "text": "Metrics", + "value": "Metrics" + }, + "hide": 0, + "label": "Data source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(cortex_build_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, 
+ "includeAll": true, + "label": "namespace", + "multi": true, + "name": "namespace", + "options": [ ], + "query": "label_values(cortex_build_info{cluster=~\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Mimir / Reads", + "uid": "e327503188913dc38ad571c647eef643", + "version": 0 + } +kind: ConfigMap +metadata: + annotations: + grafana_dashboard_folder: /dashboards/Mimir + labels: + grafana_dashboard: "1" + name: mimir-reads.json + namespace: monitoring-system +--- +apiVersion: v1 +data: + mimir-remote-ruler-reads-networking.json: |- + { + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "8.0.0" + } + ], + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "mimir" + ], + "targetBlank": false, + "title": "Mimir dashboards", + "type": "dashboards" + } + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 1, + "links": [ ], + "options": { + "legend": { + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 2, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 3, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": 
"single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 4, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}))", + "format": "time_series", + "legendFormat": "highest", + 
"legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Summary", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 5, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 6, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + 
"sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 7, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": 
"absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 8, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-query-frontend", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" }, "overrides": [ ] }, - "id": 49, + "id": 9, "links": [ ], "options": { "legend": { @@ -34829,26 +36244,41 @@ data: "span": 3, "targets": [ { - "expr": "sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval]))", + "expr": "sum by(pod) 
(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "{{operation}}", + "legendFormat": "{{pod}}", "legendLink": null } ], - "title": "Operations / sec", + "title": "Receive bandwidth", "type": "timeseries" }, { "datasource": "$datasource", "fieldConfig": { "defaults": { - "max": 1, + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, "min": 0, - "noValue": "0", - "unit": "percentunit" - } + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] }, - "id": 50, + "id": 10, "links": [ ], "options": { "legend": { @@ -34862,18 +36292,21 @@ data: "span": 3, "targets": [ { - "expr": "sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) >= 0", + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "{{operation}}", + "legendFormat": "{{pod}}", "legendLink": null } ], - "title": "Error rate", + "title": "Transmit bandwidth", "type": "timeseries" }, { "datasource": "$datasource", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -34892,13 +36325,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 51, + "id": 11, "links": [ ], - "nullPointMode": "null as zero", "options": { 
"legend": { "showLegend": true @@ -34911,48 +36343,28 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval]))", + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "highest", + "legendLink": null } ], - "title": "Latency of op: Attributes", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Inflight requests (per pod)", + "type": "timeseries" }, { "datasource": "$datasource", + "description": "### Ingress TCP 
connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -34971,13 +36383,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 52, + "id": 12, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -34990,51 +36401,33 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}))", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}))", "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "highest", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval]))", + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", 
namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "limit", + "legendLink": null } ], - "title": "Latency of op: Exists", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Blocks object store (querier accesses)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -35047,14 +36440,14 @@ data: "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 1, - "lineWidth": 1, + "fillOpacity": 100, + "lineWidth": 0, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" } }, "min": 0, @@ -35062,13 +36455,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "Bps" }, "overrides": [ ] }, - "id": 53, + "id": 13, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -35081,44 +36473,14 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "{{pod}}", + "legendLink": null } ], - "title": "Latency of op: Get", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Receive bandwidth", + "type": "timeseries" }, { "datasource": "$datasource", @@ -35126,14 +36488,14 @@ data: "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 1, - "lineWidth": 1, + "fillOpacity": 100, + "lineWidth": 0, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" } }, "min": 0, @@ -35141,13 +36503,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "Bps" }, "overrides": [ ] }, - "id": 54, + "id": 14, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -35160,48 +36521,21 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": 
"histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "{{pod}}", + "legendLink": null } ], - "title": "Latency of op: GetRange", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Transmit bandwidth", + "type": "timeseries" }, { "datasource": "$datasource", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -35220,13 +36554,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 55, + "id": 15, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -35239,48 +36572,28 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval]))", + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "highest", + "legendLink": null } ], - "title": "Latency of op: Upload", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Inflight requests (per pod)", + "type": "timeseries" }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -35299,13 +36612,12 @@ 
data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 56, + "id": 16, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -35318,51 +36630,33 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}))", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}))", "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "highest", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval]))", + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "limit", + "legendLink": null } ], - "title": "Latency of 
op: Delete", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "", + "title": "Ruler-querier", "titleSize": "h6" } ], @@ -35388,17 +36682,16 @@ data: "type": "datasource" }, { - "allValue": ".+", + "allValue": ".*", "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": "prod", + "value": "prod" }, "datasource": "$datasource", "hide": 0, "includeAll": true, "label": "cluster", - "multi": true, + "multi": false, "name": "cluster", "options": [ ], "query": "label_values(cortex_build_info, cluster)", @@ -35412,17 +36705,16 @@ data: "useTags": false }, { - "allValue": ".+", + "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": "prod", + "value": "prod" }, "datasource": "$datasource", "hide": 0, - "includeAll": true, + "includeAll": false, "label": "namespace", - "multi": true, + "multi": false, "name": "namespace", "options": [ ], "query": "label_values(cortex_build_info{cluster=~\"$cluster\"}, namespace)", @@ -35467,8 +36759,8 @@ data: ] }, "timezone": "utc", - "title": "Mimir / Reads", - "uid": "e327503188913dc38ad571c647eef643", + "title": "Mimir / Remote ruler reads networking", + "uid": "9e8cfff65f91632f8a25981c6fe44bc9", "version": 0 } kind: ConfigMap @@ -35477,7 +36769,7 @@ metadata: grafana_dashboard_folder: /dashboards/Mimir labels: grafana_dashboard: "1" - name: mimir-reads.json + name: mimir-remote-ruler-reads-networking.json namespace: monitoring-system --- apiVersion: v1 @@ -35794,7 +37086,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend (dedicated to ruler)", + "title": 
"Ruler-query-frontend", "titleSize": "h6" }, { @@ -36076,7 +37368,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler (dedicated to ruler)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -36358,7 +37650,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier (dedicated to ruler)", + "title": "Ruler-querier", "titleSize": "h6" } ], @@ -36924,7 +38216,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend (dedicated to ruler)", + "title": "Ruler-query-frontend", "titleSize": "h6" }, { @@ -37252,7 +38544,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler (dedicated to ruler)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -37414,7 +38706,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions", + "title": "Ruler-query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions", "titleSize": "h6" }, { @@ -37721,7 +39013,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier (dedicated to ruler)", + "title": "Ruler-querier", "titleSize": "h6" } ], @@ -37966,7 +39258,7 @@ data: "stacking": "none", "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" }, "xField": "Workload", "xTickLabelRotation": 0, @@ -40908,7 +42200,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by(user) (rate(cortex_prometheus_notifications_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}[$__rate_interval]))\n /\nsum by(user) (rate(cortex_prometheus_notifications_queue_capacity{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}[$__rate_interval])) > 0\n", + "expr": "sum by(user) 
(cortex_prometheus_notifications_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"})\n /\nsum by(user) (cortex_prometheus_notifications_queue_capacity{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}) > 0\n", "format": "time_series", "legendFormat": "{{ user }}", "legendLink": null @@ -41951,6 +43243,146 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Required Replicas" + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "deployment" + }, + "properties": [ + { + "id": "displayName", + "value": "Service" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "namespace" + }, + "properties": [ + { + "id": "displayName", + "value": "Namespace" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "reason" + }, + "properties": [ + { + "id": "displayName", + "value": "Reason" + }, + { + "id": 
"decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 2, "legend": { @@ -41979,115 +43411,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Required Replicas", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Cluster", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "__name__", - "thresholds": [ ], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Cluster", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "cluster", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Service", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "deployment", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "namespace", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Reason", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - 
"linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "reason", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "sort_desc(\n cluster_namespace_deployment_reason:required_replicas:count{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n > ignoring(reason) group_left\n cluster_namespace_deployment:actual_replicas:count{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\n", @@ -42335,13 +43658,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap 
duration_seconds(response_time) [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -42389,13 +43712,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -42443,13 +43766,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != 
\"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -42497,13 +43820,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.5, 
{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -42551,13 +43874,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -42606,13 +43929,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | 
user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -42626,7 +43949,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Accross tenants", + "title": "Across tenants", "titleSize": "h6" }, { @@ -42672,7 +43995,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap 
duration_seconds(response_time) [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -42720,7 +44043,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -42768,7 +44091,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -42816,7 +44139,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))", 
+ "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -42864,7 +44187,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -42913,7 +44236,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -42973,7 +44296,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, 
quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -43021,7 +44344,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -43069,7 +44392,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" 
!= \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -43117,7 +44440,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -43165,7 +44488,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -43214,7 +44537,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query 
stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -43402,7 +44725,7 @@ data: "span": 12, "targets": [ { - "expr": "{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | label_format response_time_seconds=\"{{ if .response_time }} {{ duration .response_time }} {{ end }}\",param_step_seconds=\"{{ if .param_step }} {{ div .param_step 1000 }} {{ end }}\",length_seconds=\"{{ if .length }} {{ duration .length }} {{ end }}\"", + "expr": "{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | label_format response_time_seconds=\"{{ if .response_time }} {{ duration .response_time }} {{ end }}\",param_step_seconds=\"{{ if .param_step }} {{ div .param_step 1000 }} {{ end }}\",length_seconds=\"{{ if .length }} {{ duration .length }} {{ end }}\"", "instant": false, "legendFormat": "", "range": true, @@ -43686,6 +45009,30 @@ data: ], "query": ".*", "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "query-frontend", + "value": "query-frontend" + }, + "label": "Component", + "multi": false, + "name": "component", + "options": [ + { + "selected": true, + "text": "query-frontend", + "value": "query-frontend" + }, 
+ { + "selected": false, + "text": "ruler-query-frontend", + "value": "ruler-query-frontend" + } + ], + "query": "query-frontend, ruler-query-frontend", + "type": "custom" } ] }, @@ -43853,7 +45200,7 @@ data: "span": 4, "targets": [ { - "expr": "sum(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "in-memory", "legendLink": null @@ -43865,19 +45212,19 @@ data: "legendLink": null }, { - "expr": "sum(\n 
cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active", "legendLink": null }, { - "expr": "sum(\n cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, 
namespace, ) (\n label_replace(\n cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "owned", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active ({{ name }})", "legendLink": null @@ -44092,7 +45439,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "series", "legendLink": null @@ -44209,13 +45556,13 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active ({{ name }})", "legendLink": null @@ -44283,13 +45630,13 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum 
by (cluster, namespace, ) (cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "buckets", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", 
\".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "buckets ({{ name }})", "legendLink": null @@ -44975,7 +46322,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval])\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval]))\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval]),\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "rate", "legendLink": null @@ -45343,6 +46690,46 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "rules" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", 
+ "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 27, "legend": { @@ -45371,40 +46758,6 @@ data: "span": 6, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "rules", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\", user=\"$user\"}))", @@ -45457,6 +46810,46 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "seconds" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 28, "legend": { @@ -45485,40 +46878,6 @@ data: "span": 6, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "seconds", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value 
#A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group) (cortex_prometheus_rule_group_last_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\", user=\"$user\"}))", @@ -46156,7 +47515,7 @@ data: }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, "span": 6, @@ -46473,6 +47832,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 2, "legend": { @@ -46501,43 +47920,9 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD 
HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit,\n sum by (user) (\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n )\n)\n", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) (cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\n", "format": "table", "instant": true, "legendFormat": "", @@ -46599,6 +47984,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + 
"value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 3, "legend": { @@ -46627,43 +48072,9 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit, sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} )\n)\n)", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)", "format": "table", "instant": true, "legendFormat": "", @@ -46758,7 +48169,7 @@ data: "span": 12, "targets": [ { - "expr": "sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} )\n)\n\nand\ntopk($limit, sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end())\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end())\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} @ end())\n)\n - sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start())\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start())\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} @ start())\n)\n)\n", + "expr": "(( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\nand\ntopk($limit,\n (\n ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) 
(cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n\n )\n -\n (\n ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n\n )\n)\n", "format": "time_series", "legendFormat": "{{ user }}", "legendLink": null @@ -46785,6 +48196,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + 
"properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "samples/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 5, "legend": { @@ -46813,40 +48284,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "samples/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_distributor_received_samples_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -46971,6 +48408,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "samples/s" + }, + { + "id": 
"decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 7, "legend": { @@ -46999,40 +48496,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "samples/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_discarded_samples_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*|mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -47157,6 +48620,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, 
+ { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 9, "legend": { @@ -47185,43 +48708,9 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit,\n sum by (user) (\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n )\n)\n", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) (cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n 
)\n )\n)\n)\n", "format": "table", "instant": true, "legendFormat": "", @@ -47283,6 +48772,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplars/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 10, "legend": { @@ -47311,40 +48860,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "exemplars/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_distributor_received_exemplars_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -47409,6 +48924,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + 
"properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "rules" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 11, "legend": { @@ -47437,40 +49012,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "rules", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", @@ -47535,6 +49076,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "seconds" + }, + { + "id": "decimals", + "value": 2 + 
}, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 12, "legend": { @@ -47563,40 +49164,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "seconds", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_last_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", @@ -47661,6 +49228,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Compaction Jobs" + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + 
} + ] + } + ] + }, "fill": 1, "id": 13, "legend": { @@ -47689,40 +49316,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Compaction Jobs", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit,\n sum by (user) (cortex_bucket_index_estimated_compaction_jobs{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|compactor.*|cortex|mimir-backend.*))\"})\n and ignoring(user)\n (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|compactor.*|cortex|mimir-backend.*))\"}[$__rate_interval])) == 0)\n)\n", @@ -48115,6 +49708,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -48173,7 +49767,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -48343,6 +49937,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -48401,7 +49996,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -48571,6 +50166,7 
@@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -48629,7 +50225,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -49262,7 +50858,7 @@ data: "showLegend": true }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "desc" } }, @@ -49821,7 +51417,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|ingester|mimir-write)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|ingester|mimir-write)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|ingester|mimir-write)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -50170,7 +51766,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "### In-memory series\nThe number of series not yet flushed to object storage that are held in ingester memory.\n\n", + "description": "### In-memory series\nThe number of series not yet flushed to object storage that are held in ingester memory.\nWith classic storage we the sum of series from all ingesters is divided by the replication factor.\nWith ingest storage we take the maximum series of each ingest partition.\n\n", "fill": 1, "format": "short", "id": 4, @@ 
-50198,7 +51794,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n/ on(cluster, namespace) group_left\nmax by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}))\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "instant": true, "refId": "A" @@ -50246,7 +51842,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "### Exemplars in ingesters\nNumber of TSDB exemplars currently in ingesters' storage.\n\n", + "description": "### Exemplars in ingesters\nNumber of TSDB exemplars currently in ingesters' storage.\nWith classic storage we the sum of exemplars from all ingesters is divided by the replication factor.\nWith ingest storage we take the maximum exemplars of each ingest partition.\n\n", "fill": 1, "format": "short", "id": 5, @@ -50274,7 +51870,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n/ on(cluster, namespace) group_left\nmax by (cluster, namespace) 
(cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}))\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "instant": true, "refId": "A" @@ -52015,7 +53611,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - shipper", + "title": "Ingester – shipper", "titleSize": "h6" }, { @@ -52193,7 +53789,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - TSDB head", + "title": "Ingester – TSDB head", "titleSize": "h6" }, { @@ -52513,7 +54109,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - TSDB write ahead log (WAL)", + "title": "Ingester – TSDB write ahead log (WAL)", "titleSize": "h6" }, { @@ -52620,7 +54216,7 @@ data: }, { "datasource": "$datasource", - "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is sent to the replication factor number of ingesters, so the sum of rates from all ingesters is divided by the replication factor.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", + "description": "### Ingester ingested exemplars rate\nThe 
rate of exemplars ingested in the ingesters.\nEvery exemplar is replicated to a number of ingesters. With classic storage we the sum of rates from all ingesters is divided by the replication factor.\nWith ingest storage we take the maximum rate of each ingest partition.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -52658,7 +54254,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "ingested exemplars", "legendLink": null @@ -52707,7 +54303,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) 
(cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "appended exemplars", "legendLink": null @@ -62377,10 +63973,14 @@ spec: The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors expr: | - 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m])) + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - > 1 + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 for: 15m labels: severity: critical @@ -62396,18 +63996,6 @@ spec: for: 15m labels: severity: warning - - alert: MimirQueriesIncorrect - annotations: - message: | - The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect - expr: | - 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) - / - sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 - for: 15m - labels: - severity: warning - alert: MimirInconsistentRuntimeConfig annotations: message: | @@ -62523,11 +64111,24 @@ spec: expr: | (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load + # Only if there are more timeseries than would be expected due to continuous testing load ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) + ( # Classic storage timeseries + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) + or + ( # Ingest storage timeseries + sum by(cluster, namespace) ( + max by(ingester_id, cluster, namespace) ( + label_replace(cortex_ingester_memory_series, + "ingester_id", "$1", + "pod", ".*-([0-9]+)$" + ) + ) + ) + ) ) > 100000 for: 1h labels: @@ -62565,9 +64166,9 @@ spec: severity: warning - alert: MimirStoreGatewayTooManyFailedOperations annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors - while doing {{ $labels.operation }} on the object storage. + message: Mimir store-gateway in {{ $labels.cluster }}/{{ $labels.namespace + }} is experiencing {{ $value | humanizePercentage }} errors while doing + {{ $labels.operation }} on the object storage. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations expr: | sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 @@ -63095,8 +64696,8 @@ spec: severity: warning - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got + a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -63109,8 +64710,8 @@ spec: severity: critical - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got + a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -63161,7 +64762,7 @@ spec: }}. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts @@ -63311,6 +64912,126 @@ spec: for: 1h labels: severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "starting" phase is not reducing consumption lag of write requests + read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (2 * 60) + for: 3m + labels: + severity: critical + threshold: very_high_for_short_period + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > 30 + for: 15m + labels: + severity: critical + threshold: relatively_high_for_long_period + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterStuckProcessingRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is stuck processing write requests from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterstuckprocessingrecordsfromkafka + expr: | + # Alert if the reader is not processing any records, but there buffered records to process in the Kafka client. + (sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_total[5m])) == 0) + and + # NOTE: the cortex_ingest_storage_reader_buffered_fetch_records_total metric is a gauge showing the current number of buffered records. + (sum by (cluster, namespace, pod) (cortex_ingest_storage_reader_buffered_fetch_records_total) > 0) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/monitoring-mixins/mimir-mixin/deploy.libsonnet b/monitoring-mixins/mimir-mixin/deploy.libsonnet index 3e7c04d0..73862e78 100644 --- a/monitoring-mixins/mimir-mixin/deploy.libsonnet +++ b/monitoring-mixins/mimir-mixin/deploy.libsonnet @@ -71,6 +71,7 @@ write: '(.*mimir-)?(mimir|distributor|ingester|mimir-write).*', read: '(.*mimir-)?(mimir|mimir-read|query-frontend|querier|ruler-query-frontend|ruler-querier).*', backend: '(.*mimir-)?(mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter).*', + remote_ruler_read: '(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*', }, }, } diff --git 
a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-alertmanager-resources.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-alertmanager-resources.json index fd9e13e6..72add59b 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-alertmanager-resources.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-alertmanager-resources.json @@ -574,7 +574,7 @@ "span": 12, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|alertmanager)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|alertmanager)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|alertmanager)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-compactor-resources.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-compactor-resources.json index 0dc61f17..b14a2465 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-compactor-resources.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-compactor-resources.json @@ -685,7 +685,7 @@ "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n 
kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|compactor|mimir-backend)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|compactor|mimir-backend)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|compactor|mimir-backend)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-networking.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-networking.json index 8bf226f0..1f890345 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-networking.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-networking.json @@ -189,6 +189,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -247,7 +248,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -417,6 +418,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -475,7 +477,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -645,6 +647,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -703,7 +706,7 @@ 
"legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-resources.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-resources.json index 1510fbf7..05eb1827 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-resources.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview-resources.json @@ -328,7 +328,7 @@ "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|mimir-write|distributor|ingester)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-write|distributor|ingester)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-write|distributor|ingester)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -799,7 +799,7 @@ "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"\n 
}\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json index fc0f4f78..81474ea0 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -91,7 +91,27 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": 
"(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -348,7 +368,13 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -397,22 +423,40 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" + }, + { + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})\n < ($latency_metrics * +Inf)", + 
"format": "time_series", + "legendFormat": "Average", + "refId": "C_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_native" } ], "title": "Write latency", @@ -670,7 +714,13 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", 
\"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -719,22 +769,40 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" + }, + { + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Read latency", @@ -763,7 +831,217 @@ }, "unit": "reqps" }, - "overrides": [ ] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "instant queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#429D48", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_range($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "range queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#F1C731", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_labels($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label names\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#2A66CF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_label_name_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label values\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#9E44C1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "series queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#FFAB57", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_read($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "remote read queries" + }, + { + "id": 
"color", + "value": { + "fixedColor": "#C79424", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_metadata($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "metadata queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#84D586", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_exemplars($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplar queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#A1C4FC", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_active_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"active series\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C788DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_names($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label name cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#3F6833", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label value cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#447EBC", + "mode": "fixed" + } + } + ] + } + ] }, "id": 11, "links": [ ], @@ -776,114 +1054,26 @@ "sort": "none" } }, - "seriesOverrides": [ - { - "alias": "instant queries", - "color": "#429D48" - }, - { - "alias": "range queries", - "color": "#F1C731" - }, - { - "alias": "\"label names\" queries", - "color": "#2A66CF" - }, - { - "alias": "\"label values\" queries", - "color": "#9E44C1" - }, - { - "alias": "series queries", - "color": "#FFAB57" - }, - { - "alias": "remote read queries", - "color": "#C79424" - }, - { - "alias": 
"metadata queries", - "color": "#84D586" - }, - { - "alias": "exemplar queries", - "color": "#A1C4FC" - }, - { - "alias": "\"active series\" queries", - "color": "#C788DE" - } - ], "span": 3, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "instant queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", + "expr": "sum by (route) (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval])) < ($latency_metrics * +Inf)", "format": "time_series", - "legendFormat": "range queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", + "expr": "sum by (route) (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", - "legendFormat": "\"label names\" queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval])) < ($latency_metrics * +Inf)", "format": "time_series", - "legendFormat": "\"label values\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "series queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_read\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "remote read queries", - "legendLink": null - }, - { - "expr": 
"sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_metadata\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "metadata queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_exemplars\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "exemplar queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_active_series\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"active series\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_names\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label name cardinality\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_values\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label value cardinality\" queries", + "legendFormat": "other", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)\"}[$__rate_interval]))", + "expr": "sum 
(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "other", "legendLink": null @@ -1450,6 +1640,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-queries.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-queries.json index 91350623..b1d5461b 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-queries.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-queries.json @@ -642,7 +642,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend - query splitting and results cache", + "title": "Query-frontend – query splitting and results cache", "titleSize": "h6" }, { @@ -784,7 +784,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend - query sharding", + "title": "Query-frontend – query 
sharding", "titleSize": "h6" }, { @@ -1696,8 +1696,8 @@ "showLegend": true }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, "span": 4, @@ -1744,8 +1744,8 @@ "showLegend": true }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, "span": 4, diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-networking.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-networking.json index 2f1499b7..41118c23 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-networking.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-networking.json @@ -189,6 +189,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -247,7 +248,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -417,6 +418,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -475,7 +477,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -645,6 +647,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -703,7 +706,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -873,6 +876,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP 
connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -931,7 +935,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -1101,6 +1105,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -1159,7 +1164,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -1329,6 +1334,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -1387,7 +1393,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-resources.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-resources.json index 6c9001dc..48994182 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-resources.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads-resources.json @@ -2326,7 +2326,7 @@ "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|store-gateway|mimir-backend)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", 
persistentvolumeclaim=~\".*((mimir|store-gateway|mimir-backend)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|store-gateway|mimir-backend)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json index 6eb15943..53a86d20 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json @@ -1852,7 +1852,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1901,19 +1901,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1963,7 +1963,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null diff --git 
a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-networking.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-networking.json new file mode 100644 index 00000000..310de3c3 --- /dev/null +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-networking.json @@ -0,0 +1,1052 @@ +{ + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "8.0.0" + } + ], + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "mimir" + ], + "targetBlank": false, + "title": "Mimir dashboards", + "type": "dashboards" + } + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 1, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + 
"fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 2, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 3, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "highest", + 
"legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 4, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Summary", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": 
"$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 5, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 6, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": 
"never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 7, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 8, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", 
namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-query-frontend", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 9, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 10, + 
"links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 11, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": 
false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 12, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-query-scheduler", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 13, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) 
(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 14, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 15, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", + "format": 
"time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 16, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": 
true, + "title": "Ruler-querier", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "mimir" + ], + "templating": { + "list": [ + { + "current": { + "text": "Metrics", + "value": "Metrics" + }, + "hide": 0, + "label": "Data source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(cortex_build_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(cortex_build_info{cluster=~\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Mimir / Remote ruler reads networking", + "uid": "9e8cfff65f91632f8a25981c6fe44bc9", + "version": 0 + } \ No newline at end of file diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-resources.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-resources.json index b7faedd9..7aac9d0a 100644 --- 
a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-resources.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads-resources.json @@ -309,7 +309,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend (dedicated to ruler)", + "title": "Ruler-query-frontend", "titleSize": "h6" }, { @@ -591,7 +591,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler (dedicated to ruler)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -873,7 +873,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier (dedicated to ruler)", + "title": "Ruler-querier", "titleSize": "h6" } ], diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads.json index c997b964..dd2b95f7 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-remote-ruler-reads.json @@ -445,7 +445,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend (dedicated to ruler)", + "title": "Ruler-query-frontend", "titleSize": "h6" }, { @@ -773,7 +773,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler (dedicated to ruler)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -935,7 +935,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions", + "title": "Ruler-query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions", "titleSize": "h6" }, { @@ -1242,7 +1242,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier (dedicated to ruler)", + "title": "Ruler-querier", 
"titleSize": "h6" } ], diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-rollout-progress.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-rollout-progress.json index 09611ad1..bd74b313 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-rollout-progress.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-rollout-progress.json @@ -122,7 +122,7 @@ "stacking": "none", "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" }, "xField": "Workload", "xTickLabelRotation": 0, diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-ruler.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-ruler.json index 86ed8f83..252333c3 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-ruler.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-ruler.json @@ -1648,7 +1648,7 @@ "span": 4, "targets": [ { - "expr": "sum by(user) (rate(cortex_prometheus_notifications_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}[$__rate_interval]))\n /\nsum by(user) (rate(cortex_prometheus_notifications_queue_capacity{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}[$__rate_interval])) > 0\n", + "expr": "sum by(user) (cortex_prometheus_notifications_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"})\n /\nsum by(user) (cortex_prometheus_notifications_queue_capacity{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}) > 0\n", "format": "time_series", "legendFormat": "{{ user }}", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-scaling.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-scaling.json index 56c00d0c..019c0f98 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-scaling.json +++ 
b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-scaling.json @@ -62,6 +62,146 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Required Replicas" + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "deployment" + }, + "properties": [ + { + "id": "displayName", + "value": "Service" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "namespace" + }, + "properties": [ + { + "id": "displayName", + "value": "Namespace" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "reason" + }, + "properties": [ + { + "id": "displayName", + "value": "Reason" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 2, "legend": { @@ -90,115 +230,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" 
- }, - { - "alias": "Required Replicas", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Cluster", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "__name__", - "thresholds": [ ], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Cluster", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "cluster", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Service", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "deployment", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "namespace", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Reason", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "reason", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", 
- "unit": "short" - } - ], "targets": [ { "expr": "sort_desc(\n cluster_namespace_deployment_reason:required_replicas:count{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n > ignoring(reason) group_left\n cluster_namespace_deployment:actual_replicas:count{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\n", diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json index 2f20031e..e86b6f6e 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json @@ -73,13 +73,13 @@ "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap 
duration_seconds(response_time) [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -127,13 +127,13 @@ "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -181,13 +181,13 @@ "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt 
| user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -235,13 +235,13 @@ "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query 
stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -289,13 +289,13 @@ "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -344,13 +344,13 @@ "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", + 
"expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -364,7 +364,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Accross tenants", + "title": "Across tenants", "titleSize": "h6" }, { @@ -410,7 +410,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -458,7 +458,7 @@ "span": 
2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -506,7 +506,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -554,7 +554,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | 
user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -602,7 +602,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -651,7 +651,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -711,7 +711,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | 
response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -759,7 +759,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -807,7 +807,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": 
"{{user_agent}}", "legendLink": null @@ -855,7 +855,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -903,7 +903,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -952,7 +952,7 @@ "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, 
{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -1140,7 +1140,7 @@ "span": 12, "targets": [ { - "expr": "{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | label_format response_time_seconds=\"{{ if .response_time }} {{ duration .response_time }} {{ end }}\",param_step_seconds=\"{{ if .param_step }} {{ div .param_step 1000 }} {{ end }}\",length_seconds=\"{{ if .length }} {{ duration .length }} {{ end }}\"", + "expr": "{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | label_format response_time_seconds=\"{{ if .response_time }} {{ duration .response_time }} {{ end }}\",param_step_seconds=\"{{ if .param_step }} {{ div .param_step 1000 }} {{ end }}\",length_seconds=\"{{ if .length }} {{ duration .length }} {{ end }}\"", "instant": false, "legendFormat": "", "range": true, @@ -1424,6 +1424,30 @@ ], "query": ".*", "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "query-frontend", + "value": "query-frontend" + }, + "label": "Component", + "multi": false, + "name": "component", + "options": [ + { + "selected": true, + "text": "query-frontend", + "value": "query-frontend" + }, + { + "selected": false, + "text": "ruler-query-frontend", + "value": "ruler-query-frontend" + } + ], + "query": "query-frontend, ruler-query-frontend", + "type": "custom" } ] }, diff --git 
a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-tenants.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-tenants.json index c071774f..108e6e79 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-tenants.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-tenants.json @@ -116,7 +116,7 @@ "span": 4, "targets": [ { - "expr": "sum(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", 
"format": "time_series", "legendFormat": "in-memory", "legendLink": null @@ -128,19 +128,19 @@ "legendLink": null }, { - "expr": "sum(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active", "legendLink": null }, { - "expr": "sum(\n cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "owned", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active ({{ name }})", "legendLink": null @@ -355,7 +355,7 @@ "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max 
by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "series", "legendLink": null @@ -472,13 +472,13 @@ "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n 
label_replace(\n cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active ({{ name }})", "legendLink": null @@ -546,13 +546,13 @@ "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "buckets", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "buckets ({{ name }})", "legendLink": null @@ -1238,7 +1238,7 @@ "span": 3, "targets": [ { - "expr": "sum(\n rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval])\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval]))\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval]),\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "rate", "legendLink": null @@ -1606,6 +1606,46 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + 
"id": "displayName", + "value": "rules" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 27, "legend": { @@ -1634,40 +1674,6 @@ "span": 6, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "rules", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\", user=\"$user\"}))", @@ -1720,6 +1726,46 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "seconds" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 28, "legend": { @@ -1748,40 +1794,6 @@ "span": 6, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "seconds", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - 
"linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group) (cortex_prometheus_rule_group_last_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\", user=\"$user\"}))", @@ -2419,7 +2431,7 @@ }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, "span": 6, diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json index 27df82a0..5e0879ab 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json @@ -63,6 +63,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 2, "legend": { @@ -91,43 +151,9 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - 
"pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit,\n sum by (user) (\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n )\n)\n", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) (cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\n", "format": "table", "instant": true, "legendFormat": "", @@ -189,6 +215,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", 
+ "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 3, "legend": { @@ -217,43 +303,9 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit, sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} )\n)\n)", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)", "format": "table", "instant": true, "legendFormat": "", @@ -348,7 +400,7 @@ "span": 12, "targets": [ { - "expr": "sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} )\n)\n\nand\ntopk($limit, sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end())\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end())\n )\n / 
on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} @ end())\n)\n - sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start())\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start())\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} @ start())\n)\n)\n", + "expr": "(( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\nand\ntopk($limit,\n (\n ( # Classic storage\n sum by (cluster, namespace, user) ((\n 
cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n\n )\n -\n (\n ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ 
start()\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n\n )\n)\n", "format": "time_series", "legendFormat": "{{ user }}", "legendLink": null @@ -375,6 +427,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "samples/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 5, "legend": { @@ -403,40 +515,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "samples/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_distributor_received_samples_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -561,6 +639,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + 
"matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "samples/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 7, "legend": { @@ -589,40 +727,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "samples/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_discarded_samples_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*|mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -747,6 +851,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + 
"id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 9, "legend": { @@ -775,43 +939,9 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit,\n sum by (user) (\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n )\n)\n", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) (cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by 
(cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\n", "format": "table", "instant": true, "legendFormat": "", @@ -873,6 +1003,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplars/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 10, "legend": { @@ -901,40 +1091,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "exemplars/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) 
(rate(cortex_distributor_received_exemplars_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -999,6 +1155,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "rules" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 11, "legend": { @@ -1027,40 +1243,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "rules", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", @@ -1125,6 +1307,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + 
"options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "seconds" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 12, "legend": { @@ -1153,40 +1395,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "seconds", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_last_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", @@ -1251,6 +1459,66 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Compaction Jobs" + }, + 
{ + "id": "decimals", + "value": 0 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 13, "legend": { @@ -1279,40 +1547,6 @@ "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Compaction Jobs", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit,\n sum by (user) (cortex_bucket_index_estimated_compaction_jobs{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|compactor.*|cortex|mimir-backend.*))\"})\n and ignoring(user)\n (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|compactor.*|cortex|mimir-backend.*))\"}[$__rate_interval])) == 0)\n)\n", diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-networking.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-networking.json index 5a16fc11..17f9cc23 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-networking.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-networking.json @@ -189,6 +189,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP 
connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -247,7 +248,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -417,6 +418,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -475,7 +477,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -645,6 +647,7 @@ }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -703,7 +706,7 @@ "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-resources.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-resources.json index ec6bd6f9..dc5fee60 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-resources.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes-resources.json @@ -504,7 +504,7 @@ "showLegend": true }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "desc" } }, @@ -1063,7 +1063,7 @@ "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|ingester|mimir-write)).*\"\n }\n)\n", + "expr": "max 
by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|ingester|mimir-write)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|ingester|mimir-write)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes.json index 432e52d0..27a25413 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-writes.json @@ -214,7 +214,7 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "### In-memory series\nThe number of series not yet flushed to object storage that are held in ingester memory.\n\n", + "description": "### In-memory series\nThe number of series not yet flushed to object storage that are held in ingester memory.\nWith classic storage, the sum of series from all ingesters is divided by the replication factor.\nWith ingest storage we take the maximum series of each ingest partition.\n\n", "fill": 1, "format": "short", "id": 4, @@ -242,7 +242,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n/ on(cluster, namespace) group_left\nmax by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}))\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace)
(cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "instant": true, "refId": "A" @@ -290,7 +290,7 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "### Exemplars in ingesters\nNumber of TSDB exemplars currently in ingesters' storage.\n\n", + "description": "### Exemplars in ingesters\nNumber of TSDB exemplars currently in ingesters' storage.\nWith classic storage, the sum of exemplars from all ingesters is divided by the replication factor.\nWith ingest storage we take the maximum exemplars of each ingest partition.\n\n", "fill": 1, "format": "short", "id": 5, @@ -318,7 +318,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n/ on(cluster, namespace) group_left\nmax by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}))\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n
cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "instant": true, "refId": "A" @@ -2059,7 +2059,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - shipper", + "title": "Ingester – shipper", "titleSize": "h6" }, { @@ -2237,7 +2237,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - TSDB head", + "title": "Ingester – TSDB head", "titleSize": "h6" }, { @@ -2557,7 +2557,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - TSDB write ahead log (WAL)", + "title": "Ingester – TSDB write ahead log (WAL)", "titleSize": "h6" }, { @@ -2664,7 +2664,7 @@ }, { "datasource": "$datasource", - "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is sent to the replication factor number of ingesters, so the sum of rates from all ingesters is divided by the replication factor.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", + "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is replicated to a number of ingesters. 
With classic storage, the sum of rates from all ingesters is divided by the replication factor.\nWith ingest storage we take the maximum rate of each ingest partition.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -2702,7 +2702,7 @@ "span": 3, "targets": [ { - "expr": "sum(\n cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "ingested exemplars", "legendLink": null @@ -2751,7 +2751,7 @@ "span": 3, "targets": [ { - "expr": "sum(\n cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\",
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "appended exemplars", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/kustomization.yaml b/monitoring-mixins/mimir-mixin/deploy/kustomization.yaml index 0ee6c341..d6d830a4 100644 --- a/monitoring-mixins/mimir-mixin/deploy/kustomization.yaml +++ b/monitoring-mixins/mimir-mixin/deploy/kustomization.yaml @@ -59,6 +59,9 @@ configMapGenerator: - name: mimir-reads.json files: - dashboards_out/mimir-reads.json +- name: mimir-remote-ruler-reads-networking.json + files: + - dashboards_out/mimir-remote-ruler-reads-networking.json - name: mimir-remote-ruler-reads-resources.json files: - dashboards_out/mimir-remote-ruler-reads-resources.json diff --git a/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml index 6eb450f9..823e4ef5 100644 --- a/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml +++ b/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -577,7 +577,7 @@ data: "span": 12, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n 
kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|alertmanager)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|alertmanager)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|alertmanager)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -3853,7 +3853,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|compactor|mimir-backend)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|compactor|mimir-backend)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|compactor|mimir-backend)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -7734,6 +7734,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -7792,7 +7793,7 @@ data: "legendLink": null } ], - "title": "TCP 
connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -7962,6 +7963,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -8020,7 +8022,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -8190,6 +8192,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -8248,7 +8251,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -8705,7 +8708,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|mimir-write|distributor|ingester)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-write|distributor|ingester)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-write|distributor|ingester)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -9176,7 +9179,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", 
namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|mimir-backend|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -9392,7 +9395,7 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -9402,7 +9405,27 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + 
"expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -9659,7 +9682,13 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -9708,22 +9737,40 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * 
sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Write latency", @@ -9981,7 +10028,13 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * +Inf)", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A_classic" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -10030,22 +10083,40 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "99th percentile", - "refId": "A" + "refId": "A_classic" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "99th percentile", + "refId": "A_native" + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "50th percentile", - "refId": "B" + "refId": "B_classic" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})", + "expr": "histogram_quantile(0.50, sum (cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) * 1e3 < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "50th percentile", + "refId": "B_native" + }, + { + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}) /\nsum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})\n < ($latency_metrics * +Inf)", "format": "time_series", "legendFormat": "Average", - "refId": "C" + "refId": "C_classic" + }, + { + "expr": "1e3 * 
sum(histogram_sum(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"})) /\nsum(histogram_count(cluster_job_route:cortex_request_duration_seconds:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}))\n < ($latency_metrics * -Inf)", + "format": "time_series", + "legendFormat": "Average", + "refId": "C_native" } ], "title": "Read latency", @@ -10074,7 +10145,217 @@ data: }, "unit": "reqps" }, - "overrides": [ ] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "instant queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#429D48", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_range($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "range queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#F1C731", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_labels($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label names\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#2A66CF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_label_name_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label values\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#9E44C1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "series queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#FFAB57", + "mode": "fixed" + } + } + ] + 
}, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_read($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "remote read queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C79424", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_metadata($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "metadata queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#84D586", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_exemplars($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplar queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#A1C4FC", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_active_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"active series\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C788DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_names($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label name cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#3F6833", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label value cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#447EBC", + "mode": "fixed" + } + } + ] + } + ] }, "id": 11, "links": [ ], @@ -10087,114 +10368,26 @@ data: "sort": "none" } }, - "seriesOverrides": [ - { - "alias": "instant queries", - "color": "#429D48" - }, - { - "alias": "range queries", - "color": "#F1C731" - }, - { - "alias": "\"label names\" queries", - "color": "#2A66CF" - }, - { - "alias": 
"\"label values\" queries", - "color": "#9E44C1" - }, - { - "alias": "series queries", - "color": "#FFAB57" - }, - { - "alias": "remote read queries", - "color": "#C79424" - }, - { - "alias": "metadata queries", - "color": "#84D586" - }, - { - "alias": "exemplar queries", - "color": "#A1C4FC" - }, - { - "alias": "\"active series\" queries", - "color": "#C788DE" - } - ], "span": 3, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))", + "expr": "sum by (route) (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval])) < ($latency_metrics * +Inf)", "format": "time_series", - "legendFormat": "instant queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "range queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", + "expr": "sum by (route) (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", - "legendFormat": "\"label names\" queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval])) < ($latency_metrics * +Inf)", "format": "time_series", - "legendFormat": "\"label values\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "series queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_read\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "remote read queries", - "legendLink": null - }, - { - "expr": 
"sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_metadata\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "metadata queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_exemplars\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "exemplar queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_active_series\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"active series\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_names\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label name cardinality\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_values\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label value cardinality\" queries", + "legendFormat": "other", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)\"}[$__rate_interval]))", + "expr": "sum 
(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "legendFormat": "other", "legendLink": null @@ -10761,6 +10954,35 @@ data: "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, @@ -11454,7 +11676,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend - query splitting and results cache", + "title": "Query-frontend – query splitting and results cache", "titleSize": "h6" }, { @@ -11596,7 +11818,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend - query sharding", + "title": "Query-frontend – query sharding", "titleSize": "h6" }, { @@ -12508,8 +12730,8 @@ data: "showLegend": true }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, "span": 4, @@ -12556,8 +12778,8 @@ data: "showLegend": true }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": 
"desc" } }, "span": 4, @@ -13548,6 +13770,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -13606,7 +13829,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -13776,6 +13999,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -13834,7 +14058,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -14004,6 +14228,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -14062,7 +14287,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -14232,6 +14457,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -14290,7 +14516,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -14460,6 +14686,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -14518,7 +14745,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections 
(per pod)", "type": "timeseries" } ], @@ -14688,6 +14915,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -14746,7 +14974,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -17201,7 +17429,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|store-gateway|mimir-backend)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|store-gateway|mimir-backend)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|store-gateway|mimir-backend)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -19188,7 +19416,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -19237,19 +19465,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -19299,7 +19527,7 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null @@ -21191,11 +21419,1198 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "reqps" + "unit": "reqps" + }, + "overrides": [ ] + }, + "id": 49, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{operation}}", + "legendLink": null + } + ], + "title": "Operations / sec", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "max": 1, + "min": 0, + "noValue": "0", + "unit": "percentunit" + } + }, + "id": 50, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) >= 0", 
+ "format": "time_series", + "legendFormat": "{{operation}}", + "legendLink": null + } + ], + "title": "Error rate", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 51, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Attributes", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + 
"min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 52, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Exists", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + 
"show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Blocks object store (querier accesses)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 53, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval]))", + "format": "time_series", + 
"legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: Get", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 54, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval]))", + "format": "time_series", + 
"legendFormat": "Average", + "refId": "C" + } + ], + "title": "Latency of op: GetRange", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 55, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": 
"Average", + "refId": "C" + } + ], + "title": "Latency of op: Upload", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 56, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "format": "time_series", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "Average", + 
"refId": "C" + } + ], + "title": "Latency of op: Delete", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "mimir" + ], + "templating": { + "list": [ + { + "current": { + "text": "Metrics", + "value": "Metrics" + }, + "hide": 0, + "label": "Data source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(cortex_build_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "namespace", + "multi": true, + "name": "namespace", + "options": [ ], + "query": "label_values(cortex_build_info{cluster=~\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + 
"timezone": "utc", + "title": "Mimir / Reads", + "uid": "e327503188913dc38ad571c647eef643", + "version": 0 + } +kind: ConfigMap +metadata: + annotations: + grafana_dashboard_folder: /dashboards/Mimir + labels: + grafana_dashboard: "1" + name: mimir-reads.json + namespace: monitoring-system +--- +apiVersion: v1 +data: + mimir-remote-ruler-reads-networking.json: |- + { + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "8.0.0" + } + ], + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "mimir" + ], + "targetBlank": false, + "title": "Mimir dashboards", + "type": "dashboards" + } + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 1, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 
100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 2, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 3, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + 
], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 4, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|mimir-backend|ruler_query_frontend|ruler-query-scheduler|ruler|ruler_querier).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Summary", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + 
"fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 5, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "id": 6, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Transmit bandwidth", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 7, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + } + ], + "title": "Inflight requests (per pod)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", + "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "id": 8, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 3, + "targets": [ + { + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}))", + "format": "time_series", + "legendFormat": "avg", + "legendLink": null + }, + { + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", 
namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"}))", + "format": "time_series", + "legendFormat": "highest", + "legendLink": null + }, + { + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-frontend).*\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler-query-frontend", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" }, "overrides": [ ] }, - "id": 49, + "id": 9, "links": [ ], "options": { "legend": { @@ -21209,26 +22624,41 @@ data: "span": 3, "targets": [ { - "expr": "sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "{{operation}}", + "legendFormat": "{{pod}}", "legendLink": null } ], - "title": "Operations / sec", + "title": "Receive bandwidth", "type": "timeseries" }, { "datasource": "$datasource", "fieldConfig": { "defaults": { - "max": 1, + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", 
+ "mode": "normal" + } + }, "min": 0, - "noValue": "0", - "unit": "percentunit" - } + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] }, - "id": 50, + "id": 10, "links": [ ], "options": { "legend": { @@ -21242,18 +22672,21 @@ data: "span": 3, "targets": [ { - "expr": "sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\"}[$__rate_interval])) >= 0", + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "{{operation}}", + "legendFormat": "{{pod}}", "legendLink": null } ], - "title": "Error rate", + "title": "Transmit bandwidth", "type": "timeseries" }, { "datasource": "$datasource", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -21272,13 +22705,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 51, + "id": 11, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -21291,48 +22723,28 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, 
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"attributes\"}[$__rate_interval]))", + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "highest", + "legendLink": null } ], - "title": "Latency of op: Attributes", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Inflight requests (per pod)", + "type": "timeseries" }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -21351,13 +22763,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 52, + "id": 12, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -21370,51 +22781,33 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, 
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}))", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"}))", "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "highest", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"exists\"}[$__rate_interval]))", + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-query-scheduler).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "limit", + "legendLink": null } ], - "title": "Latency of op: Exists", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Ingress TCP connections (per pod)", + "type": 
"timeseries" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Blocks object store (querier accesses)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -21427,14 +22820,14 @@ data: "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 1, - "lineWidth": 1, + "fillOpacity": 100, + "lineWidth": 0, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" } }, "min": 0, @@ -21442,13 +22835,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "Bps" }, "overrides": [ ] }, - "id": 53, + "id": 13, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -21461,44 +22853,14 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "sum by(pod) (rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", 
namespace=~\"$namespace\",component=\"querier\",operation=\"get\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "{{pod}}", + "legendLink": null } ], - "title": "Latency of op: Get", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Receive bandwidth", + "type": "timeseries" }, { "datasource": "$datasource", @@ -21506,14 +22868,14 @@ data: "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 1, - "lineWidth": 1, + "fillOpacity": 100, + "lineWidth": 0, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" } }, "min": 0, @@ -21521,13 +22883,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "Bps" }, "overrides": [ ] }, - "id": 54, + "id": 14, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -21540,48 +22901,21 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "sum by(pod) (rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "50th Percentile", - 
"refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"get_range\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "{{pod}}", + "legendLink": null } ], - "title": "Latency of op: GetRange", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Transmit bandwidth", + "type": "timeseries" }, { "datasource": "$datasource", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -21600,13 +22934,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 55, + "id": 15, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -21619,48 +22952,28 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) by 
(le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"upload\"}[$__rate_interval]))", + "expr": "max(cortex_inflight_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "highest", + "legendLink": null } ], - "title": "Latency of op: Upload", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Inflight requests (per pod)", + "type": "timeseries" }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { + "custom": { + "fillOpacity": 0 + }, "defaults": { "custom": { "drawStyle": "line", @@ -21679,13 +22992,12 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 56, + "id": 16, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -21698,51 +23010,33 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "avg(sum by(pod) 
(cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}))", "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" + "legendFormat": "avg", + "legendLink": null }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "max(sum by(pod) (cortex_tcp_connections{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"}))", "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" + "legendFormat": "highest", + "legendLink": null }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\",component=\"querier\",operation=\"delete\"}[$__rate_interval]))", + "expr": "min(cortex_tcp_connections_limit{cluster=~\"$cluster\", namespace=~\"$namespace\",pod=~\"(.*mimir-)?(mimir|ruler-querier|mimir-backend).*\"})", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "limit", + "legendLink": null } ], - "title": "Latency of op: Delete", - "type": "timeseries", - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "title": "Ingress TCP connections (per pod)", + "type": "timeseries" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "", + "title": "Ruler-querier", "titleSize": "h6" } ], @@ -21768,17 +23062,16 @@ data: "type": 
"datasource" }, { - "allValue": ".+", + "allValue": ".*", "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": "prod", + "value": "prod" }, "datasource": "$datasource", "hide": 0, "includeAll": true, "label": "cluster", - "multi": true, + "multi": false, "name": "cluster", "options": [ ], "query": "label_values(cortex_build_info, cluster)", @@ -21792,17 +23085,16 @@ data: "useTags": false }, { - "allValue": ".+", + "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": "prod", + "value": "prod" }, "datasource": "$datasource", "hide": 0, - "includeAll": true, + "includeAll": false, "label": "namespace", - "multi": true, + "multi": false, "name": "namespace", "options": [ ], "query": "label_values(cortex_build_info{cluster=~\"$cluster\"}, namespace)", @@ -21847,8 +23139,8 @@ data: ] }, "timezone": "utc", - "title": "Mimir / Reads", - "uid": "e327503188913dc38ad571c647eef643", + "title": "Mimir / Remote ruler reads networking", + "uid": "9e8cfff65f91632f8a25981c6fe44bc9", "version": 0 } kind: ConfigMap @@ -21857,7 +23149,7 @@ metadata: grafana_dashboard_folder: /dashboards/Mimir labels: grafana_dashboard: "1" - name: mimir-reads.json + name: mimir-remote-ruler-reads-networking.json namespace: monitoring-system --- apiVersion: v1 @@ -22174,7 +23466,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-frontend (dedicated to ruler)", + "title": "Ruler-query-frontend", "titleSize": "h6" }, { @@ -22456,7 +23748,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler (dedicated to ruler)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -22738,7 +24030,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier (dedicated to ruler)", + "title": "Ruler-querier", "titleSize": "h6" } ], @@ -23304,7 +24596,7 @@ data: "repeatIteration": null, "repeatRowId": null, 
"showTitle": true, - "title": "Query-frontend (dedicated to ruler)", + "title": "Ruler-query-frontend", "titleSize": "h6" }, { @@ -23632,7 +24924,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler (dedicated to ruler)", + "title": "Ruler-query-scheduler", "titleSize": "h6" }, { @@ -23794,7 +25086,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions", + "title": "Ruler-query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions", "titleSize": "h6" }, { @@ -24101,7 +25393,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier (dedicated to ruler)", + "title": "Ruler-querier", "titleSize": "h6" } ], @@ -24346,7 +25638,7 @@ data: "stacking": "none", "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" }, "xField": "Workload", "xTickLabelRotation": 0, @@ -27288,7 +28580,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by(user) (rate(cortex_prometheus_notifications_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}[$__rate_interval]))\n /\nsum by(user) (rate(cortex_prometheus_notifications_queue_capacity{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}[$__rate_interval])) > 0\n", + "expr": "sum by(user) (cortex_prometheus_notifications_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"})\n /\nsum by(user) (cortex_prometheus_notifications_queue_capacity{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}) > 0\n", "format": "time_series", "legendFormat": "{{ user }}", "legendLink": null @@ -28331,6 +29623,146 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + 
}, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Required Replicas" + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cluster" + }, + "properties": [ + { + "id": "displayName", + "value": "Cluster" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "deployment" + }, + "properties": [ + { + "id": "displayName", + "value": "Service" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "namespace" + }, + "properties": [ + { + "id": "displayName", + "value": "Namespace" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "reason" + }, + "properties": [ + { + "id": "displayName", + "value": "Reason" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 2, "legend": { @@ -28359,115 +29791,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Required Replicas", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": 
"Value", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Cluster", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "__name__", - "thresholds": [ ], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Cluster", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "cluster", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Service", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "deployment", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "namespace", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "Reason", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "reason", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "sort_desc(\n cluster_namespace_deployment_reason:required_replicas:count{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n > ignoring(reason) group_left\n 
cluster_namespace_deployment:actual_replicas:count{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\n", @@ -28715,13 +30038,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -28769,13 +30092,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", + "expr": "quantile_over_time(0.99, 
{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -28823,13 +30146,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap 
fetched_chunk_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -28877,13 +30200,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -28931,13 +30254,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | 
logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -28986,13 +30309,13 @@ data: "span": 2, "targets": [ { - "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", + "expr": "quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "quantile_over_time(0.5, 
{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", + "expr": "quantile_over_time(0.5, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -29006,7 +30329,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Accross tenants", + "title": "Across tenants", "titleSize": "h6" }, { @@ -29052,7 +30375,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -29100,7 +30423,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))", + "expr": "topk(10, 
quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -29148,7 +30471,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -29196,7 +30519,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -29244,7 +30567,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, 
{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -29293,7 +30616,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -29353,7 +30676,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" 
| user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -29401,7 +30724,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -29449,7 +30772,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -29497,7 +30820,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | 
user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -29545,7 +30868,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -29594,7 +30917,7 @@ data: "span": 2, "targets": [ { - "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))", + "expr": "topk(10, quantile_over_time(0.99, {cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | unwrap query_wall_time_seconds 
[$__auto]) by (user_agent))", "format": "time_series", "legendFormat": "{{user_agent}}", "legendLink": null @@ -29782,7 +31105,7 @@ data: "span": 12, "targets": [ { - "expr": "{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"query-frontend.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | label_format response_time_seconds=\"{{ if .response_time }} {{ duration .response_time }} {{ end }}\",param_step_seconds=\"{{ if .param_step }} {{ div .param_step 1000 }} {{ end }}\",length_seconds=\"{{ if .length }} {{ duration .length }} {{ end }}\"", + "expr": "{cluster=~\"$cluster\",namespace=~\"$namespace\",name=~\"$component.*\"} |= \"query stats\" != \"/api/v1/read\" | logfmt | user=~\"${tenant_id}\" | user_agent=~\"${user_agent}\" | response_time > ${min_duration} | label_format response_time_seconds=\"{{ if .response_time }} {{ duration .response_time }} {{ end }}\",param_step_seconds=\"{{ if .param_step }} {{ div .param_step 1000 }} {{ end }}\",length_seconds=\"{{ if .length }} {{ duration .length }} {{ end }}\"", "instant": false, "legendFormat": "", "range": true, @@ -30066,6 +31389,30 @@ data: ], "query": ".*", "type": "textbox" + }, + { + "current": { + "selected": true, + "text": "query-frontend", + "value": "query-frontend" + }, + "label": "Component", + "multi": false, + "name": "component", + "options": [ + { + "selected": true, + "text": "query-frontend", + "value": "query-frontend" + }, + { + "selected": false, + "text": "ruler-query-frontend", + "value": "ruler-query-frontend" + } + ], + "query": "query-frontend, ruler-query-frontend", + "type": "custom" } ] }, @@ -30233,7 +31580,7 @@ data: "span": 4, "targets": [ { - "expr": "sum(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n - cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "in-memory", "legendLink": null @@ -30245,19 +31592,19 @@ data: "legendLink": null }, { - "expr": "sum(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_series{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active", "legendLink": null }, { - "expr": "sum(\n cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_owned_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "owned", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active ({{ name }})", "legendLink": null @@ -30472,7 +31819,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "series", "legendLink": null @@ -30589,13 +31936,13 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_native_histogram_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, 
namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_native_histogram_series_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "active ({{ name }})", "legendLink": null @@ -30663,13 +32010,13 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) 
(\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_active_native_histogram_buckets{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "buckets", "legendLink": null }, { - "expr": "sum by (name) (\n cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n) > 0\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, name) (cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, name) (\n max by (ingester_id, cluster, namespace, name) (\n label_replace(\n cortex_ingester_active_native_histogram_buckets_custom_tracker{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "buckets ({{ name }})", "legendLink": null @@ -31355,7 +32702,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval])\n / on(cluster, namespace) group_left\n max by (cluster, namespace) 
(cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval]))\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", user=\"$user\"}[$__rate_interval]),\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "rate", "legendLink": null @@ -31723,6 +33070,46 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "rules" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 27, "legend": { @@ -31751,40 +33138,6 @@ data: "span": 6, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "rules", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - 
"linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\", user=\"$user\"}))", @@ -31837,6 +33190,46 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "seconds" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, "fill": 1, "id": 28, "legend": { @@ -31865,40 +33258,6 @@ data: "span": 6, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "seconds", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (rule_group) (cortex_prometheus_rule_group_last_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\", user=\"$user\"}))", @@ -32536,7 +33895,7 @@ data: }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, "span": 6, @@ -32853,6 +34212,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 2, "legend": { @@ -32881,43 +34300,9 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit,\n sum by (user) (\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) 
(cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n )\n)\n", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) (cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n cortex_ingester_active_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\n", "format": "table", "instant": true, "legendFormat": "", @@ -32979,6 +34364,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 3, "legend": { @@ -33007,43 +34452,9 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": 
[ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit, sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} )\n)\n)", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)", "format": "table", "instant": true, "legendFormat": "", @@ -33138,7 +34549,7 @@ data: "span": 12, "targets": [ { - "expr": "sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} )\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} )\n)\n\nand\ntopk($limit, sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end())\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end())\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} @ end())\n)\n - sum by (user) (\n (\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start())\n -\n sum by (user, cluster, namespace) (cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start())\n )\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"} @ start())\n)\n)\n", + "expr": "(( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} \n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\nand\ntopk($limit,\n (\n ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n -\n 
cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ end()\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n\n )\n -\n (\n ( # Classic storage\n sum by (cluster, namespace, user) ((\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n)\n)\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n (\n cortex_ingester_memory_series_created_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n -\n cortex_ingester_memory_series_removed_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"} @ start()\n)\n,\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n\n )\n)\n", "format": "time_series", "legendFormat": "{{ user }}", "legendLink": null @@ -33165,6 +34576,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "samples/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + 
"properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 5, "legend": { @@ -33193,40 +34664,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "samples/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_distributor_received_samples_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -33351,6 +34788,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "samples/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 7, "legend": { @@ -33379,40 +34876,6 @@ data: "span": 12, "stack": false, "steppedLine": 
false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "samples/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_discarded_samples_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*|mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -33537,6 +35000,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "series" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 9, "legend": { @@ -33565,43 +35088,9 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "series", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - 
"link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "topk($limit,\n sum by (user) (\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n )\n)\n", + "expr": "topk($limit, ( # Classic storage\n sum by (cluster, namespace, user) (cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, user) (\n max by (ingester_id, cluster, namespace, user) (\n label_replace(\n cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n)\n", "format": "table", "instant": true, "legendFormat": "", @@ -33663,6 +35152,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + 
"id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplars/s" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 10, "legend": { @@ -33691,40 +35240,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "exemplars/s", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit, sum by (user) (rate(cortex_distributor_received_exemplars_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}[5m])))", @@ -33789,134 +35304,68 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "sort": { - "col": 3, - "desc": true - }, - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": 
"Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "rules", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", - "format": "table", - "instant": true, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Top $limit biggest groups", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "rules" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - 
{ - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "By rule group size", - "titleSize": "h6" - }, - { - "collapse": true, - "height": "250px", - "panels": [ - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", "fill": 1, - "id": 12, + "id": 11, "legend": { "avg": false, "current": false, @@ -33943,40 +35392,158 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "seconds", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, + "targets": [ { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" + "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" } ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $limit biggest groups", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "By rule group size", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "seconds" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "sort": { + "col": 3, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, "targets": [ { "expr": "topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_last_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ruler|cortex|mimir-backend.*))\"}))", @@ -34041,6 +35608,66 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" 
+ }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Compaction Jobs" + }, + { + "id": "decimals", + "value": 0 + }, + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "displayName", + "value": "user" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "unit", + "value": "string" + } + ] + } + ] + }, "fill": 1, "id": 13, "legend": { @@ -34069,40 +35696,6 @@ data: "span": 12, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Compaction Jobs", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value", - "thresholds": [ ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ ], - "type": "string", - "unit": "short" - } - ], "targets": [ { "expr": "topk($limit,\n sum by (user) (cortex_bucket_index_estimated_compaction_jobs{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|compactor.*|cortex|mimir-backend.*))\"})\n and ignoring(user)\n (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|compactor.*|cortex|mimir-backend.*))\"}[$__rate_interval])) == 0)\n)\n", @@ -34495,6 +36088,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -34553,7 +36147,7 @@ data: "legendLink": null } ], - "title": "TCP 
connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -34723,6 +36317,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -34781,7 +36376,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -34951,6 +36546,7 @@ data: }, { "datasource": "$datasource", + "description": "### Ingress TCP connections (per pod)\nThe number of ingress TCP connections (HTTP and gRPC protocol).\n", "fieldConfig": { "custom": { "fillOpacity": 0 @@ -35009,7 +36605,7 @@ data: "legendLink": null } ], - "title": "TCP connections (per pod)", + "title": "Ingress TCP connections (per pod)", "type": "timeseries" } ], @@ -35642,7 +37238,7 @@ data: "showLegend": true }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "desc" } }, @@ -36201,7 +37797,7 @@ data: "span": 4, "targets": [ { - "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n)\nand\ncount by(persistentvolumeclaim) (\n kube_persistentvolumeclaim_labels{\n cluster=~\"$cluster\", namespace=~\"$namespace\",\n label_name=~\"((mimir|ingester|mimir-write)).*\"\n }\n)\n", + "expr": "max by(persistentvolumeclaim) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|ingester|mimir-write)).*\"} /\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*((mimir|ingester|mimir-write)).*\"}\n)\n", "format": "time_series", "legendFormat": "{{persistentvolumeclaim}}", "legendLink": null @@ -36550,7 +38146,7 @@ data: "dashLength": 10, "dashes": false,
"datasource": "$datasource", - "description": "### In-memory series\nThe number of series not yet flushed to object storage that are held in ingester memory.\n\n", + "description": "### In-memory series\nThe number of series not yet flushed to object storage that are held in ingester memory.\nWith classic storage, the sum of series from all ingesters is divided by the replication factor.\nWith ingest storage, we take the maximum series of each ingest partition.\n\n", "fill": 1, "format": "short", "id": 4, @@ -36578,7 +38174,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n/ on(cluster, namespace) group_left\nmax by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}))\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_memory_series{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "instant": true, "refId": "A" @@ -36626,7 +38222,7 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "### Exemplars in ingesters\nNumber of TSDB exemplars currently in ingesters' storage.\n\n", + "description": "### Exemplars in ingesters\nNumber of TSDB exemplars currently in ingesters' storage.\nWith classic
storage, the sum of exemplars from all ingesters is divided by the replication factor.\nWith ingest storage, we take the maximum exemplars of each ingest partition.\n\n", "fill": 1, "format": "short", "id": 5, @@ -36654,7 +38250,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n/ on(cluster, namespace) group_left\nmax by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"}))\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cortex_ingester_tsdb_exemplar_exemplars_in_storage{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "instant": true, "refId": "A" @@ -38395,7 +39991,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - shipper", + "title": "Ingester – shipper", "titleSize": "h6" }, { @@ -38573,7 +40169,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - TSDB head", + "title": "Ingester – TSDB head", "titleSize": "h6" }, { @@ -38893,7 +40489,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - TSDB write ahead log (WAL)", + "title": "Ingester – TSDB write ahead log
(WAL)", "titleSize": "h6" }, { @@ -39000,7 +40596,7 @@ data: }, { "datasource": "$datasource", - "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is sent to the replication factor number of ingesters, so the sum of rates from all ingesters is divided by the replication factor.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", + "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is replicated to a number of ingesters. With classic storage, the sum of rates from all ingesters is divided by the replication factor.\nWith ingest storage, we take the maximum rate of each ingest partition.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -39038,7 +40634,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m{cluster=~\"$cluster\",
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "ingested exemplars", "legendLink": null @@ -39087,7 +40683,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(\n cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"}\n / on(cluster, namespace) group_left\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\n", + "expr": "( # Classic storage\n sum by (cluster, namespace, ) (cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"})\n / on (cluster, namespace) group_left()\n max by (cluster, namespace) (cortex_distributor_replication_factor{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|distributor.*|cortex|mimir-write.*))\"})\n)\nor\n( # Ingest storage\n sum by (cluster, namespace, ) (\n max by (ingester_id, cluster, namespace, ) (\n label_replace(\n cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\"},\n \"ingester_id\", \"$1\", \"pod\", \".*-([0-9]+)$\"\n )\n )\n )\n)\n", "format": "time_series", "legendFormat": "appended exemplars", "legendLink": null @@ -39352,10 +40948,14 @@ spec: The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors expr: | - 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m])) + # The following 5xx status codes are not considered errors: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - > 1 + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 for: 15m labels: severity: critical @@ -39371,18 +40971,6 @@ spec: for: 15m labels: severity: warning - alert: MimirQueriesIncorrect - annotations: - message: | - The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results.
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect - expr: | - 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) - / - sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 - for: 15m - labels: - severity: warning - alert: MimirInconsistentRuntimeConfig annotations: message: | @@ -39498,11 +41086,24 @@ spec: expr: | (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load + # Only if there are more timeseries than would be expected due to continuous testing load ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) + ( # Classic storage timeseries + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) + or + ( # Ingest storage timeseries + sum by(cluster, namespace) ( + max by(ingester_id, cluster, namespace) ( + label_replace(cortex_ingester_memory_series, + "ingester_id", "$1", + "pod", ".*-([0-9]+)$" + ) + ) + ) + ) ) > 100000 for: 1h labels: @@ -39540,9 +41141,9 @@ spec: severity: warning - alert: MimirStoreGatewayTooManyFailedOperations annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors - while doing {{ $labels.operation }} on the object storage. + message: Mimir store-gateway in {{ $labels.cluster }}/{{ $labels.namespace + }} is experiencing {{ $value | humanizePercentage }} errors while doing + {{ $labels.operation }} on the object storage. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations expr: | sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 @@ -40070,8 +41671,8 @@ spec: severity: warning - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got + a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -40084,8 +41685,8 @@ spec: severity: critical - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got + a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -40136,7 +41737,7 @@ spec: }}. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts @@ -40286,6 +41887,126 @@ spec: for: 1h labels: severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "starting" phase is not reducing consumption lag of write requests + read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (2 * 60) + for: 3m + labels: + severity: critical + threshold: very_high_for_short_period + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} in "running" phase is too far behind in its consumption of write requests + from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > 30 + for: 15m + labels: + severity: critical + threshold: relatively_high_for_long_period + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterStuckProcessingRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} is stuck processing write requests from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterstuckprocessingrecordsfromkafka + expr: | + # Alert if the reader is not processing any records, but there buffered records to process in the Kafka client. + (sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_total[5m])) == 0) + and + # NOTE: the cortex_ingest_storage_reader_buffered_fetch_records_total metric is a gauge showing the current number of buffered records. + (sum by (cluster, namespace, pod) (cortex_ingest_storage_reader_buffered_fetch_records_total) > 0) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml b/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml index 2a14a39a..da26d871 100644 --- a/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml +++ b/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml @@ -16,10 +16,14 @@ groups: The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors expr: | - 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m])) + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - > 1 + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 for: 15m labels: severity: critical @@ -35,18 +39,6 @@ groups: for: 15m labels: severity: warning - - alert: MimirQueriesIncorrect - annotations: - message: | - The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect - expr: | - 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) - / - sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 - for: 15m - labels: - severity: warning - alert: MimirInconsistentRuntimeConfig annotations: message: | @@ -159,11 +151,24 @@ groups: expr: | (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load + # Only if there are more timeseries than would be expected due to continuous testing load ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) + ( # Classic storage timeseries + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) + or + ( # Ingest storage timeseries + sum by(cluster, namespace) ( + max by(ingester_id, cluster, namespace) ( + label_replace(cortex_ingester_memory_series, + "ingester_id", "$1", + "pod", ".*-([0-9]+)$" + ) + ) + ) + ) ) > 100000 for: 1h labels: @@ -199,7 +204,7 @@ groups: severity: warning - alert: MimirStoreGatewayTooManyFailedOperations annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. + message: Mimir store-gateway in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations expr: | sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 @@ -715,7 +720,7 @@ groups: severity: warning - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -728,7 +733,7 @@ groups: severity: critical - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -773,7 +778,7 @@ groups: message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts @@ -913,6 +918,114 @@ groups: for: 1h labels: severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (2 * 60) + for: 3m + labels: + severity: critical + threshold: very_high_for_short_period + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > 30 + for: 15m + labels: + severity: critical + threshold: relatively_high_for_long_period + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterStuckProcessingRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is stuck processing write requests from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterstuckprocessingrecordsfromkafka + expr: | + # Alert if the reader is not processing any records, but there buffered records to process in the Kafka client. + (sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_total[5m])) == 0) + and + # NOTE: the cortex_ingest_storage_reader_buffered_fetch_records_total metric is a gauge showing the current number of buffered records. + (sum by (cluster, namespace, pod) (cortex_ingest_storage_reader_buffered_fetch_records_total) > 0) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/monitoring-mixins/mimir-mixin/deploy/prometheus-alerts.yaml b/monitoring-mixins/mimir-mixin/deploy/prometheus-alerts.yaml index b4ff3658..d0b83fce 100644 --- a/monitoring-mixins/mimir-mixin/deploy/prometheus-alerts.yaml +++ b/monitoring-mixins/mimir-mixin/deploy/prometheus-alerts.yaml @@ -23,10 +23,14 @@ spec: The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors expr: | - 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m])) + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - > 1 + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 for: 15m labels: severity: critical @@ -42,18 +46,6 @@ spec: for: 15m labels: severity: warning - - alert: MimirQueriesIncorrect - annotations: - message: | - The Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirqueriesincorrect - expr: | - 100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) - / - sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 - for: 15m - labels: - severity: warning - alert: MimirInconsistentRuntimeConfig annotations: message: | @@ -166,11 +158,24 @@ spec: expr: | (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load + # Only if there are more timeseries than would be expected due to continuous testing load ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) + ( # Classic storage timeseries + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) + or + ( # Ingest storage timeseries + sum by(cluster, namespace) ( + max by(ingester_id, cluster, namespace) ( + label_replace(cortex_ingester_memory_series, + "ingester_id", "$1", + "pod", ".*-([0-9]+)$" + ) + ) + ) + ) ) > 100000 for: 1h labels: @@ -206,7 +211,7 @@ spec: severity: warning - alert: MimirStoreGatewayTooManyFailedOperations annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. + message: Mimir store-gateway in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations expr: | sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 @@ -722,7 +727,7 @@ spec: severity: warning - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -735,7 +740,7 @@ spec: severity: critical - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + message: Mimir Ingester in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted expr: | # alert when there are more than one corruptions @@ -780,7 +785,7 @@ spec: message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 labels: severity: critical - name: mimir_compactor_alerts @@ -920,6 +925,114 @@ spec: for: 1h labels: severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (2 * 60) + for: 3m + labels: + severity: critical + threshold: very_high_for_short_period + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > 30 + for: 15m + labels: + severity: critical + threshold: relatively_high_for_long_period + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterStuckProcessingRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is stuck processing write requests from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterstuckprocessingrecordsfromkafka + expr: | + # Alert if the reader is not processing any records, but there buffered records to process in the Kafka client. + (sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_total[5m])) == 0) + and + # NOTE: the cortex_ingest_storage_reader_buffered_fetch_records_total metric is a gauge showing the current number of buffered records. + (sum by (cluster, namespace, pod) (cortex_ingest_storage_reader_buffered_fetch_records_total) > 0) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/monitoring-mixins/mimir-mixin/jsonnetfile.json b/monitoring-mixins/mimir-mixin/jsonnetfile.json index e698513b..379aa82c 100644 --- a/monitoring-mixins/mimir-mixin/jsonnetfile.json +++ b/monitoring-mixins/mimir-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "operations/mimir-mixin" } }, - "version": "release-2.12" + "version": "main" } ], "legacyImports": true diff --git a/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json b/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json index 10079845..ed3b1b05 100644 --- a/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "f95501009c9b29bed87fe9d57c1a6e72e210f137", - "sum": 
"+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" + "version": "167b75f241cb61513b399cad7f87052108a26b85", + "sum": "EEPwMLfUIJT9iEUI/gCW9x6PxWoTBPSJOfabTF4rp1M=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "f95501009c9b29bed87fe9d57c1a6e72e210f137", - "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" + "version": "167b75f241cb61513b399cad7f87052108a26b85", + "sum": "Qg992ZB0jkrS+YLq0Q7RV1fSHa8+hQT0jbpTyCGE2NI=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "operations/mimir-mixin" } }, - "version": "c7aab9e039d63397d2293114ad063b03626e247b", - "sum": "dgNk0zx57kRIPKynxUiSFMWemw6sHP7/c0Sg33lVoWE=" + "version": "43140f879bad033091c76607c9bdb49a0665eae4", + "sum": "qUwsWDdDE8ly5DPJdnvrO89m0hsi4MKDmYlD9rXKiUM=" } ], "legacyImports": false diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet index cc43f483..43557580 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet @@ -456,18 +456,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ], + httpStatusColors:: { + '1xx': '#EAB839', + '2xx': '#7EB26D', + '3xx': '#6ED0E0', + '4xx': '#EF843C', + '5xx': '#E24D42', + OK: '#7EB26D', + success: '#7EB26D', + 'error': '#E24D42', + cancel: '#A9A9A9', + }, + qpsPanel(selector, statusLabelName='status_code'):: { - aliasColors: { - '1xx': '#EAB839', - '2xx': '#7EB26D', - '3xx': '#6ED0E0', - '4xx': '#EF843C', - '5xx': '#E24D42', - OK: '#7EB26D', - success: '#7EB26D', - 'error': '#E24D42', - cancel: '#A9A9A9', - }, + aliasColors: $.httpStatusColors, targets: [ { expr: @@ -484,6 +486,65 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], } + $.stack, + // Assumes that the metricName is for a histogram (as 
opposed to qpsPanel above) + // Assumes that there is a dashboard variable named latency_metrics, values are -1 (native) or 1 (classic) + qpsPanelNativeHistogram(metricName, selector, statusLabelName='status_code'):: { + local sumByStatus(nativeClassicQuery) = { + local template = + ||| + sum by (status) ( + label_replace(label_replace(%(metricQuery)s, + "status", "${1}xx", "%(label)s", "([0-9]).."), + "status", "${1}", "%(label)s", "([a-zA-Z]+)")) + |||, + native: template % { metricQuery: nativeClassicQuery.native, label: statusLabelName }, + classic: template % { metricQuery: nativeClassicQuery.classic, label: statusLabelName }, + }, + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. + stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, + targets: [ + { + expr: utils.showClassicHistogramQuery(sumByStatus(utils.ncHistogramCountRate(metricName, selector))), + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A_classic', + }, + { + expr: utils.showNativeHistogramQuery(sumByStatus(utils.ncHistogramCountRate(metricName, selector))), + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A', + }, + ], + } + $.stack, + latencyPanel(metricName, selector, multiplier='1e3'):: { nullPointMode: 'null as zero', targets: [ @@ -509,6 +570,58 @@ local utils = import 'mixin-utils/utils.libsonnet'; yaxes: $.yaxes('ms'), }, + // Assumes that there is a dashboard variable named latency_metrics, values are -1 (native) or 1 (classic) + latencyPanelNativeHistogram(metricName, selector, multiplier='1e3'):: { + nullPointMode: 'null as zero', + fieldConfig+: { + defaults+: { + custom+: { + fillOpacity: 10, + }, + unit: 'ms', + }, + 
}, + targets: [ + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramQuantile('0.99', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A', + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramQuantile('0.99', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A_classic', + }, + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramQuantile('0.50', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B', + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramQuantile('0.50', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B_classic', + }, + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramAverageRate(metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: 'Average', + refId: 'C', + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramAverageRate(metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: 'Average', + refId: 'C_classic', + }, + ], + yaxes: $.yaxes('ms'), + }, + selector:: { eq(label, value):: { label: label, op: '=', value: value }, neq(label, value):: { label: label, op: '!=', value: value }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet index d669aa55..ada9574b 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet @@ -1,86 +1,178 @@ local g = import 'grafana-builder/grafana.libsonnet'; { - // The classicNativeHistogramQuantile function is used to calculate 
histogram quantiles from native histograms or classic histograms. - // Metric name should be provided without _bucket suffix. - nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier=''):: + // The ncHistogramQuantile (native classic histogram quantile) function is + // used to calculate histogram quantiles from native histograms or classic + // histograms. Metric name should be provided without _bucket suffix. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. + ncHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier='', from_recording=false):: local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; { - classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(rate(%(metric)s_bucket{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(%(rateOpen)s%(metric)s_bucket%(suffix)s{%(selector)s}%(rateClose)s))%(multiplierStr)s' % { classicSumBy: classicSumBy, metric: metric, multiplierStr: multiplierStr, percentile: percentile, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, - native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(rate(%(metric)s{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + native: 'histogram_quantile(%(percentile)s, 
sum%(nativeSumBy)s(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s))%(multiplierStr)s' % { metric: metric, multiplierStr: multiplierStr, nativeSumBy: nativeSumBy, percentile: percentile, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, }, - // The classicNativeHistogramSumRate function is used to calculate the histogram sum of rate from native histograms or classic histograms. - // Metric name should be provided without _sum suffix. - nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval'):: + // The ncHistogramSumRate (native classic histogram sum rate) function is + // used to calculate the histogram rate of the sum from native histograms or + // classic histograms. Metric name should be provided without _sum suffix. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. + ncHistogramSumRate(metric, selector, rate_interval='$__rate_interval', from_recording=false):: + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; { - classic: 'rate(%(metric)s_sum{%(selector)s}[%(rateInterval)s])' % { + classic: '%(rateOpen)s%(metric)s_sum%(suffix)s{%(selector)s}%(rateClose)s' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, - native: 'histogram_sum(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + native: 'histogram_sum(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s)' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, }, - // The classicNativeHistogramCountRate function is used to calculate the histogram count of rate from native histograms or 
classic histograms. - // Metric name should be provided without _count suffix. - nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval'):: + // The ncHistogramCountRate (native classic histogram count rate) function is + // used to calculate the histogram rate of count from native histograms or + // classic histograms. Metric name should be provided without _count suffix. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. + ncHistogramCountRate(metric, selector, rate_interval='$__rate_interval', from_recording=false):: + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; { - classic: 'rate(%(metric)s_count{%(selector)s}[%(rateInterval)s])' % { + classic: '%(rateOpen)s%(metric)s_count%(suffix)s{%(selector)s}%(rateClose)s' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, - native: 'histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + native: 'histogram_count(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s)' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, }, // TODO(krajorama) Switch to histogram_avg function for native histograms later. - nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier=''):: + // ncHistogramAverageRate (native classic histogram average rate) function is + // used to calculate the histogram average rate from native histograms or + // classic histograms. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. 
+ ncHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier='', from_recording=false):: local multiplierStr = if multiplier == '' then '' else '%s * ' % multiplier; { classic: ||| %(multiplier)ssum(%(sumMetricQuery)s) / sum(%(countMetricQuery)s) ||| % { - sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).classic, - countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).classic, + sumMetricQuery: $.ncHistogramSumRate(metric, selector, rate_interval, from_recording).classic, + countMetricQuery: $.ncHistogramCountRate(metric, selector, rate_interval, from_recording).classic, multiplier: multiplierStr, }, native: ||| %(multiplier)ssum(%(sumMetricQuery)s) / sum(%(countMetricQuery)s) ||| % { - sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).native, - countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).native, + sumMetricQuery: $.ncHistogramSumRate(metric, selector, rate_interval, from_recording).native, + countMetricQuery: $.ncHistogramCountRate(metric, selector, rate_interval, from_recording).native, multiplier: multiplierStr, }, }, + // ncHistogramSumBy (native classic histogram sum by) function is used to + // generate a query that sums the results of a subquery by the given labels. + // The function can be used with native histograms or classic histograms. 
+ ncHistogramSumBy(query, sum_by=[], multiplier=''):: + local sumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(', ', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + { + classic: 'sum%(sumBy)s(%(query)s)%(multiplierStr)s' % { + multiplierStr: multiplierStr, + query: query.classic, + sumBy: sumBy, + }, + native: 'sum%(sumBy)s(%(query)s)%(multiplierStr)s' % { + multiplierStr: multiplierStr, + query: query.native, + sumBy: sumBy, + }, + }, + + // ncHistogramLeRate (native classic histogram le rate) calculates the rate + // of requests that have a value less than or equal to the given "le" value. + // The "le" value matcher for classic histograms can handle both Prometheus + // or OpenMetrics formats, where whole numbers may or may not have ".0" at + // the end. + ncHistogramLeRate(metric, selector, le, rate_interval='$__rate_interval'):: + local isWholeNumber(str) = str != '' && std.foldl(function(acc, c) acc && (c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || c == '7' || c == '8' || c == '9'), std.stringChars(str), true); + { + native: 'histogram_fraction(0, %(le)s, rate(%(metric)s{%(selector)s}[%(rateInterval)s]))*histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + le: if isWholeNumber(le) then le + '.0' else le, // Treated as float number. + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + classic: 'rate(%(metric)s_bucket{%(selector)s, le=~"%(le)s"}[%(rateInterval)s])' % { + // le is treated as string, thus it needs to account for Prometheus text format not having '.0', but OpenMetrics having it. + // Also the resulting string in yaml is stored directly, so the \\ needs to be escaped to \\\\. 
+ le: if isWholeNumber(le) then '%(le)s|%(le)s\\\\.0' % { le: le } else le, + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // ncHistogramComment (native classic histogram comment) helps attach + // comments to the query and also keep multiline strings where applicable. + ncHistogramComment(query, comment):: { + native: ||| + %s%s + ||| % [comment, query.native], + classic: ||| + %s%s + ||| % [comment, query.classic], + }, + // showClassicHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the classic query // to dashboard variable which should take -1 or +1 as values in order to hide or show the classic query. showClassicHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * +Inf)' % [query.classic, dashboard_variable], @@ -106,7 +198,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; }, { record: '%(labels_underscore)s:%(metric)s:avg' % vars, - expr: 'sum(rate(%(metric)s_sum[1m])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, + expr: 'sum(rate(%(metric)s_sum[%(interval)s])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, }, { record: '%(labels_underscore)s:%(metric)s_bucket:sum_rate' % vars, @@ -205,12 +297,72 @@ local g = import 'grafana-builder/grafana.libsonnet'; noop(label):: { label: label, op: 'nop' }, }, - toPrometheusSelector(selector):: + // latencyRecordingRulePanelNativeHistogram - build a latency panel for a recording rule. + // - metric: the base metric name (middle part of recording rule name) + // - selectors: list of selectors which will be added to first part of + // recording rule name, and to the query selector itself. + // - extra_selectors (optional): list of selectors which will be added to the + // query selector, but not to the beginnig of the recording rule name. + // Useful for external labels. 
+ // - multiplier (optional): assumes results are in seconds, will multiply + // by 1e3 to get ms. Can be turned off. + // - sum_by (optional): additional labels to use in the sum by clause, will also be used in the legend + latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors=[], multiplier='1e3', sum_by=[]):: + local labels = std.join('_', [matcher.label for matcher in selectors]); + local legend = std.join('', ['{{ %(lb)s }} ' % lb for lb in sum_by]); + local metricStr = '%(labels)s:%(metric)s' % { labels: labels, metric: metric }; + local selectorStr = $.toPrometheusSelectorNaked(selectors + extra_selectors); + { + nullPointMode: 'null as zero', + yaxes: g.yaxes('ms'), + targets: [ + { + expr: $.showClassicHistogramQuery($.ncHistogramQuantile('0.99', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A_classic', + }, + { + expr: $.showNativeHistogramQuery($.ncHistogramQuantile('0.99', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A_native', + }, + { + expr: $.showClassicHistogramQuery($.ncHistogramQuantile('0.50', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B_classic', + }, + { + expr: $.showNativeHistogramQuery($.ncHistogramQuantile('0.50', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B_native', + }, + { + expr: $.showClassicHistogramQuery($.ncHistogramAverageRate(metricStr, selectorStr, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)sAverage' % legend, + refId: 'C_classic', + }, + { + 
expr: $.showNativeHistogramQuery($.ncHistogramAverageRate(metricStr, selectorStr, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)sAverage' % legend, + refId: 'C_native', + }, + ], + }, + + toPrometheusSelectorNaked(selector):: local pairs = [ '%(label)s%(op)s"%(value)s"' % matcher for matcher in std.filter(function(matcher) matcher.op != 'nop', selector) ]; - '{%s}' % std.join(', ', pairs), + '%s' % std.join(', ', pairs), + + toPrometheusSelector(selector):: '{%s}' % $.toPrometheusSelectorNaked(selector), // withRunbookURL - Add/Override the runbook_url annotations for all alerts inside a list of rule groups. // - url_format: an URL format for the runbook, the alert name will be substituted in the URL. diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet index 10f936e8..832c7843 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet @@ -6,5 +6,6 @@ (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') + (import 'alerts/autoscaling.libsonnet') + + (if $._config.ingest_storage_enabled then import 'alerts/ingest-storage.libsonnet' else {}) + (import 'alerts/continuous-test.libsonnet'), } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet index 20ae34a1..06c92bb7 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet +++ 
b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet @@ -6,8 +6,8 @@ { alert: $.alertName('AlertmanagerSyncConfigsFailing'), expr: ||| - rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 - |||, + rate(cortex_alertmanager_sync_configs_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), 'for': '30m', labels: { severity: 'critical', @@ -21,8 +21,8 @@ { alert: $.alertName('AlertmanagerRingCheckFailing'), expr: ||| - rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 - |||, + rate(cortex_alertmanager_ring_check_errors_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -36,8 +36,8 @@ { alert: $.alertName('AlertmanagerPartialStateMergeFailing'), expr: ||| - rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 - |||, + rate(cortex_alertmanager_partial_state_merges_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -51,8 +51,8 @@ { alert: $.alertName('AlertmanagerReplicationFailing'), expr: ||| - rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 - |||, + rate(cortex_alertmanager_state_replication_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -66,8 +66,8 @@ { alert: $.alertName('AlertmanagerPersistStateFailing'), expr: ||| - rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 - |||, + rate(cortex_alertmanager_state_persist_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(15), 'for': '1h', labels: { severity: 'critical', @@ -81,8 +81,8 @@ { alert: $.alertName('AlertmanagerInitialSyncFailed'), expr: ||| - increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 - |||, + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[%s]) > 0 + ||| % $.alertRangeInterval(1), labels: { severity: 'critical', }, diff --git 
a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet index b550937d..c586c6dc 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet @@ -54,4 +54,6 @@ for group in groups ], + alertRangeInterval(multiple):: + ($._config.base_alerts_range_interval_minutes * multiple) + 'm', } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet index efe1f119..3c9a6a3b 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet @@ -34,14 +34,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Note if alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. 
expr: ||| - 100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[1m])) + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"%(excluded_routes)s"}[%(range_interval)s])) / - sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[1m])) - > 1 + sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s])) + ) * 100 > 1 ||| % { group_by: $._config.alert_aggregation_labels, job_label: $._config.per_job_label, excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes), + range_interval: $.alertRangeInterval(1), }, 'for': '15m', labels: { @@ -78,23 +83,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % $._config, }, }, - { - alert: $.alertName('QueriesIncorrect'), - expr: ||| - 100 * sum by (%s) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) - / - sum by (%s) (rate(test_exporter_test_case_result_total[5m])) > 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - The %(product)s cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results. 
- ||| % $._config, - }, - }, { alert: $.alertName('InconsistentRuntimeConfig'), expr: ||| @@ -130,8 +118,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('FrontendQueriesStuck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 - ||| % $._config, + sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_frontend_queue_length[%(range_interval)s])) > 0 + ||| % { + group_by: $._config.alert_aggregation_labels, + job_label: $._config.per_job_label, + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', // We don't want to block for longer. labels: { severity: 'critical', @@ -145,8 +137,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('SchedulerQueriesStuck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 - ||| % $._config, + sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_scheduler_queue_length[%(range_interval)s])) > 0 + ||| % { + group_by: $._config.alert_aggregation_labels, + job_label: $._config.per_job_label, + range_interval: $.alertRangeInterval(1), + }, 'for': '7m', // We don't want to block for longer. 
labels: { severity: 'critical', @@ -161,19 +157,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('CacheRequestErrors'), expr: ||| ( - sum by(%s, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) + sum by(%(group_by)s, name, operation) ( + rate(thanos_memcached_operation_failures_total[%(range_interval)s]) or - rate(thanos_cache_operation_failures_total[1m]) + rate(thanos_cache_operation_failures_total[%(range_interval)s]) ) / - sum by(%s, name, operation) ( - rate(thanos_memcached_operations_total[1m]) + sum by(%(group_by)s, name, operation) ( + rate(thanos_memcached_operations_total[%(range_interval)s]) or - rate(thanos_cache_operations_total[1m]) + rate(thanos_cache_operations_total[%(range_interval)s]) ) ) * 100 > 5 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % { + group_by: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'warning', @@ -215,13 +214,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('KVStoreFailure'), expr: ||| ( - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[%(range_interval)s])) / - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[%(range_interval)s])) ) # We want to get alerted only in case there's a constant failure. 
== 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -252,11 +253,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| (min by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_memory_users) == 0) and on (%(alert_aggregation_labels)s) - # Only if there are more time-series than would be expected due to continuous testing load + # Only if there are more timeseries than would be expected due to continuous testing load ( - sum by(%(alert_aggregation_labels)s) (cortex_ingester_memory_series) - / - max by(%(alert_aggregation_labels)s) (cortex_distributor_replication_factor) + ( # Classic storage timeseries + sum by(%(alert_aggregation_labels)s) (cortex_ingester_memory_series) + / + max by(%(alert_aggregation_labels)s) (cortex_distributor_replication_factor) + ) + or + ( # Ingest storage timeseries + sum by(%(alert_aggregation_labels)s) ( + max by(ingester_id, %(alert_aggregation_labels)s) ( + label_replace(cortex_ingester_memory_series, + "ingester_id", "$1", + "%(per_instance_label)s", ".*-([0-9]+)$" + ) + ) + ) + ) ) > 100000 ||| % $._config, labels: { @@ -316,15 +330,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('StoreGatewayTooManyFailedOperations'), 'for': '5m', expr: ||| - sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 + sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[%(range_interval)s])) > 0 ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(1), }, labels: { severity: 'warning', }, annotations: { - message: '%(product)s store-gateway %(alert_instance_variable)s in %(alert_aggregation_variables)s is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation 
}} on the object storage.' % $._config, + message: '%(product)s store-gateway in %(alert_aggregation_variables)s is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage.' % $._config, }, }, ] + [ @@ -502,7 +517,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; %(kube_statefulset_status_replicas_updated)s ) ) and ( - changes(%(kube_statefulset_status_replicas_updated)s[15m:1m]) + changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s]) == 0 ) @@ -513,6 +528,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'), kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'), kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), }, 'for': '30m', labels: { @@ -533,7 +549,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; != %(kube_deployment_status_replicas_updated)s ) and ( - changes(%(kube_deployment_status_replicas_updated)s[15m:1m]) + changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s]) == 0 ) @@ -542,6 +558,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; aggregation_labels: $._config.alert_aggregation_labels, kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'), kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), }, 'for': '30m', labels: { @@ -619,11 +636,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerTooManyFailedPushes'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, 
%(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -638,11 +657,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerTooManyFailedQueries'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -657,11 +678,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerMissedEvaluations'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: 
$.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'warning', @@ -675,9 +698,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('RulerFailedRingCheck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[%(range_interval)s])) > 0 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -692,11 +717,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerRemoteEvaluationFailing'), expr: ||| 100 * ( - sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %s}[5m])) + sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %(job_regex)s}[%(range_interval)s])) / - sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %s}[5m])) + sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %(job_regex)s}[%(range_interval)s])) ) > 1 - ||| % [$._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend), $._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend)], + ||| % { + alert_aggregation_labels: $._config.alert_aggregation_labels, + job_regex: $.jobMatcher($._config.job_names.ruler_query_frontend), + range_interval: $.alertRangeInterval(5), + }, 'for': '5m', labels: { severity: 'warning', diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet index 31c9ea1e..28a4028a 100644 --- 
a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet @@ -40,13 +40,14 @@ expr: ||| ( # Find KEDA scalers reporting errors. - label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + label_replace(rate(keda_scaler_errors[%(range_interval)s]), "namespace", "$1", "exported_namespace", "(.*)") # Match only Mimir namespaces. * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) ) > 0 ||| % { aggregation_labels: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(5), }, labels: { severity: 'critical', diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet index 9cfc25f3..ea65b669 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet @@ -14,17 +14,18 @@ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) and # Only if the ingester has ingested samples over the last 4h. - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate%(recording_rules_range_interval)s[4h])) > 0) and # Only if the ingester was ingesting samples 4h ago. 
This protects against the case where the ingester replica # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving # samples, while the a block shipping is expected within the next 4h. - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate%(recording_rules_range_interval)s[1h] offset 4h)) > 0) ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, per_instance_label: $._config.per_instance_label, alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix, + recording_rules_range_interval: $._config.recording_rules_range_interval, }, labels: { severity: 'critical', @@ -41,11 +42,12 @@ expr: ||| (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) and - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate%(recording_rules_range_interval)s[4h])) > 0) ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, per_instance_label: $._config.per_instance_label, alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix, + recording_rules_range_interval: $._config.recording_rules_range_interval, }, labels: { severity: 'critical', @@ -79,8 +81,8 @@ alert: 
$.alertName('IngesterTSDBHeadCompactionFailed'), 'for': '15m', expr: ||| - rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_compactions_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -91,8 +93,8 @@ { alert: $.alertName('IngesterTSDBHeadTruncationFailed'), expr: ||| - rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_head_truncations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -103,8 +105,8 @@ { alert: $.alertName('IngesterTSDBCheckpointCreationFailed'), expr: ||| - rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -115,8 +117,8 @@ { alert: $.alertName('IngesterTSDBCheckpointDeletionFailed'), expr: ||| - rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -127,8 +129,8 @@ { alert: $.alertName('IngesterTSDBWALTruncationFailed'), expr: ||| - rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_wal_truncations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'warning', }, @@ -140,42 +142,46 @@ alert: $.alertName('IngesterTSDBWALCorrupted'), expr: ||| # alert when there are more than one corruptions - count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 + count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[%(range_interval)s]) > 0) > 1 and # and there is only one zone count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) 
(cortex_ingester_tsdb_wal_corruptions_total)) == 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'critical', deployment: 'single-zone', }, annotations: { - message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, + message: '%(product)s Ingester in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, }, }, { alert: $.alertName('IngesterTSDBWALCorrupted'), expr: ||| # alert when there are more than one corruptions - count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 + count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[%(range_interval)s]) > 0)) > 1 and # and there are multiple zones count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'critical', deployment: 'multi-zone', }, annotations: { - message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, + message: '%(product)s Ingester in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, }, }, { alert: $.alertName('IngesterTSDBWALWritesFailed'), 'for': '3m', expr: ||| - rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 - |||, + rate(cortex_ingester_tsdb_wal_writes_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(1), labels: { severity: 'critical', }, @@ -214,10 +220,13 @@ }, }, { - // Alert if the bucket index has not been updated for a given user. + // Alert if the bucket index has not been updated for a given user. 
The default update interval is 900 seconds + // so we alert if we've missed two updates plus a 300 second buffer to avoid false-positives. It's important + // that this alert fire before queriers start to return errors because the bucket index is too old (3600 seconds + // by default). alert: $.alertName('BucketIndexNotUpdated'), expr: ||| - min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 ||| % $._config, labels: { severity: 'critical', diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet index e9b4c8ce..6206ff6d 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet @@ -114,8 +114,8 @@ alert: $.alertName('CompactorSkippedUnhealthyBlocks'), 'for': '1m', expr: ||| - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 - |||, + increase(cortex_compactor_blocks_marked_for_no_compaction_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'warning', }, @@ -129,8 +129,8 @@ alert: $.alertName('CompactorSkippedUnhealthyBlocks'), 'for': '30m', expr: ||| - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 - |||, + increase(cortex_compactor_blocks_marked_for_no_compaction_total[%s]) > 1 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet 
b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet index 1313ee46..264f0e96 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet @@ -9,8 +9,10 @@ alert: $.alertName('ContinuousTestNotRunningOnWrites'), 'for': '1h', expr: ||| - sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 - ||| % $._config, + sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_writes_failed_total[%(range_interval)s])) > 0 + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'warning', }, @@ -24,8 +26,10 @@ alert: $.alertName('ContinuousTestNotRunningOnReads'), 'for': '1h', expr: ||| - sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 - ||| % $._config, + sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_queries_failed_total[%(range_interval)s])) > 0 + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'warning', }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet index f594bba5..1caf2aea 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet @@ -57,11 +57,13 @@ }, // Some dashboards show panels grouping together multiple components of a given "path". - // This mapping configures which components belong to each group. + // This mapping configures which components belong to each group. 
A component can belong + // to multiple groups. local componentGroups = { write: ['distributor', 'ingester', 'mimir_write'], read: ['query_frontend', 'querier', 'ruler_query_frontend', 'ruler_querier', 'mimir_read'], backend: ['query_scheduler', 'ruler_query_scheduler', 'ruler', 'store_gateway', 'compactor', 'alertmanager', 'overrides_exporter', 'mimir_backend'], + remote_ruler_read: ['ruler_query_frontend', 'ruler_query_scheduler', 'ruler_querier'], }, // These are used by the dashboards and allow for the simultaneous display of @@ -133,6 +135,7 @@ write: componentsGroupMatcher(componentGroups.write), read: componentsGroupMatcher(componentGroups.read), backend: componentsGroupMatcher(componentGroups.backend), + remote_ruler_read: componentsGroupMatcher(componentGroups.remote_ruler_read), }, all_instances: std.join('|', std.map(function(name) componentNameRegexp[name], componentGroups.write + componentGroups.read + componentGroups.backend)), @@ -175,6 +178,7 @@ per_cluster_label: 'cluster', per_namespace_label: 'namespace', per_job_label: 'job', + per_component_loki_label: 'name', // Grouping labels, to uniquely identify and group by {jobs, clusters} job_labels: [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_job_label], @@ -194,6 +198,9 @@ // Used to add extra annotations to all alerts, Careful: takes precedence over default annotations. alert_extra_annotations: {}, + // Whether alerts for experimental ingest storage are enabled. + ingest_storage_enabled: true, + cortex_p99_latency_threshold_seconds: 2.5, // Whether resources dashboards are enabled (based on cAdvisor metrics). 
@@ -278,7 +285,7 @@ sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s)(rate(container_cpu_usage_seconds_total[1m])), + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s)(rate(container_cpu_usage_seconds_total[%(recording_rules_range_interval)s])), "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it @@ -527,15 +534,8 @@ disk_utilization: ||| max by(persistentvolumeclaim) ( - kubelet_volume_stats_used_bytes{%(namespaceMatcher)s} / - kubelet_volume_stats_capacity_bytes{%(namespaceMatcher)s} - ) - and - count by(persistentvolumeclaim) ( - kube_persistentvolumeclaim_labels{ - %(namespaceMatcher)s, - %(containerMatcher)s - } + kubelet_volume_stats_used_bytes{%(namespaceMatcher)s, %(persistentVolumeClaimMatcher)s} / + kubelet_volume_stats_capacity_bytes{%(namespaceMatcher)s, %(persistentVolumeClaimMatcher)s} ) |||, }, @@ -641,6 +641,10 @@ enabled: false, hpa_name: $._config.autoscaling_hpa_prefix + 'cortex-gw.*', }, + ingester: { + enabled: false, + hpa_name: $._config.autoscaling_hpa_prefix + 'ingester-zone-a', + }, }, @@ -649,6 +653,9 @@ 'debug_pprof', ], + // All query methods from IngesterServer interface. Basically everything except Push. + ingester_read_path_routes_regex: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', + // The default datasource used for dashboards. dashboard_datasource: 'default', datasource_regex: '', @@ -658,6 +665,10 @@ // Set to four times the scrape interval to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ recording_rules_range_interval: '1m', + // Used to calculate range interval in alerts with default range selector under 10 minutes. 
+ // Needed to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ + base_alerts_range_interval_minutes: 1, + // Used to inject rows into dashboards at specific places that support it. injectRows: {}, @@ -671,5 +682,11 @@ // Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter is // not used (default), then rejected requests are already counted as failures. show_rejected_requests_on_writes_dashboard: false, + + // Show panels that use queries for gRPC-based ingestion (distributor -> ingester) + show_grpc_ingestion_panels: true, + + // Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters) + show_ingest_storage_panels: false, }, } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet index c4e9d6f9..a9fb0265 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet @@ -23,6 +23,7 @@ (import 'dashboards/overview-networking.libsonnet') + (import 'dashboards/reads-resources.libsonnet') + (import 'dashboards/remote-ruler-reads-resources.libsonnet') + + (import 'dashboards/remote-ruler-reads-networking.libsonnet') + (import 'dashboards/reads-networking.libsonnet') + (import 'dashboards/writes-resources.libsonnet') + (import 'dashboards/writes-networking.libsonnet') + diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 
ca75f19c..e57ea4b7 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -1,4 +1,30 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { + // Helper function to produce failure rate in percentage queries for native and classic histograms. + // Takes a metric name and a selector as strings and returns a dictionary with classic and native queries. + ncHistogramFailureRate(metric, selector):: { + local template = ||| + ( + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. + vector(0) + ) + / + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.ncHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').classic, + countQuery: utils.ncHistogramCountRate(metric, selector).classic, + }, + native: template % { + countFailQuery: utils.ncHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').native, + countQuery: utils.ncHistogramCountRate(metric, selector).native, + }, + }, + // This object contains common queries used in the Mimir dashboards. // These queries are NOT intended to be configurable or overriddeable via jsonnet, // but they're defined in a common place just to share them between different dashboards. @@ -17,6 +43,7 @@ perClusterLabel: $._config.per_cluster_label, recordingRulePrefix: $.recordingRulePrefix($.jobSelector('any')), // The job name does not matter here. 
groupPrefixJobs: $._config.group_prefix_jobs, + instance: $._config.per_instance_label, }, write_http_routes_regex: 'api_(v1|prom)_push|otlp_v1_metrics', @@ -25,79 +52,95 @@ query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { + // deprecated, will be removed writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, + requestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, + readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // Write failures rate as percentage of total requests. - writeFailuresRate: ||| - ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + writeFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector), // Read failures rate as percentage of total requests. - readFailuresRate: ||| - ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. 
- vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + readFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector), }, distributor: { + // deprecated, will be removed writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + + local p = self, + requestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate: ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s",status_code=~"5.*|error"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. 
- vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + writeFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector), }, query_frontend: { + // deprecated, will be removed readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, - instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, - rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, - labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, - labelValuesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_label_name_values"}[$__rate_interval]))' % variables, - seriesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_series"}[$__rate_interval]))' % variables, - remoteReadQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_read"}[$__rate_interval]))' % variables, - metadataQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_metadata"}[$__rate_interval]))' % variables, - exemplarsQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_exemplars"}[$__rate_interval]))' % variables, - activeSeriesQueriesPerSecond: 
'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_active_series"}[$__rate_interval])) > 0' % variables, - labelNamesCardinalityQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_label_names"}[$__rate_interval])) > 0' % variables, - labelValuesCardinalityQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_label_values"}[$__rate_interval])) > 0' % variables, - otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)"}[$__rate_interval]))' % variables, + + local p = self, + requestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // These query routes are used in the overview and other dashboards, everything else is considered "other" queries. + // Has to be a list to keep the same colors as before, see overridesNonErrorColorsPalette.
+ local overviewRoutes = [ + { name: 'instantQuery', displayName: 'instant queries', route: '/api/v1/query', routeLabel: '_api_v1_query' }, + { name: 'rangeQuery', displayName: 'range queries', route: '/api/v1/query_range', routeLabel: '_api_v1_query_range' }, + { name: 'labelNames', displayName: '"label names" queries', route: '/api/v1/labels', routeLabel: '_api_v1_labels' }, + { name: 'labelValues', displayName: '"label values" queries', route: '/api/v1/label_name_values', routeLabel: '_api_v1_label_name_values' }, + { name: 'series', displayName: 'series queries', route: '/api/v1/series', routeLabel: '_api_v1_series' }, + { name: 'remoteRead', displayName: 'remote read queries', route: '/api/v1/read', routeLabel: '_api_v1_read' }, + { name: 'metadata', displayName: 'metadata queries', route: '/api/v1/metadata', routeLabel: '_api_v1_metadata' }, + { name: 'exemplars', displayName: 'exemplar queries', route: '/api/v1/query_exemplars', routeLabel: '_api_v1_query_exemplars' }, + { name: 'activeSeries', displayName: '"active series" queries', route: '/api/v1/cardinality_active_series', routeLabel: '_api_v1_cardinality_active_series' }, + { name: 'labelNamesCardinality', displayName: '"label name cardinality" queries', route: '/api/v1/cardinality_label_names', routeLabel: '_api_v1_cardinality_label_names' }, + { name: 'labelValuesCardinality', displayName: '"label value cardinality" queries', route: '/api/v1/cardinality_label_values', routeLabel: '_api_v1_cardinality_label_values' }, + ], + local overviewRoutesRegex = '(prometheus|api_prom)(%s)' % std.join('|', [r.routeLabel for r in overviewRoutes]), + overviewRoutesOverrides: [ + { + matcher: { + id: 'byRegexp', + // To distinguish between query and query_range, the route label must be followed by the end of the name or by a character other than '_' (a consuming alternation, not a lookahead).
+ options: '/.*%s($|[^_])/' % r.routeLabel, + }, + properties: [ + { + id: 'displayName', + value: r.displayName, + }, + ], + } + for r in overviewRoutes + ], + overviewRoutesPerSecondMetric: 'cortex_request_duration_seconds', + overviewRoutesPerSecondSelector: '%(queryFrontendMatcher)s,route=~"%(overviewRoutesRegex)s"' % (variables { overviewRoutesRegex: overviewRoutesRegex }), + nonOverviewRoutesPerSecondSelector: '%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~"%(overviewRoutesRegex)s"' % (variables { overviewRoutesRegex: overviewRoutesRegex }), + + local queryPerSecond(name) = 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)%(route)s"}[$__rate_interval]))' % + (variables { route: std.filter(function(r) r.name == name, overviewRoutes)[0].routeLabel }), + instantQueriesPerSecond: queryPerSecond('instantQuery'), + rangeQueriesPerSecond: queryPerSecond('rangeQuery'), + labelNamesQueriesPerSecond: queryPerSecond('labelNames'), + labelValuesQueriesPerSecond: queryPerSecond('labelValues'), + seriesQueriesPerSecond: queryPerSecond('series'), + remoteReadQueriesPerSecond: queryPerSecond('remoteRead'), + metadataQueriesPerSecond: queryPerSecond('metadata'), + exemplarsQueriesPerSecond: queryPerSecond('exemplars'), + activeSeriesQueriesPerSecond: queryPerSecond('activeSeries'), + labelNamesCardinalityQueriesPerSecond: queryPerSecond('labelNamesCardinality'), + labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'), // Read failures rate as percentage of total requests. - readFailuresRate: ||| - ( - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. 
- vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + readFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector), }, ruler: { @@ -192,5 +235,32 @@ sum(rate(thanos_objstore_bucket_operations_total{%(namespaceMatcher)s}[$__rate_interval])) ||| % variables, }, + + ingester: { + ingestOrClassicDeduplicatedQuery(perIngesterQuery, groupByLabels=''):: ||| + ( # Classic storage + sum by (%(groupByCluster)s, %(groupByLabels)s) (%(perIngesterQuery)s) + / on (%(groupByCluster)s) group_left() + max by (%(groupByCluster)s) (cortex_distributor_replication_factor{%(distributor)s}) + ) + or + ( # Ingest storage + sum by (%(groupByCluster)s, %(groupByLabels)s) ( + max by (ingester_id, %(groupByCluster)s, %(groupByLabels)s) ( + label_replace( + %(perIngesterQuery)s, + "ingester_id", "$1", "%(instance)s", ".*-([0-9]+)$" + ) + ) + ) + ) + ||| % { + perIngesterQuery: perIngesterQuery, + instance: variables.instance, + groupByLabels: groupByLabels, + groupByCluster: $._config.group_by_cluster, + distributor: variables.distributorMatcher, + }, + }, }, } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index a269ccc9..3f346933 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1,22 +1,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'grafana-builder/grafana.libsonnet') { - local resourceRequestColor = '#FFC000', - local resourceLimitColor = '#E02F44', - local successColor = '#7EB26D', - local 
warningColor = '#EAB839', - local errorColor = '#E24D42', + _colors:: { + resourceRequest: '#FFC000', + resourceLimit: '#E02F44', + success: '#7EB26D', + clientError: '#EF843C', + warning: '#EAB839', + failed: '#E24D42', // "error" is reserved word in Jsonnet. + }, // Colors palette picked from Grafana UI, excluding red-ish colors which we want to keep reserved for errors / failures. - local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE'], + local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE', '#3F6833', '#447EBC', '#967302', '#5794F2'], local resourceRequestStyle = $.overrideFieldByName('request', [ - $.overrideProperty('color', { mode: 'fixed', fixedColor: resourceRequestColor }), + $.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceRequest }), $.overrideProperty('custom.fillOpacity', 0), $.overrideProperty('custom.lineStyle', { fill: 'dash' }), ]), local resourceLimitStyle = $.overrideFieldByName('limit', [ - $.overrideProperty('color', { mode: 'fixed', fixedColor: resourceLimitColor }), + $.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceLimit }), $.overrideProperty('custom.fillOpacity', 0), $.overrideProperty('custom.lineStyle', { fill: 'dash' }), ]), @@ -196,14 +199,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; qpsPanel(selector, statusLabelName='status_code'):: super.qpsPanel(selector, statusLabelName) + $.aliasColors({ - '1xx': warningColor, - '2xx': successColor, + '1xx': $._colors.warning, + '2xx': $._colors.success, '3xx': '#6ED0E0', '4xx': '#EF843C', - '5xx': errorColor, - OK: successColor, - success: successColor, - 'error': errorColor, + '5xx': $._colors.failed, + OK: $._colors.success, + success: $._colors.success, + 'error': $._colors.failed, cancel: '#A9A9A9', }) + { fieldConfig+: { @@ -223,18 +226,15 @@ local utils = import 
'mixin-utils/utils.libsonnet'; // shows all values on tooltip, descending. Also turns on exemplars, unless 4th parameter is false. hiddenLegendQueryPanel(queries, legends, legendLink=null, exemplars=true):: $.queryPanel(queries, legends, legendLink) + + $.showAllTooltip + { - options: { + options+: { legend+: { showLegend: false, // Work round Grafana turning showLegend back on when we have // schemaVersion<37. https://github.com/grafana/grafana/issues/54472 displayMode: 'hidden', }, - tooltip+: { - mode: 'multi', - sort: 'desc', - }, }, fieldConfig+: { defaults+: { @@ -260,15 +260,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Set the failure color only if there's just 1 legend and it doesn't contain any placeholder. $.aliasColors( if (std.type(legends) == 'string' && std.length(std.findSubstr('{', legends[0])) == 0) then { - [legends]: errorColor, + [legends]: $._colors.failed, } else {} ), successFailurePanel(successMetric, failureMetric):: $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.aliasColors({ - successful: successColor, - failed: errorColor, + successful: $._colors.success, + failed: $._colors.failed, }), // successFailureCustomPanel is like successFailurePanel() but allows to customize the legends @@ -277,8 +277,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; successFailureCustomPanel(queries, legends):: $.queryPanel(queries, legends) + $.aliasColors({ - [legends[0]]: successColor, - [legends[1]]: errorColor, + [legends[0]]: $._colors.success, + [legends[1]]: $._colors.failed, }), // Displays started, completed and failed rate. 
@@ -288,8 +288,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack + $.aliasColors({ started: '#34CCEB', - completed: successColor, - failed: errorColor, + completed: $._colors.success, + failed: $._colors.failed, }), resourceUtilizationAndLimitLegend(resourceName):: @@ -332,6 +332,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerCPUUsagePanel(instanceName, containerName):: $.timeseriesPanel('CPU') + $.queryPanel($.resourceUtilizationAndLimitQueries('cpu', instanceName, containerName), $.resourceUtilizationAndLimitLegend('{{%s}}' % $._config.per_instance_label)) + + $.showAllTooltip + { fieldConfig+: { overrides+: [ @@ -345,12 +346,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, }, - options+: { - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, }, // The provided componentName should be the name of a component among the ones defined in $._config.instance_names. @@ -362,6 +357,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerMemoryWorkingSetPanel(instanceName, containerName):: $.timeseriesPanel('Memory (workingset)') + $.queryPanel($.resourceUtilizationAndLimitQueries('memory_working', instanceName, containerName), $.resourceUtilizationAndLimitLegend('{{%s}}' % $._config.per_instance_label)) + + $.showAllTooltip + { fieldConfig+: { overrides+: [ @@ -375,12 +371,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, }, - options+: { - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, }, // The provided componentName should be the name of a component among the ones defined in $._config.instance_names. 
@@ -392,6 +382,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerMemoryRSSPanel(instanceName, containerName):: $.timeseriesPanel('Memory (RSS)') + $.queryPanel($.resourceUtilizationAndLimitQueries('memory_rss', instanceName, containerName), $.resourceUtilizationAndLimitLegend('{{%s}}' % $._config.per_instance_label)) + + $.showAllTooltip + { fieldConfig+: { overrides+: [ @@ -405,12 +396,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, }, - options+: { - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, }, // The provided componentName should be the name of a component among the ones defined in $._config.instance_names. @@ -422,6 +407,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerGoHeapInUsePanel(instanceName, containerName):: $.timeseriesPanel('Memory (go heap inuse)') + $.queryPanel($.resourceUtilizationQuery('memory_go_heap', instanceName, containerName), '{{%s}}' % $._config.per_instance_label) + + $.showAllTooltip + { fieldConfig+: { defaults+: { @@ -431,12 +417,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, }, - options+: { - tooltip: { - mode: 'multi', - sort: 'desc', - }, - }, }, // The provided componentName should be the name of a component among the ones defined in $._config.instance_names. 
@@ -513,7 +493,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( $._config.resources_panel_queries[$._config.deployment_type].disk_utilization % { namespaceMatcher: $.namespaceMatcher(), - containerMatcher: $.containerLabelNameMatcher(containerName), + persistentVolumeClaimMatcher: $.containerPersistentVolumeClaimMatcher(containerName), instanceLabel: $._config.per_instance_label, instanceName: instanceName, instanceDataDir: $._config.instance_data_mountpoint, @@ -533,9 +513,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerDiskSpaceUtilizationPanel($._config.instance_names[componentName], $._config.container_names[componentName]), // The provided containerName should be a regexp from $._config.container_names. - containerLabelNameMatcher(containerName):: - // Check only the prefix so that a multi-zone deployment matches too. - 'label_name=~"(%s).*"' % containerName, + containerPersistentVolumeClaimMatcher(containerName):: + 'persistentvolumeclaim=~".*(%s).*"' % containerName, // The provided componentName should be the name of a component among the ones defined in $._config.instance_names. containerNetworkingRowByComponent(title, componentName):: @@ -566,7 +545,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ) .addPanel( - $.timeseriesPanel('TCP connections (per pod)') + + local title = 'Ingress TCP connections (per pod)'; + + $.timeseriesPanel(title) + + $.panelDescription( + title, + 'The number of ingress TCP connections (HTTP and gRPC protocol).' 
+ ) + $.queryPanel([ 'avg(sum by(%(per_instance_label)s) (cortex_tcp_connections{%(namespaceMatcher)s,%(instanceLabel)s=~"%(instanceName)s"}))' % vars, 'max(sum by(%(per_instance_label)s) (cortex_tcp_connections{%(namespaceMatcher)s,%(instanceLabel)s=~"%(instanceName)s"}))' % vars, @@ -593,179 +578,203 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_kv_request_duration_seconds', '{%s, kv_name=~"%s"}' % [$.jobMatcher($._config.job_names[jobName]), kvName]) ), - cpuAndMemoryBasedAutoScalingRow(componentTitle):: - local component = std.asciiLower(componentTitle); - local field = std.strReplace(component, '-', '_'); - super.row('%s - autoscaling' % [componentTitle]) - .addPanel( - local title = 'Replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_spec_max_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active - * on (%(cluster_labels)s, horizontalpodautoscaler) - kube_horizontalpodautoscaler_status_condition{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s", condition="ScalingActive", status="true"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 
0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_spec_min_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ], - [ - 'Max {{ scaletargetref_name }}', - 'Current {{ scaletargetref_name }}', - 'Min {{ scaletargetref_name }}', - ], - ) + - $.panelDescription( - title, + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. + autoScalingActualReplicas(componentName):: + local title = 'Replicas'; + local componentTitle = std.strReplace(componentName, '_', '-'); + + $.timeseriesPanel(title) + + $.queryPanel( + [ ||| - The maximum and current number of %s replicas. - Note: The current number of replicas can still show 1 replica even when scaled to 0. - Because HPA never reports 0 replicas, the query will report 0 only if the HPA is not active. 
- ||| % [component] - ) + - { - fieldConfig+: { - overrides: [ - $.overrideField('byRegexp', '/Max .+/', [ - $.overrideProperty('custom.fillOpacity', 0), - $.overrideProperty('custom.lineStyle', { fill: 'dash' }), - ]), - $.overrideField('byRegexp', '/Current .+/', [ - $.overrideProperty('custom.fillOpacity', 0), - ]), - $.overrideField('byRegexp', '/Min .+/', [ - $.overrideProperty('custom.fillOpacity', 0), - $.overrideProperty('custom.lineStyle', { fill: 'dash' }), - ]), - ], + max by (scaletargetref_name) ( + kube_horizontalpodautoscaler_spec_max_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + # Add the scaletargetref_name label for readability + + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) + 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + ) + ||| % { + namespace_matcher: $.namespaceMatcher(), + hpa_name: $._config.autoscaling[componentName].hpa_name, + cluster_labels: std.join(', ', $._config.cluster_labels), + }, + ||| + max by (scaletargetref_name) ( + kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + # Add the scaletargetref_name label for readability + + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) + 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + ) + ||| % { + namespace_matcher: $.namespaceMatcher(), + hpa_name: $._config.autoscaling[componentName].hpa_name, + cluster_labels: std.join(', ', $._config.cluster_labels), + }, + ||| + max by (scaletargetref_name) ( + kube_horizontalpodautoscaler_spec_min_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} + # Add the scaletargetref_name label for readability + + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) + 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, 
horizontalpodautoscaler=~"%(hpa_name)s"} + ) + ||| % { + namespace_matcher: $.namespaceMatcher(), + hpa_name: $._config.autoscaling[componentName].hpa_name, + cluster_labels: std.join(', ', $._config.cluster_labels), }, + ], + [ + 'Max {{ scaletargetref_name }}', + 'Current {{ scaletargetref_name }}', + 'Min {{ scaletargetref_name }}', + ], + ) + + $.panelDescription( + title, + ||| + The minimum, maximum, and current number of %s replicas. + ||| % [componentTitle] + ) + + { + fieldConfig+: { + overrides: [ + $.overrideField('byRegexp', '/Max .+/', [ + $.overrideProperty('custom.fillOpacity', 0), + $.overrideProperty('custom.lineStyle', { fill: 'dash' }), + ]), + $.overrideField('byRegexp', '/Current .+/', [ + $.overrideProperty('custom.fillOpacity', 0), + ]), + $.overrideField('byRegexp', '/Min .+/', [ + $.overrideProperty('custom.fillOpacity', 0), + $.overrideProperty('custom.lineStyle', { fill: 'dash' }), + ]), + ], }, - ) - .addPanel( - local title = 'Scaling metric (CPU): Desired replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - sum by (scaler) ( + }, + + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. 
+ autoScalingDesiredReplicasByAverageValueScalingMetricPanel(componentName, scalingMetricName, scalingMetricID):: + local title = if scalingMetricName != '' then 'Scaling metric (%s): Desired replicas' % scalingMetricName else 'Desired replicas'; + local scalerSelector = if scalingMetricID != '' then ('.*%s.*' % scalingMetricID) else '.+'; + + $.timeseriesPanel(title) + + $.queryPanel( + [ + ||| + sum by (scaler) ( + label_replace( + keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"}, + "namespace", "$1", "exported_namespace", "(.*)" + ) + / + on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( label_replace( - keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*cpu.*"}, - "namespace", "$1", "exported_namespace", "(.*)" - ) - / - on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( - label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, - "metric", "$1", "metric_name", "(.+)" - ), - "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" - ) + kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + "metric", "$1", "metric_name", "(.+)" + ), + "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" ) - ||| % { - aggregation_labels: $._config.alert_aggregation_labels, - cluster_label: $._config.per_cluster_label, - hpa_prefix: $._config.autoscaling_hpa_prefix, - hpa_name: $._config.autoscaling[field].hpa_name, - namespace: $.namespaceMatcher(), - }, - ], [ - '{{ scaler }}', - ] - ) + - $.panelDescription( - title, - ||| - This panel shows the scaling metric exposed by KEDA divided by the target/threshold used. - It should represent the desired number of replicas, ignoring the min/max constraints applied later. 
+ ) + ||| % { + aggregation_labels: $._config.alert_aggregation_labels, + cluster_label: $._config.per_cluster_label, + hpa_prefix: $._config.autoscaling_hpa_prefix, + hpa_name: $._config.autoscaling[componentName].hpa_name, + namespace: $.namespaceMatcher(), + scaler_selector: scalerSelector, + }, + ], [ + '{{ scaler }}', + ] + ) + + $.panelDescription( + title, + ||| + This panel shows the scaling metric exposed by KEDA divided by the target/threshold used. + It should represent the desired number of replicas, ignoring the min/max constraints applied later. + ||| + ), + + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. + autoScalingDesiredReplicasByValueScalingMetricPanel(componentName, scalingMetricName, scalingMetricID):: + local title = if scalingMetricName != '' then 'Scaling metric (%s): Desired replicas' % scalingMetricName else 'Desired replicas'; + local scalerSelector = if scalingMetricID != '' then ('.*%s.*' % scalingMetricID) else '.+'; + + $.timeseriesPanel(title) + + $.queryPanel( + [ ||| - ), - ) - .addPanel( - local title = 'Scaling metric (memory): Desired replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - sum by (scaler) ( + sum by (scaler) ( + label_replace( + keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"}, + "namespace", "$1", "exported_namespace", "(.*)" + ) + / + on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( label_replace( - keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*memory.*"}, - "namespace", "$1", "exported_namespace", "(.*)" - ) - / - on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( - label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, - "metric", "$1", "metric_name", "(.+)" - ), - "scaledObject", "$1", 
"horizontalpodautoscaler", "%(hpa_prefix)s(.*)" - ) + kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + "metric", "$1", "metric_name", "(.+)" + ), + "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" ) - ||| % { - aggregation_labels: $._config.alert_aggregation_labels, - cluster_label: $._config.per_cluster_label, - hpa_prefix: $._config.autoscaling_hpa_prefix, - hpa_name: $._config.autoscaling[field].hpa_name, - namespace: $.namespaceMatcher(), - }, - ], [ - '{{ scaler }}', - ] - ) + - $.panelDescription( - title, - ||| - This panel shows the scaling metric exposed by KEDA divided by the target/threshold used. - It should represent the desired number of replicas, ignoring the min/max constraints applied later. - ||| - ), + * + on(%(aggregation_labels)s, scaledObject) group_left label_replace( + kube_horizontalpodautoscaler_status_current_replicas{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" + ) + ) + ||| % { + aggregation_labels: $._config.alert_aggregation_labels, + cluster_label: $._config.per_cluster_label, + hpa_prefix: $._config.autoscaling_hpa_prefix, + hpa_name: $._config.autoscaling[componentName].hpa_name, + namespace: $.namespaceMatcher(), + scaler_selector: scalerSelector, + }, + ], [ + '{{ scaler }}', + ] + ) + + $.panelDescription( + title, + ||| + This panel shows the scaling metric exposed by KEDA divided by the target/threshold and multiplied by the current number of replicas. + It should represent the desired number of replicas, ignoring the min/max constraints applied later. + ||| + ), + + // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling. 
+ autoScalingFailuresPanel(componentName):: + local title = 'Autoscaler failures rate'; + + $.timeseriesPanel(title) + + $.queryPanel( + $.filterKedaScalerErrorsByHPA($._config.autoscaling[componentName].hpa_name), + '{{scaler}} failures' + ) + + $.panelDescription( + title, + ||| + The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom + metrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly. + ||| + ), + + cpuAndMemoryBasedAutoScalingRow(componentTitle):: + local componentName = std.strReplace(std.asciiLower(componentTitle), '-', '_'); + super.row('%s – autoscaling' % [componentTitle]) + .addPanel( + $.autoScalingActualReplicas(componentName) ) .addPanel( - local title = 'Autoscaler failures rate'; - $.timeseriesPanel(title) + - $.queryPanel( - $.filterKedaScalerErrorsByHPA($._config.autoscaling[field].hpa_name), - '{{scaler}} failures' - ) + - $.panelDescription( - title, - ||| - The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom - metrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly. - ||| - ), + $.autoScalingDesiredReplicasByAverageValueScalingMetricPanel(componentName, 'CPU', 'cpu') + ) + .addPanel( + $.autoScalingDesiredReplicasByAverageValueScalingMetricPanel(componentName, 'memory', 'memory') + ) + .addPanel( + $.autoScalingFailuresPanel(componentName) ), newStatPanel(queries, legends='', unit='percentunit', decimals=1, thresholds=[], instant=false, novalue=''):: @@ -902,6 +911,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, + tablePanel(queries, labelStyles):: + super.tablePanel(queries, labelStyles={}) + { + // Hides styles field, as it makes Grafana 11 use the deprecate "Table (old)" plugin. 
+ styles:: super.styles, + local stylesToProps(s) = + if std.type(s) == 'string' then [ + $.overrideProperty('displayName', s), + $.overrideProperty('decimals', 0), + $.overrideProperty('unit', 'short'), + ] else [ + if std.objectHas(s, 'alias') then $.overrideProperty('displayName', s.alias), + if std.objectHas(s, 'type') && s.type == 'hidden' then $.overrideProperty('custom.hidden', true), + $.overrideProperty('decimals', if std.objectHas(s, 'decimals') then s.decimals else 2), + $.overrideProperty('unit', if std.objectHas(s, 'unit') then s.unit else 'short'), + ], + fieldConfig+: { + overrides+: [ + // Hide time column by default, like jsonnet-lib/grafana-builder does. + $.overrideFieldByName('Time', [ + $.overrideProperty('displayName', 'Time'), + $.overrideProperty('custom.hidden', true), + ]), + ] + [ + $.overrideFieldByName(label, std.prune(stylesToProps(labelStyles[label]))) + for label in std.objectFields(labelStyles) + ], + }, + }, + // Enables stacking of timeseries on top of each. // It overrites the "stack" mixin from jsonnet-lib/grafana-builder, to make it compatible with timeseriesPanel. stack:: { @@ -918,6 +956,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, + // Shows all series' values in the tooltip and sorts them in descending order. + showAllTooltip:: { + options+: { + tooltip+: { + mode: 'multi', + sort: 'desc', + }, + }, + }, + // Switches a panel from lines (default) to bars. 
bars:: { fieldConfig+: { @@ -993,9 +1041,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; thresholds: { mode: 'absolute', steps: [ - { color: successColor, value: null }, - { color: warningColor, value: 0.01 }, // 1% - { color: errorColor, value: 0.05 }, // 5% + { color: $._colors.success, value: null }, + { color: $._colors.warning, value: 0.01 }, // 1% + { color: $._colors.failed, value: 0.05 }, // 5% ], }, }, @@ -1256,6 +1304,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, + latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors=[], multiplier='1e3', sum_by=[]):: + utils.latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors, multiplier, sum_by) + { + // Hide yaxes from JSON Model; it's not supported by timeseriesPanel. + yaxes:: super.yaxes, + fieldConfig+: { + defaults+: { + unit: 'ms', + min: 0, + }, + }, + }, + filterNodeDiskContainer(containerName):: ||| ignoring(%(instanceLabel)s) group_right() ( @@ -1343,6 +1403,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), legends)), }, + overridesNonErrorColorsPalette(overrides):: std.mapWithIndex(function(idx, override) ( + // Do not define an override if we exausted the colors in the palette. + // Grafana will automatically choose another color. 
+ if idx >= std.length(nonErrorColorsPalette) then override else + { + matcher: override.matcher, + properties: override.properties + [ + { + id: 'color', + value: { + fixedColor: nonErrorColorsPalette[idx], + mode: 'fixed', + }, + }, + ], + } + ), overrides), + // Panel query override functions overrideField(matcherId, options, overrideProperties):: { matcher: { @@ -1424,4 +1502,232 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], }, }, + + capitalize(str):: std.asciiUpper(str[0]) + str[1:], + + commonReadsDashboardsRows( + queryFrontendJobName, + querySchedulerJobName, + querierJobName, + queryRoutesRegex, + rowTitlePrefix='', + showQueryCacheRow=false, + ):: + [ + $.row($.capitalize(rowTitlePrefix + 'query-frontend')) + .addPanel( + $.timeseriesPanel('Requests / sec') + + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher(queryFrontendJobName), queryRoutesRegex]) + ) + .addPanel( + $.timeseriesPanel('Latency') + + $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector(queryFrontendJobName) + [utils.selector.re('route', queryRoutesRegex)]) + ) + .addPanel( + $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher(queryFrontendJobName), queryRoutesRegex], '' + ) + ), + local description = ||| +

+ The query scheduler is an optional service that moves + the internal queue from the query-frontend into a + separate component. + If this service is not deployed, + these panels will show "No data." +

+ |||; + $.row($.capitalize(rowTitlePrefix + 'query-scheduler')) + .addPanel( + local title = 'Requests / sec'; + $.timeseriesPanel(title) + + $.panelDescription(title, description) + + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher(querySchedulerJobName)) + ) + .addPanel( + local title = 'Latency (Time in Queue)'; + $.timeseriesPanel(title) + + $.panelDescription(title, description) + + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher(querySchedulerJobName)) + ) + .addPanel( + local title = 'Queue length'; + $.timeseriesPanel(title) + + $.panelDescription(title, description) + + $.hiddenLegendQueryPanel( + 'sum(min_over_time(cortex_query_scheduler_queue_length{%s}[$__interval]))' % [$.jobMatcher(querySchedulerJobName)], + 'Queue length' + ) + + { + fieldConfig+: { + defaults+: { + unit: 'queries', + }, + }, + }, + ), + local description = ||| +

+ The query scheduler can optionally create subqueues + in order to enforce round-robin query queuing fairness + across additional queue dimensions beyond the default. + + By default, query queuing fairness is only applied by tenant ID. + Queries without additional queue dimensions are labeled 'none'. +

+ |||; + local metricName = 'cortex_query_scheduler_queue_duration_seconds'; + local selector = '{%s}' % $.jobMatcher(querySchedulerJobName); + local labels = ['additional_queue_dimensions']; + local labelReplaceArgSets = [ + { + dstLabel: 'additional_queue_dimensions', + replacement: 'none', + srcLabel: + 'additional_queue_dimensions', + regex: '^$', + }, + ]; + $.row($.capitalize(rowTitlePrefix + 'query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions')) + .addPanel( + local title = '99th Percentile Latency by Queue Dimension'; + $.timeseriesPanel(title) + + $.panelDescription(title, description) + + $.latencyPanelLabelBreakout( + metricName=metricName, + selector=selector, + percentiles=['0.99'], + includeAverage=false, + labels=labels, + labelReplaceArgSets=labelReplaceArgSets, + ) + ) + .addPanel( + local title = '50th Percentile Latency by Queue Dimension'; + $.timeseriesPanel(title) + + $.panelDescription(title, description) + + $.latencyPanelLabelBreakout( + metricName=metricName, + selector=selector, + percentiles=['0.50'], + includeAverage=false, + labels=labels, + labelReplaceArgSets=labelReplaceArgSets, + ) + ) + .addPanel( + local title = 'Average Latency by Queue Dimension'; + $.timeseriesPanel(title) + + $.panelDescription(title, description) + + $.latencyPanelLabelBreakout( + metricName=metricName, + selector=selector, + percentiles=[], + includeAverage=true, + labels=labels, + labelReplaceArgSets=labelReplaceArgSets, + ) + ), + ] + + ( + if (!showQueryCacheRow) then [] else [ + $.row('Cache – query results') + .addPanel( + $.timeseriesPanel('Requests / sec') + + $.queryPanel( + ||| + sum ( + rate(thanos_memcached_operations_total{name="frontend-cache", %(frontend)s}[$__rate_interval]) + or ignoring(backend) + rate(thanos_cache_operations_total{name="frontend-cache", %(frontend)s}[$__rate_interval]) + ) + ||| % { + frontend: $.jobMatcher(queryFrontendJobName), + }, + 'Requests/s' + ) + + { fieldConfig+: { defaults+: { unit: 
'ops' } } }, + ) + .addPanel( + $.timeseriesPanel('Latency') + + $.backwardsCompatibleLatencyPanel( + 'thanos_memcached_operation_duration_seconds', + 'thanos_cache_operation_duration_seconds', + '{%s, name="frontend-cache"}' % $.jobMatcher(queryFrontendJobName) + ) + ), + ] + ) + [ + $.row($.capitalize(rowTitlePrefix + 'querier')) + .addPanel( + $.timeseriesPanel('Requests / sec') + + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher(querierJobName), $.queries.read_http_routes_regex]) + ) + .addPanel( + $.timeseriesPanel('Latency') + + $.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector(querierJobName) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) + ) + .addPanel( + $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher(querierJobName), $.queries.read_http_routes_regex], '' + ) + ), + ], + + ingestStorageIngesterEndToEndLatencyWhenStartingPanel():: + $.timeseriesPanel('Kafka record end-to-end latency when starting') + + $.panelDescription( + 'Kafka record end-to-end latency when starting', + ||| + Time between writing request by distributor to Kafka and reading the record by ingester during catch-up phase, when ingesters are starting. + If ingesters are not starting and catching up in the selected time range, this panel will be empty. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + + ingestStorageIngesterEndToEndLatencyWhenRunningPanel():: + $.timeseriesPanel('Kafka record end-to-end latency when ingesters are running') + + $.panelDescription( + 'Kafka record end-to-end latency when ingesters are running', + ||| + Time between writing request by distributor to Kafka and reading the record by ingester, when ingesters are running. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet index 5b754d73..26920ee4 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet @@ -33,6 +33,7 @@ local filename = 'mimir-overview.json'; assert std.md5(filename) == 'ffcd83628d7d4b5a03d1cafd159e6c9c' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Overview') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() + .addShowNativeLatencyVariable() .addRow( $.row('%(product)s cluster health' % $._config) @@ -53,9 +54,21 @@ local filename = 'mimir-overview.json'; 'Status', [ // Write failures. 
- if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate, + utils.showNativeHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate + ), + // Write failures but from classic histograms. + utils.showClassicHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate + ), // Read failures. - if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + utils.showNativeHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + ), + // Read failures but from classic histograms. + utils.showClassicHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + ), // Rule evaluation failures. $.queries.ruler.evaluations.failuresRate, // Alerting notifications. @@ -84,7 +97,7 @@ local filename = 'mimir-overview.json'; // Object storage failures. 
$.queries.storage.failuresRate, ], - ['Writes', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] + ['Writes', 'Writes', 'Reads', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] ) ) .addPanel( @@ -115,19 +128,23 @@ local filename = 'mimir-overview.json'; ) .addPanel( $.timeseriesPanel(std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + - $.qpsPanel( + $.qpsPanelNativeHistogram( if $._config.gateway_enabled then - $.queries.gateway.writeRequestsPerSecond + $.queries.gateway.requestsPerSecondMetric else - $.queries.distributor.writeRequestsPerSecond + $.queries.distributor.requestsPerSecondMetric, + if $._config.gateway_enabled then + $.queries.gateway.writeRequestsPerSecondSelector + else + $.queries.distributor.writeRequestsPerSecondSelector ) ) .addPanel( $.timeseriesPanel(std.stripChars('Write latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + ( if $._config.gateway_enabled then - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)]) + $.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)]) else - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)]) + $.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)]) ) ) .addPanel( @@ -159,58 +176,60 @@ local filename = 'mimir-overview.json'; ) .addPanel( $.timeseriesPanel(std.stripChars('Read requests / sec 
%(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + - $.qpsPanel( + $.qpsPanelNativeHistogram( + if $._config.gateway_enabled then + $.queries.gateway.requestsPerSecondMetric + else + $.queries.query_frontend.requestsPerSecondMetric, if $._config.gateway_enabled then - $.queries.gateway.readRequestsPerSecond + $.queries.gateway.readRequestsPerSecondSelector else - $.queries.query_frontend.readRequestsPerSecond + $.queries.query_frontend.readRequestsPerSecondSelector ) ) .addPanel( $.timeseriesPanel(std.stripChars('Read latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + ( if $._config.gateway_enabled then - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) + $.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) else - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) + $.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) ) ) .addPanel( - local legends = [ - 'instant queries', - 'range queries', - '"label names" queries', - '"label values" queries', - 'series queries', - 'remote read queries', - 'metadata queries', - 'exemplar queries', - '"active series" queries', - '"label name cardinality" queries', - '"label value cardinality" queries', - 'other', - ]; - $.timeseriesPanel('Queries / sec') + - $.queryPanel( - [ - $.queries.query_frontend.instantQueriesPerSecond, - $.queries.query_frontend.rangeQueriesPerSecond, - $.queries.query_frontend.labelNamesQueriesPerSecond, - $.queries.query_frontend.labelValuesQueriesPerSecond, - 
$.queries.query_frontend.seriesQueriesPerSecond, - $.queries.query_frontend.remoteReadQueriesPerSecond, - $.queries.query_frontend.metadataQueriesPerSecond, - $.queries.query_frontend.exemplarsQueriesPerSecond, - $.queries.query_frontend.activeSeriesQueriesPerSecond, - $.queries.query_frontend.labelNamesCardinalityQueriesPerSecond, - $.queries.query_frontend.labelValuesCardinalityQueriesPerSecond, - $.queries.query_frontend.otherQueriesPerSecond, + { + targets: [ + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.overviewRoutesPerSecondSelector), ['route'])), + format: 'time_series', + legendLink: null, + }, + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.overviewRoutesPerSecondSelector), ['route'])), + format: 'time_series', + legendLink: null, + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.nonOverviewRoutesPerSecondSelector))), + format: 'time_series', + legendFormat: 'other', + legendLink: null, + }, + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.nonOverviewRoutesPerSecondSelector))), + format: 'time_series', + legendFormat: 'other', + legendLink: null, + }, ], - legends, - ) + - $.panelSeriesNonErrorColorsPalette(legends) + - $.stack + - { fieldConfig+: { defaults+: { unit: 'reqps' } } }, + } + + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + overrides+: $.overridesNonErrorColorsPalette($.queries.query_frontend.overviewRoutesOverrides), + }, + } + + $.stack ) ) diff --git 
a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet index 129a1d92..64c32560 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet @@ -56,7 +56,7 @@ local filename = 'mimir-queries.json'; ) ) .addRow( - $.row('Query-frontend - query splitting and results cache') + $.row('Query-frontend – query splitting and results cache') .addPanel( $.timeseriesPanel('Intervals per query') + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval_and_results_cache"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + @@ -120,7 +120,7 @@ local filename = 'mimir-queries.json'; ) ) .addRow( - $.row('Query-frontend - query sharding') + $.row('Query-frontend – query sharding') .addPanel( $.timeseriesPanel('Sharded queries ratio') + $.queryPanel(||| @@ -149,6 +149,33 @@ local filename = 'mimir-queries.json'; ), ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + $.row('Query-frontend – strong consistency (ingest storage)') + .addPanel( + $.timeseriesPanel('Queries with strong read consistency ratio') + + $.panelDescription( + 'Queries with strong read consistency ratio', + ||| + Ratio between queries with strong read consistency and all other queries on query-frontends. + ||| + ) + + $.queryPanel( + [ + ||| + # Display the ratio by container so that it gives a quick visual clue whether requests are coming + # from user queries (query-frontend) or rule evaluations (ruler-query-frontend). 
+ sum by(container) (rate(cortex_query_frontend_queries_consistency_total{%s,consistency="strong"}[$__rate_interval])) + / + sum by(container) (rate(cortex_query_frontend_queries_total{%s}[$__rate_interval])) + ||| % [$.namespaceMatcher(), $.namespaceMatcher()], + ], + ['{{container}}'], + ) + + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + + $.stack + ) + ) .addRow( $.row('Ingester') .addPanel( @@ -167,6 +194,160 @@ local filename = 'mimir-queries.json'; { fieldConfig+: { defaults+: { unit: 'short' } } }, ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester – strong consistency (ingest storage)')) + .addPanel( + $.timeseriesPanel('Requests with strong read consistency / sec') + + $.panelDescription( + 'Requests with strong read consistency / sec', + ||| + Shows rate of requests with strong read consistency, and rate of failed requests with strong read consistency. + ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Requests with strong read consistency ratio') + + $.panelDescription( + 'Requests with strong read consistency ratio', + ||| + Ratio between requests with strong read consistency and all read requests on ingesters. 
+ ||| + ) + + $.queryPanel( + [ + ||| + ( + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ) + / + sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + / + sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], + ], + ['successful', 'failed'], + ) + + $.aliasColors({ failed: $._colors.failed, successful: $._colors.success }) + + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + + $.stack + ) + .addPanel( + $.timeseriesPanel('Strong read consistency queries — wait latency') + + $.panelDescription( + 'Strong read consistency queries — wait latency', + ||| + How long does the request wait to guarantee strong read consistency. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + $.row('') + .addPanel( + $.timeseriesPanel('Fetch last produced offset requests / sec') + + $.panelDescription( + 'Rate of requests to fetch last produced offset for partition', + ||| + Shows rate of requests to fetch last produced offset for partition, and rate of failed requests. 
+ ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Fetch last produced offset latency') + + $.panelDescription( + 'Latency', + ||| + How long does it take to fetch "last produced offset" of partition. + ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + .addPanel( + $.ingestStorageIngesterEndToEndLatencyWhenRunningPanel(), + ) + ) .addRow( $.row('Querier') .addPanel( @@ -269,6 +450,7 @@ local filename = 'mimir-queries.json'; '{{stage}}' ) + 
$.stack + + $.showAllTooltip + { fieldConfig+: { defaults+: { unit: 's' } } }, ) .addPanel( @@ -280,6 +462,7 @@ local filename = 'mimir-queries.json'; '{{stage}}' ) + $.stack + + $.showAllTooltip + { fieldConfig+: { defaults+: { unit: 's' } } }, ) .addPanel( diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet index dd149a93..30582414 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet @@ -130,185 +130,27 @@ local filename = 'mimir-reads.json'; ) ) ) - .addRow( - $.row('Query-frontend') - .addPanel( - $.timeseriesPanel('Requests / sec') + - $.qpsPanel($.queries.query_frontend.readRequestsPerSecond) - ) - .addPanel( - $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) - ) - .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend), $.queries.read_http_routes_regex], '' - ) - ) - ) - .addRow( - local description = ||| -

- The query scheduler is an optional service that moves - the internal queue from the query-frontend into a - separate component. - If this service is not deployed, - these panels will show "No data." -

- |||; - $.row('Query-scheduler') - .addPanel( - local title = 'Requests / sec'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) - ) - .addPanel( - local title = 'Latency (Time in Queue)'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) - ) - .addPanel( - local title = 'Queue length'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.hiddenLegendQueryPanel( - 'sum(min_over_time(cortex_query_scheduler_queue_length{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.query_scheduler)], - 'Queue length' - ) + - { - fieldConfig+: { - defaults+: { - unit: 'queries', - }, - }, - }, - ) - ) - .addRow( - local description = ||| -

- The query scheduler can optionally create subqueues - in order to enforce round-robin query queuing fairness - across additional queue dimensions beyond the default. - - By default, query queuing fairness is only applied by tenant ID. - Queries without additional queue dimensions are labeled 'none'. -

- |||; - local metricName = 'cortex_query_scheduler_queue_duration_seconds'; - local selector = '{%s}' % $.jobMatcher($._config.job_names.query_scheduler); - local labels = ['additional_queue_dimensions']; - local labelReplaceArgSets = [ - { - dstLabel: 'additional_queue_dimensions', - replacement: 'none', - srcLabel: - 'additional_queue_dimensions', - regex: '^$', - }, - ]; - $.row('Query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions') - .addPanel( - local title = '99th Percentile Latency by Queue Dimension'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanelLabelBreakout( - metricName=metricName, - selector=selector, - percentiles=['0.99'], - includeAverage=false, - labels=labels, - labelReplaceArgSets=labelReplaceArgSets, - ) - ) - .addPanel( - local title = '50th Percentile Latency by Queue Dimension'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanelLabelBreakout( - metricName=metricName, - selector=selector, - percentiles=['0.50'], - includeAverage=false, - labels=labels, - labelReplaceArgSets=labelReplaceArgSets, - ) - ) - .addPanel( - local title = 'Average Latency by Queue Dimension'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanelLabelBreakout( - metricName=metricName, - selector=selector, - percentiles=[], - includeAverage=true, - labels=labels, - labelReplaceArgSets=labelReplaceArgSets, - ) - ) - ) - .addRow( - $.row('Cache – query results') - .addPanel( - $.timeseriesPanel('Requests / sec') + - $.queryPanel( - ||| - sum ( - rate(thanos_memcached_operations_total{name="frontend-cache", %(frontend)s}[$__rate_interval]) - or ignoring(backend) - rate(thanos_cache_operations_total{name="frontend-cache", %(frontend)s}[$__rate_interval]) - ) - ||| % { - frontend: $.jobMatcher($._config.job_names.query_frontend), - }, - 'Requests/s' - ) + - { fieldConfig+: { defaults+: { unit: 'ops' } } }, - ) - .addPanel( - 
$.timeseriesPanel('Latency') + - $.backwardsCompatibleLatencyPanel( - 'thanos_memcached_operation_duration_seconds', - 'thanos_cache_operation_duration_seconds', - '{%s, name="frontend-cache"}' % $.jobMatcher($._config.job_names.query_frontend) - ) - ) - ) - .addRow( - $.row('Querier') - .addPanel( - $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.querier), $.queries.read_http_routes_regex]) - ) - .addPanel( - $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) - ) - .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier), $.queries.read_http_routes_regex], '' - ) - ) - ) + .addRows($.commonReadsDashboardsRows( + queryFrontendJobName=$._config.job_names.query_frontend, + querySchedulerJobName=$._config.job_names.query_scheduler, + querierJobName=$._config.job_names.querier, + queryRoutesRegex=$.queries.read_http_routes_regex, + showQueryCacheRow=true, + )) .addRow( $.row('Ingester') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"%s"}' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex]) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', 
$.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', $._config.ingester_read_path_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], '' ) ) ) @@ -339,7 +181,7 @@ local filename = 'mimir-reads.json'; ) .addRowIf( $._config.autoscaling.querier.enabled, - $.row('Querier - autoscaling') + $.row('Querier – autoscaling') .addPanel( local title = 'Replicas'; $.timeseriesPanel(title) + @@ -360,9 +202,6 @@ local filename = 'mimir-reads.json'; ||| max by (scaletargetref_name) ( kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active. - * on (%(cluster_labels)s, horizontalpodautoscaler) - kube_horizontalpodautoscaler_status_condition{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s", condition="ScalingActive", status="true"} # Add the scaletargetref_name label which is more readable than "kube-hpa-..." 
+ on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} @@ -395,8 +234,6 @@ local filename = 'mimir-reads.json'; title, ||| The maximum, and current number of querier replicas. - Please note that the current number of replicas can still show 1 replica even when scaled to 0. - Since HPA never reports 0 replicas, the query will report 0 only if the HPA is not active. ||| ) + { diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet index ee82ed22..df04952f 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet @@ -7,7 +7,7 @@ local filename = 'mimir-remote-ruler-reads-resources.json'; ($.dashboard('Remote ruler reads resources') + { uid: std.md5(filename) }) .addClusterSelectorTemplates(false) .addRow( - $.row('Query-frontend (dedicated to ruler)') + $.row('Ruler-query-frontend') .addPanel( $.containerCPUUsagePanelByComponent('ruler_query_frontend'), ) @@ -19,7 +19,7 @@ local filename = 'mimir-remote-ruler-reads-resources.json'; ) ) .addRow( - $.row('Query-scheduler (dedicated to ruler)') + $.row('Ruler-query-scheduler') .addPanel( $.containerCPUUsagePanelByComponent('ruler_query_scheduler'), ) @@ -31,7 +31,7 @@ local filename = 'mimir-remote-ruler-reads-resources.json'; ) ) .addRow( - $.row('Querier (dedicated to ruler)') + $.row('Ruler-querier') .addPanel( $.containerCPUUsagePanelByComponent('ruler_querier'), ) diff --git 
a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet index df1f48f4..9810193e 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet @@ -53,140 +53,35 @@ local filename = 'mimir-remote-ruler-reads.json'; ), ) ) - .addRow( - $.row('Query-frontend (dedicated to ruler)') - .addPanel( - $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.ruler_query_frontend), rulerRoutesRegex]) - ) - .addPanel( - $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ruler_query_frontend) + [utils.selector.re('route', rulerRoutesRegex)]) - ) - .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler_query_frontend), rulerRoutesRegex], '' - ) - ) - ) - .addRow( - local description = ||| -

- The query scheduler is an optional service that moves - the internal queue from the query-frontend into a - separate component. - If this service is not deployed, - these panels will show "No data." -

- |||; - $.row('Query-scheduler (dedicated to ruler)') - .addPanel( - local title = 'Requests / sec'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ruler_query_scheduler)) - ) - .addPanel( - local title = 'Latency (Time in Queue)'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ruler_query_scheduler)) - ) - .addPanel( - local title = 'Queue length'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.hiddenLegendQueryPanel( - 'sum(min_over_time(cortex_query_scheduler_queue_length{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ruler_query_scheduler)], - 'Queue length' - ) + - { fieldConfig+: { defaults+: { unit: 'queries' } } }, - ) - ) - .addRow( - local description = ||| -

- The query scheduler can optionally create subqueues - in order to enforce round-robin query queuing fairness - across additional queue dimensions beyond the default. + .addRows($.commonReadsDashboardsRows( + queryFrontendJobName=$._config.job_names.ruler_query_frontend, + querySchedulerJobName=$._config.job_names.ruler_query_scheduler, + querierJobName=$._config.job_names.ruler_querier, + queryRoutesRegex=rulerRoutesRegex, - By default, query queuing fairness is only applied by tenant ID. - Queries without additional queue dimensions are labeled 'none'. -

- |||; - local metricName = 'cortex_query_scheduler_queue_duration_seconds'; - local selector = '{%s}' % $.jobMatcher($._config.job_names.ruler_query_scheduler); - local labels = ['additional_queue_dimensions']; - local labelReplaceArgSets = [ - { - dstLabel: 'additional_queue_dimensions', - replacement: 'none', - srcLabel: - 'additional_queue_dimensions', - regex: '^$', - }, - ]; - $.row('Query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions') - .addPanel( - local title = '99th Percentile Latency by Queue Dimension'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanelLabelBreakout( - metricName=metricName, - selector=selector, - percentiles=['0.99'], - includeAverage=false, - labels=labels, - labelReplaceArgSets=labelReplaceArgSets, - ) - ) + rowTitlePrefix='Ruler-', + )) + .addRowIf( + $._config.autoscaling.ruler_querier.enabled, + $.row('Ruler-querier - autoscaling') .addPanel( - local title = '50th Percentile Latency by Queue Dimension'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanelLabelBreakout( - metricName=metricName, - selector=selector, - percentiles=['0.50'], - includeAverage=false, - labels=labels, - labelReplaceArgSets=labelReplaceArgSets, - ) + $.autoScalingActualReplicas('ruler_querier') ) .addPanel( - local title = 'Average Latency by Queue Dimension'; - $.timeseriesPanel(title) + - $.panelDescription(title, description) + - $.latencyPanelLabelBreakout( - metricName=metricName, - selector=selector, - percentiles=[], - includeAverage=true, - labels=labels, - labelReplaceArgSets=labelReplaceArgSets, - ) + $.autoScalingFailuresPanel('ruler_querier') ) ) - .addRow( - $.row('Querier (dedicated to ruler)') + .addRowIf( + $._config.autoscaling.ruler_querier.enabled, + $.row('') .addPanel( - $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"%s"}' % 
[$.jobMatcher($._config.job_names.ruler_querier), $.queries.read_http_routes_regex]) + $.autoScalingDesiredReplicasByAverageValueScalingMetricPanel('ruler_querier', 'CPU', 'cpu') ) .addPanel( - $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.ruler_querier) + [utils.selector.re('route', $.queries.read_http_routes_regex)]) + $.autoScalingDesiredReplicasByAverageValueScalingMetricPanel('ruler_querier', 'memory', 'memory') ) .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler_querier), $.queries.read_http_routes_regex], '' - ) + $.autoScalingDesiredReplicasByAverageValueScalingMetricPanel('ruler_querier', 'in-flight queries', 'queries') ) - ) - .addRowIf( - $._config.autoscaling.ruler_querier.enabled, - $.cpuAndMemoryBasedAutoScalingRow('Ruler-Querier'), ), } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/rollout-progress.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/rollout-progress.libsonnet index 4ada30bc..0aeba3f1 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/rollout-progress.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -101,6 +101,7 @@ local filename = 'mimir-rollout-progress.json'; min=0, max=1 ) + + $.showAllTooltip + { id: 1, gridPos: { h: 13, w: 10, x: 0, y: 0 }, @@ -123,9 +124,6 @@ local filename = 'mimir-rollout-progress.json'; options+: { xField: 'Workload', orientation: 'horizontal', - tooltip+: { - mode: 'multi', - }, }, 
transformations: [ { diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/ruler.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/ruler.libsonnet index 9190a413..f1e80769 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/ruler.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/ruler.libsonnet @@ -162,9 +162,9 @@ local filename = 'mimir-ruler.json'; .addPanel( $.timeseriesPanel('Queue length') + $.queryPanel(||| - sum by(user) (rate(cortex_prometheus_notifications_queue_length{%s}[$__rate_interval])) + sum by(user) (cortex_prometheus_notifications_queue_length{%s}) / - sum by(user) (rate(cortex_prometheus_notifications_queue_capacity{%s}[$__rate_interval])) > 0 + sum by(user) (cortex_prometheus_notifications_queue_capacity{%s}) > 0 ||| % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } }, ) diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/scaling.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/scaling.libsonnet index 526ccc9e..ecad60e4 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/scaling.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/scaling.libsonnet @@ -50,7 +50,7 @@ local filename = 'mimir-scaling.json'; ) ||| % [$._config.alert_aggregation_rule_prefix, $.namespaceMatcher(), $._config.alert_aggregation_rule_prefix, $.namespaceMatcher()], ], { - __name__: { alias: 'Cluster', type: 'hidden' }, + __name__: { type: 'hidden' }, cluster: { alias: 'Cluster' }, namespace: { 
alias: 'Namespace' }, deployment: { alias: 'Service' }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet index b133f01f..1862fadc 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet @@ -7,13 +7,13 @@ local filename = 'mimir-slow-queries.json'; ($.dashboard('Slow queries') + { uid: std.md5(filename) }) .addClusterSelectorTemplates(false) .addRow( - $.row('Accross tenants') + $.row('Across tenants') .addPanel( $.timeseriesPanel('Response time') + $.lokiMetricsQueryPanel( [ - 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], - 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], + 'quantile_over_time(0.5, 
{%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], ], ['p99', 'p50'], unit='s', @@ -23,8 +23,8 @@ local filename = 'mimir-slow-queries.json'; $.timeseriesPanel('Fetched series') + $.lokiMetricsQueryPanel( [ - 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], - 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], + 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], ], ['p99', 'p50'], ) @@ -33,8 +33,8 @@ local filename = 'mimir-slow-queries.json'; $.timeseriesPanel('Fetched chunks') + 
$.lokiMetricsQueryPanel( [ - 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], - 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], + 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], ], ['p99', 'p50'], unit='bytes', @@ -44,8 +44,8 @@ local filename = 'mimir-slow-queries.json'; $.timeseriesPanel('Response size') + $.lokiMetricsQueryPanel( [ - 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], - 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != 
"/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], + 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], ], ['p99', 'p50'], unit='bytes', @@ -55,8 +55,8 @@ local filename = 'mimir-slow-queries.json'; $.timeseriesPanel('Time span') + $.lokiMetricsQueryPanel( [ - 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], - 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap 
duration_seconds(length) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], + 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], ], ['p99', 'p50'], unit='s', @@ -66,8 +66,8 @@ local filename = 'mimir-slow-queries.json'; $.timeseriesPanel('Query wall time') + $.lokiMetricsQueryPanel( [ - 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], - 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], + 'quantile_over_time(0.5, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by ()' % [$._config.per_cluster_label, 
$._config.per_namespace_label, $._config.per_component_loki_label], ], ['p99', 'p50'], unit='s', @@ -87,7 +87,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 response time') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user}}', unit='s', ) @@ -95,14 +95,14 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 fetched series') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user}}', ) ) .addPanel( $.timeseriesPanel('P99 fetched chunks') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, 
{%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user}}', unit='bytes', ) @@ -110,7 +110,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 response size') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user}}', unit='bytes', ) @@ -118,7 +118,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 time span') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by 
(user))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user}}', unit='s', ) @@ -126,7 +126,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 query wall time') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user}}', unit='s', ) @@ -146,7 +146,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 response time') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | 
logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(response_time) [$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user_agent}}', unit='s', ) @@ -154,14 +154,14 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 fetched series') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_series_count[$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user_agent}}', ) ) .addPanel( $.timeseriesPanel('P99 fetched chunks') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap fetched_chunk_bytes[$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], 
'{{user_agent}}', unit='bytes', ) @@ -169,7 +169,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 response size') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap response_size_bytes[$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user_agent}}', unit='bytes', ) @@ -177,7 +177,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 time span') + $.lokiMetricsQueryPanel( - 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap duration_seconds(length) [$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user_agent}}', unit='s', ) @@ -185,7 +185,7 @@ local filename = 'mimir-slow-queries.json'; .addPanel( $.timeseriesPanel('P99 query wall time') + $.lokiMetricsQueryPanel( - 'topk(10, 
quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label], + 'topk(10, quantile_over_time(0.99, {%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | unwrap query_wall_time_seconds [$__auto]) by (user_agent))' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label], '{{user_agent}}', unit='s', ) @@ -218,7 +218,7 @@ local filename = 'mimir-slow-queries.json'; 'length_seconds="{{ if .length }} {{ duration .length }} {{ end }}"', ], // Filter out the remote read endpoint. - expr: '{%s=~"$cluster",%s=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | label_format %s' % [$._config.per_cluster_label, $._config.per_namespace_label, std.join(',', extraFields)], + expr: '{%s=~"$cluster",%s=~"$namespace",%s=~"$component.*"} |= "query stats" != "/api/v1/read" | logfmt | user=~"${tenant_id}" | user_agent=~"${user_agent}" | response_time > ${min_duration} | label_format %s' % [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_component_loki_label, std.join(',', extraFields)], instant: false, legendFormat: '', range: true, @@ -400,6 +400,30 @@ local filename = 'mimir-slow-queries.json'; }, query: defaultValue, }, + { + multi: false, + name: 'component', + label: 'Component', + type: 'custom', + current: { + selected: true, + text: 'query-frontend', + value: 'query-frontend', + }, + options: [ + { + selected: true, + text: 'query-frontend', + value: 'query-frontend', + }, + { + selected: 
false, + text: 'ruler-query-frontend', + value: 'ruler-query-frontend', + }, + ], + query: 'query-frontend, ruler-query-frontend', + }, ], }, } + { diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/tenants.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/tenants.libsonnet index c50005fe..5d1c185d 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/tenants.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/tenants.libsonnet @@ -1,7 +1,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; local filename = 'mimir-tenants.json'; -(import 'dashboard-utils.libsonnet') { +(import 'dashboard-utils.libsonnet') + +(import 'dashboard-queries.libsonnet') { local user_limits_overrides_query(limit_name) = ||| max(cortex_limits_overrides{%(overrides_exporter)s, limit_name="%(limit_name)s", user="$user"}) or @@ -41,54 +42,19 @@ local filename = 'mimir-tenants.json'; $.timeseriesPanel(title) + $.queryPanel( [ - ||| - sum( - ( - cortex_ingester_memory_series_created_total{%(ingester)s, user="$user"} - - cortex_ingester_memory_series_removed_total{%(ingester)s, user="$user"} - ) - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) + local perIngesterInMemorySeries = ||| + ( + cortex_ingester_memory_series_created_total{%(ingester)s, user="$user"} + - cortex_ingester_memory_series_removed_total{%(ingester)s, user="$user"} ) ||| % { ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + }; + $.queries.ingester.ingestOrClassicDeduplicatedQuery(perIngesterInMemorySeries), user_limits_overrides_query('max_global_series_per_user'), - ||| - sum( - 
cortex_ingester_active_series{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, - ||| - sum( - cortex_ingester_owned_series{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, - ||| - sum by (name) ( - cortex_ingester_active_series_custom_tracker{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) > 0 - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_series{%s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)]), + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_owned_series{%s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)]), + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_series_custom_tracker{%s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)], groupByLabels='name'), ], [ 'in-memory', @@ -130,6 +96,7 @@ local filename = 'mimir-tenants.json'; '{{pod}}', ], ) + + $.showAllTooltip + { fieldConfig+: { defaults+: { custom+: { fillOpacity: 0 } }, @@ -141,10 +108,6 @@ local filename = 'mimir-tenants.json'; ], }, options+: { - tooltip+: { - mode: 'multi', - sort: 'desc', - }, legend+: { showLegend: 
false }, }, } + @@ -178,6 +141,7 @@ local filename = 'mimir-tenants.json'; '{{pod}}', ], ) + + $.showAllTooltip + { fieldConfig+: { defaults+: { custom+: { fillOpacity: 0 } }, @@ -189,10 +153,6 @@ local filename = 'mimir-tenants.json'; ], }, options+: { - tooltip+: { - mode: 'multi', - sort: 'desc', - }, legend+: { showLegend: false }, }, } + @@ -213,17 +173,7 @@ local filename = 'mimir-tenants.json'; local title = 'Series with exemplars'; $.timeseriesPanel(title) + $.queryPanel( - ||| - sum( - cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{%(ingester)s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)]), 'series', ) + { options+: { legend+: { showLegend: false } } } + @@ -261,28 +211,8 @@ local filename = 'mimir-tenants.json'; $.timeseriesPanel(title) + $.queryPanel( [ - ||| - sum( - cortex_ingester_active_native_histogram_series{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, - ||| - sum by (name) ( - cortex_ingester_active_native_histogram_series_custom_tracker{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) > 0 - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - 
distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_native_histogram_series{%(ingester)s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)]), + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_native_histogram_series_custom_tracker{%(ingester)s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)], groupByLabels='name'), ], [ 'active', @@ -303,28 +233,8 @@ local filename = 'mimir-tenants.json'; $.timeseriesPanel(title) + $.queryPanel( [ - ||| - sum( - cortex_ingester_active_native_histogram_buckets{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, - ||| - sum by (name) ( - cortex_ingester_active_native_histogram_buckets_custom_tracker{%(ingester)s, user="$user"} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) > 0 - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_native_histogram_buckets{%(ingester)s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)]), + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_native_histogram_buckets_custom_tracker{%(ingester)s, user="$user"}' % [$.jobMatcher($._config.job_names.ingester)], groupByLabels='name'), ], [ 'buckets', @@ -562,17 +472,7 @@ local filename = 'mimir-tenants.json'; local title = 'Ingester appended exemplars rate'; 
$.timeseriesPanel(title) + $.queryPanel( - ||| - sum( - rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{%(ingester)s, user="$user"}[$__rate_interval]) - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + $.queries.ingester.ingestOrClassicDeduplicatedQuery('rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total{%(ingester)s, user="$user"}[$__rate_interval])' % [$.jobMatcher($._config.job_names.ingester)]), 'rate', ) + { options+: { legend+: { showLegend: false } } } + @@ -856,7 +756,8 @@ local filename = 'mimir-tenants.json'; (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{%s}[$__rate_interval])) == 0) ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], '{{ job }}', - ) + { + ) + + $.showAllTooltip + { fieldConfig+: { defaults+: { custom+: { @@ -865,11 +766,6 @@ local filename = 'mimir-tenants.json'; }, }, }, - options+: { - tooltip+: { - mode: 'multi', - }, - }, } + $.panelDescription( 'Estimated Compaction Jobs', diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet index 4359ee3d..676e0de3 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet @@ -1,23 +1,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; local filename = 'mimir-top-tenants.json'; -(import 'dashboard-utils.libsonnet') { - local 
in_memory_series_per_user_query(at='') = ||| - sum by (user) ( +(import 'dashboard-utils.libsonnet') + +(import 'dashboard-queries.libsonnet') { + local in_memory_series_per_user_query(at='') = ( + local perIngesterQuery = ||| ( - sum by (user, %(group_by_cluster)s) (cortex_ingester_memory_series_created_total{%(ingester)s} %(at)s) + cortex_ingester_memory_series_created_total{%(ingester)s} %(at)s - - sum by (user, %(group_by_cluster)s) (cortex_ingester_memory_series_removed_total{%(ingester)s} %(at)s) + cortex_ingester_memory_series_removed_total{%(ingester)s} %(at)s ) - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s} %(at)s) - ) - ||| % { - at: at, - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, + ||| % { + at: at, + ingester: $.jobMatcher($._config.job_names.ingester), + }; + $.queries.ingester.ingestOrClassicDeduplicatedQuery(perIngesterQuery, groupByLabels='user') + ), [filename]: assert std.md5(filename) == 'bc6e12d4fe540e4a1785b9d3ca0ffdd9' : 'UID of the dashboard has changed, please update references to dashboard.'; @@ -46,20 +44,14 @@ local filename = 'mimir-top-tenants.json'; $.tablePanel( [ ||| - topk($limit, - sum by (user) ( - cortex_ingester_active_series{%(ingester)s} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ) - ||| % { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, - ], - { 'Value #A': { alias: 'series' } } + topk($limit, %s) + ||| % [ + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_active_series{%s}' % [$.jobMatcher($._config.job_names.ingester)], groupByLabels='user'), + ], + ], { + user: { alias: 'user', unit: 
'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -72,8 +64,10 @@ local filename = 'mimir-top-tenants.json'; $.tablePanel( [ 'topk($limit, %(in_memory_series_per_user)s)' % { in_memory_series_per_user: in_memory_series_per_user_query() }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -85,9 +79,17 @@ local filename = 'mimir-top-tenants.json'; $.timeseriesPanel(title) + $.queryPanel( ||| - %(in_memory_series_per_user)s + (%(in_memory_series_per_user)s) and - topk($limit, %(in_memory_series_per_user_at_end)s - %(in_memory_series_per_user_at_start)s) + topk($limit, + ( + %(in_memory_series_per_user_at_end)s + ) + - + ( + %(in_memory_series_per_user_at_start)s + ) + ) ||| % { in_memory_series_per_user: in_memory_series_per_user_query(), in_memory_series_per_user_at_end: in_memory_series_per_user_query(at='@ end()'), @@ -107,8 +109,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_distributor_received_samples_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'samples/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'samples/s' }, + } ) ), ) @@ -143,8 +147,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_discarded_samples_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.ingester + $._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'samples/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'samples/s' }, + } ) ), ) @@ -178,20 +184,14 @@ local filename = 'mimir-top-tenants.json'; $.tablePanel( [ ||| - topk($limit, - sum by (user) ( - cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{%(ingester)s} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ) - ||| % { - 
ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, - }, - ], - { 'Value #A': { alias: 'series' } } + topk($limit, %s) + ||| % [ + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{%s}' % [$.jobMatcher($._config.job_names.ingester)], groupByLabels='user'), + ], + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -205,8 +205,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_distributor_received_exemplars_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'exemplars/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'exemplars/s' }, + } ) ), ) @@ -221,8 +223,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{%(job)s}))' % { job: $.jobMatcher($._config.job_names.ruler) }, - ], - { 'Value #A': { alias: 'rules' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'rules' }, + } ) ), ) @@ -236,8 +240,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_last_duration_seconds{%(job)s}))' % { job: $.jobMatcher($._config.job_names.ruler) }, - ], - { 'Value #A': { alias: 'seconds' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'seconds' }, + } ) ) ) @@ -256,8 +262,10 @@ local filename = 'mimir-top-tenants.json'; (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{%s}[$__rate_interval])) == 0) ) ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], - ], - { Value: { alias: 'Compaction Jobs', decimals: 0 } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'Compaction Jobs', 
decimals: 0 }, + } ) ), ), diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes-resources.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes-resources.libsonnet index aba99c5c..b0ea0617 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes-resources.libsonnet @@ -57,10 +57,8 @@ local filename = 'mimir-writes-resources.json'; 'sum by(%s) (cortex_ingester_memory_series{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '{{%s}}' % $._config.per_instance_label ) + + $.showAllTooltip + { - options+: { - tooltip+: { sort: 'desc' }, - }, fieldConfig+: { defaults+: { custom+: { diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet index 9d80feb0..25064a6a 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet @@ -3,6 +3,7 @@ local filename = 'mimir-writes.json'; (import 'dashboard-utils.libsonnet') + (import 'dashboard-queries.libsonnet') { + [filename]: assert std.md5(filename) == '8280707b8f16e7b87b840fc1cc92d4c5' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Writes') + { uid: std.md5(filename) }) @@ -60,36 +61,32 @@ local filename = 'mimir-writes.json'; .addPanel( local title = 'In-memory series'; $.panel(title) + - $.statPanel(||| - sum(cortex_ingester_memory_series{%(ingester)s} - / on(%(group_by_cluster)s) 
group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s})) - ||| % ($._config) { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - }, format='short') + + $.statPanel( + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_memory_series{%s}' % [$.jobMatcher($._config.job_names.ingester)]), + format='short' + ) + $.panelDescription( title, ||| The number of series not yet flushed to object storage that are held in ingester memory. + With classic storage the sum of series from all ingesters is divided by the replication factor. + With ingest storage we take the maximum series of each ingest partition. ||| ), ) .addPanel( local title = 'Exemplars in ingesters'; $.panel(title) + - $.statPanel(||| - sum(cortex_ingester_tsdb_exemplar_exemplars_in_storage{%(ingester)s} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s})) - ||| % ($._config) { - ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - }, format='short') + + $.statPanel( + $.queries.ingester.ingestOrClassicDeduplicatedQuery('cortex_ingester_tsdb_exemplar_exemplars_in_storage{%s}' % [$.jobMatcher($._config.job_names.ingester)]), + format='short' + ) + $.panelDescription( title, ||| Number of TSDB exemplars currently in ingesters' storage. + With classic storage the sum of exemplars from all ingesters is divided by the replication factor. + With ingest storage we take the maximum exemplars of each ingest partition. 
||| ), ) @@ -162,10 +159,39 @@ local filename = 'mimir-writes.json'; 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], '' ) ) + .addPanelIf( + $._config.show_ingest_storage_panels, + $.timeseriesPanel('Sync write to Kafka latency (ingest storage)') + + $.panelDescription( + 'Sync write to Kafka latency (ingest storage)', + ||| + Latency of synchronous write operation used to store data into Kafka. + ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) ) .addRowsIf(std.objectHasAll($._config.injectRows, 'postDistributor'), $._config.injectRows.postDistributor($)) - .addRow( - $.row('Ingester') + .addRowIf( + $._config.show_grpc_ingestion_panels, + ($.row('Ingester')) .addPanel( $.timeseriesPanel('Requests / sec') + $.panelDescription( @@ -206,6 +232,153 @@ local filename = 'mimir-writes.json'; ) ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester – Kafka records processing (ingest storage)')) + .addPanel( + $.timeseriesPanel('Kafka fetches / sec') + + 
$.panelDescription( + 'Kafka fetches / sec', + ||| + Rate of fetches received from Kafka brokers. A fetch can contain multiple records (a write request received on the write path is mapped into a single record). + Read errors are any errors reported on connection to Kafka brokers, and are separate from "failed" fetches. + ||| + ) + + $.queryPanel( + [ + ||| + sum (rate (cortex_ingest_storage_reader_fetches_total{%s}[$__rate_interval])) + - + sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + // cortex_ingest_storage_reader_read_errors_total metric is reported by Kafka client. + 'sum (rate (cortex_ingest_storage_reader_read_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + 'read errors', + ], + ) + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed, 'read errors': $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Kafka records / sec') + + $.panelDescription( + 'Kafka records / sec', + ||| + Rate of processed records from Kafka. Failed records are categorized as "client" errors (e.g. per-tenant limits) or server errors. 
+ ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_records_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s, cause="client"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s, cause="server"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed (client)', + 'failed (server)', + ], + ) + $.aliasColors({ successful: $._colors.success, 'failed (client)': $._colors.clientError, 'failed (server)': $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Kafka record processing latency') + + $.panelDescription( + 'Kafka record processing latency', + ||| + Time used to process a single record (write request). This time is spent by appending data to per-tenant TSDB. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + $.row('Ingester – end-to-end latency (ingest storage)') + .addPanel( + $.ingestStorageIngesterEndToEndLatencyWhenRunningPanel(), + ) + .addPanel( + $.ingestStorageIngesterEndToEndLatencyWhenStartingPanel(), + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester – last consumed offset (ingest storage)')) + .addPanel( + $.timeseriesPanel('Last consumed offset commits / sec') + + $.panelDescription( + 'Last consumed offset commits / sec', + ||| + Rate of "last consumed offset" commits issued by ingesters to Kafka. 
+ ||| + ) + + $.queryPanel( + [ + ||| + sum (rate (cortex_ingest_storage_reader_offset_commit_requests_total{%s}[$__rate_interval])) + - + sum (rate (cortex_ingest_storage_reader_offset_commit_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_offset_commit_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Last consumed offset commits latency') + + $.panelDescription( + 'Last consumed offset commits latency', + ||| + Time spent to commit "last consumed offset" by ingesters to Kafka. + ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) .addRowIf( $._config.gateway_enabled && $._config.autoscaling.gateway.enabled, $.cpuAndMemoryBasedAutoScalingRow('Gateway'), @@ -214,6 +387,70 @@ local filename = 'mimir-writes.json'; $._config.autoscaling.distributor.enabled, $.cpuAndMemoryBasedAutoScalingRow('Distributor'), ) 
+ .addRowIf( + $._config.autoscaling.ingester.enabled, + $.row('Ingester – autoscaling') + .addPanel( + $.autoScalingActualReplicas('ingester') + { title: 'Replicas (leader zone)' } + + $.panelDescription( + 'Replicas (leader zone)', + ||| + The minimum, maximum, and current number of replicas for the leader zone of ingesters. + Other zones scale to follow this zone (with delay for downscale). + ||| + ) + ) + .addPanel( + $.timeseriesPanel('Replicas') + + $.panelDescription('Replicas', 'Number of ingester replicas per zone.') + + $.queryPanel( + [ + 'sum by (%s) (up{%s})' % [$._config.per_job_label, $.jobMatcher($._config.job_names.ingester)], + ], + [ + '{{ %(per_job_label)s }}' % $._config.per_job_label, + ], + ), + ) + .addPanel( + $.autoScalingDesiredReplicasByValueScalingMetricPanel('ingester', '', '') + { title: 'Desired replicas (leader zone)' } + ) + .addPanel( + $.autoScalingFailuresPanel('ingester') + { title: 'Autoscaler failures rate' } + ), + ) + .addRowIf( + $._config.show_ingest_storage_panels && $._config.autoscaling.ingester.enabled, + $.row('Ingester – autoscaling (ingest storage)') + .addPanel( + $.autoScalingActualReplicas('ingester') + { title: 'Replicas (ReplicaTemplate)' } + + $.panelDescription( + 'Replicas (ReplicaTemplate)', + ||| + The minimum, maximum, and current number of replicas for the ReplicaTemplate object. + Rollout-operator will keep ingester replicas updated based on this object. 
+ ||| + ) + ) + .addPanel( + $.timeseriesPanel('Replicas') + + $.panelDescription('Replicas', 'Number of ingester replicas.') + + $.queryPanel( + [ + 'sum by (%s) (up{%s})' % [$._config.per_job_label, $.jobMatcher($._config.job_names.ingester)], + ], + [ + '{{ %(per_job_label)s }}' % $._config.per_job_label, + ], + ), + ) + .addPanel( + $.autoScalingDesiredReplicasByAverageValueScalingMetricPanel('ingester', '', '') + { title: 'Desired replicas (ReplicaTemplate)' } + ) + .addPanel( + $.autoScalingFailuresPanel('ingester') + { title: 'Autoscaler failures rate (ReplicaTemplate)' } + ), + ) .addRow( $.kvStoreRow('Distributor - key-value store for high-availability (HA) deduplication', 'distributor', 'distributor-hatracker') ) @@ -224,7 +461,7 @@ local filename = 'mimir-writes.json'; $.kvStoreRow('Ingester - key-value store for the ingesters ring', 'ingester', 'ingester-.*') ) .addRow( - $.row('Ingester - shipper') + $.row('Ingester – shipper') .addPanel( $.timeseriesPanel('Uploaded blocks / sec') + $.successFailurePanel( @@ -253,7 +490,7 @@ local filename = 'mimir-writes.json'; ) ) .addRow( - $.row('Ingester - TSDB head') + $.row('Ingester – TSDB head') .addPanel( $.timeseriesPanel('Compactions / sec') + $.successFailurePanel( @@ -283,7 +520,7 @@ local filename = 'mimir-writes.json'; ) ) .addRow( - $.row('Ingester - TSDB write ahead log (WAL)') + $.row('Ingester – TSDB write ahead log (WAL)') .addPanel( $.timeseriesPanel('WAL truncations / sec') + $.successFailurePanel( @@ -388,18 +625,10 @@ local filename = 'mimir-writes.json'; local title = 'Ingester ingested exemplars rate'; $.timeseriesPanel(title) + $.queryPanel( - ||| - sum( - %(group_prefix_jobs)s:cortex_ingester_ingested_exemplars:rate5m{%(ingester)s} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { + 
$.queries.ingester.ingestOrClassicDeduplicatedQuery('%(group_prefix_jobs)s:cortex_ingester_ingested_exemplars:rate5m{%(ingester)s}' % { ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, group_prefix_jobs: $._config.group_prefix_jobs, - }, + }), 'ingested exemplars', ) + { fieldConfig+: { defaults+: { unit: 'ex/s' } } } + @@ -407,7 +636,8 @@ local filename = 'mimir-writes.json'; title, ||| The rate of exemplars ingested in the ingesters. - Every exemplar is sent to the replication factor number of ingesters, so the sum of rates from all ingesters is divided by the replication factor. + Every exemplar is replicated to a number of ingesters. With classic storage the sum of rates from all ingesters is divided by the replication factor. + With ingest storage we take the maximum rate of each ingest partition. This ingested exemplars rate should match the distributor's received exemplars rate. 
||| ), @@ -416,18 +646,10 @@ local filename = 'mimir-writes.json'; local title = 'Ingester appended exemplars rate'; $.timeseriesPanel(title) + $.queryPanel( - ||| - sum( - %(group_prefix_jobs)s:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{%(ingester)s} - / on(%(group_by_cluster)s) group_left - max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s}) - ) - ||| % { + $.queries.ingester.ingestOrClassicDeduplicatedQuery('%(group_prefix_jobs)s:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m{%(ingester)s}' % { ingester: $.jobMatcher($._config.job_names.ingester), - distributor: $.jobMatcher($._config.job_names.distributor), - group_by_cluster: $._config.group_by_cluster, group_prefix_jobs: $._config.group_prefix_jobs, - }, + }), 'appended exemplars', ) + { fieldConfig+: { defaults+: { unit: 'ex/s' } } } + diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json index 2dde1800..a91499ef 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "bf12954197422f36f0803ee217e378ad055f3837", + "sum": "EEPwMLfUIJT9iEUI/gCW9x6PxWoTBPSJOfabTF4rp1M=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "bf12954197422f36f0803ee217e378ad055f3837", + "sum": "Qg992ZB0jkrS+YLq0Q7RV1fSHa8+hQT0jbpTyCGE2NI=" } ], "legacyImports": false diff --git 
a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet index 567bae04..5f0bb2a6 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet @@ -333,9 +333,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: [ { // cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance - record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label], + record: '%s_%s:cortex_ingester_ingested_samples_total:rate%s' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label, $._config.recording_rules_range_interval], expr: ||| - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[%(recording_rules_range_interval)s])) ||| % $._config, }, ],