diff --git a/compose.override.yaml b/compose.override.yaml index 07f9aeef..044f141a 100644 --- a/compose.override.yaml +++ b/compose.override.yaml @@ -45,7 +45,7 @@ services: volumes: - ./monitoring-mixins/pyroscope-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/pyroscope-mixin - ./monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin - # - ./monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin + - ./monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin environment: GF_LOG_LEVEL: ${GF_LOG_LEVEL:-warn} GF_DIAGNOSTICS_PROFILING_ENABLED: true @@ -83,8 +83,8 @@ services: - ./monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml - ./monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml - ./monitoring-mixins/pyroscope-mixin/deploy/pyroscope-mixin-rules.yaml:/rules/pyroscope-mixin-rules.yaml - # - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml - # - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml + - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml + - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml # override compose.yaml included service pyroscope's labels and environment to enabled traces data collection pyroscope: diff --git a/docker-compose/microservices-mode/traces/grafana.override.yaml b/docker-compose/microservices-mode/traces/grafana.override.yaml index 325fbfe0..aa8d558f 100644 --- a/docker-compose/microservices-mode/traces/grafana.override.yaml +++ b/docker-compose/microservices-mode/traces/grafana.override.yaml @@ -4,5 +4,5 @@ services: grafana: volumes: - ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin - # - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin + - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin diff --git a/docker-compose/microservices-mode/traces/mimirtool.override.yaml b/docker-compose/microservices-mode/traces/mimirtool.override.yaml index f45eae45..3c6326a9 100644 --- a/docker-compose/microservices-mode/traces/mimirtool.override.yaml +++ b/docker-compose/microservices-mode/traces/mimirtool.override.yaml @@ -5,5 +5,5 @@ services: volumes: - ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml - ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml - # - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml - # - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml + - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml + - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml diff --git a/docker-compose/monolithic-mode/all-in-one/grafana.override.yaml b/docker-compose/monolithic-mode/all-in-one/grafana.override.yaml index 9a1e65cd..9e9375cc 100644 --- a/docker-compose/monolithic-mode/all-in-one/grafana.override.yaml +++ b/docker-compose/monolithic-mode/all-in-one/grafana.override.yaml @@ -12,7 +12,7 @@ services: volumes: - 
../../../monitoring-mixins/pyroscope-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/pyroscope-mixin - ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin - # - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin + - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin environment: GF_LOG_LEVEL: ${GF_LOG_LEVEL:-warn} GF_DIAGNOSTICS_PROFILING_ENABLED: true diff --git a/docker-compose/monolithic-mode/all-in-one/mimirtool.override.yaml b/docker-compose/monolithic-mode/all-in-one/mimirtool.override.yaml index 29012b7c..09ee4c1b 100644 --- a/docker-compose/monolithic-mode/all-in-one/mimirtool.override.yaml +++ b/docker-compose/monolithic-mode/all-in-one/mimirtool.override.yaml @@ -6,5 +6,5 @@ services: - ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml - ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml - ../../../monitoring-mixins/pyroscope-mixin/deploy/pyroscope-mixin-rules.yaml:/rules/pyroscope-mixin-rules.yaml - # - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml - # - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml + - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml + - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml diff --git a/docker-compose/monolithic-mode/traces/grafana.override.yaml b/docker-compose/monolithic-mode/traces/grafana.override.yaml index 325fbfe0..aa8d558f 100644 --- a/docker-compose/monolithic-mode/traces/grafana.override.yaml +++ b/docker-compose/monolithic-mode/traces/grafana.override.yaml @@ -4,5 +4,5 @@ services: grafana: volumes: - ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin - # - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin + - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin diff --git a/docker-compose/monolithic-mode/traces/mimirtool.override.yaml b/docker-compose/monolithic-mode/traces/mimirtool.override.yaml index f45eae45..3c6326a9 100644 --- a/docker-compose/monolithic-mode/traces/mimirtool.override.yaml +++ b/docker-compose/monolithic-mode/traces/mimirtool.override.yaml @@ -5,5 +5,5 @@ services: volumes: - ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml - ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml - # - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml - # - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml + - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml + - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml diff --git a/monitoring-mixins/agent-flow-mixin/deploy/agent-flow-mixin-alerts.yaml b/monitoring-mixins/agent-flow-mixin/deploy/agent-flow-mixin-alerts.yaml index 36976cac..cbd36ade 100644 --- a/monitoring-mixins/agent-flow-mixin/deploy/agent-flow-mixin-alerts.yaml +++ b/monitoring-mixins/agent-flow-mixin/deploy/agent-flow-mixin-alerts.yaml @@ 
-3,20 +3,20 @@ groups: rules: - alert: ClusterNotConverging annotations: - message: Cluster is not converging. + message: 'Cluster is not converging: nodes report different number of peers in the cluster.' expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0 for: 10m - - alert: ClusterSplitBrain + - alert: ClusterNodeCountMismatch annotations: - message: Cluster nodes have entered a split brain state. + message: Nodes report different number of peers vs. the count of observed agent metrics. Some agent metrics may be missing or the cluster is in a split brain state. expr: | sum without (state) (cluster_node_peers) != on (cluster, namespace) group_left count by (cluster, namespace) (cluster_node_info) - for: 10m + for: 15m - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a health score > 0. + message: Cluster node is reporting a gossip protocol health score > 0. expr: | cluster_node_gossip_health_score > 0 for: 10m diff --git a/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-opentelemetry.json b/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-opentelemetry.json index b75f8ef5..51116f7b 100644 --- a/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-opentelemetry.json +++ b/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-opentelemetry.json @@ -29,6 +29,17 @@ { "datasource": "${datasource}", "description": "Number of spans successfully pushed into the pipeline.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, "gridPos": { "h": 10, "w": 8, @@ -82,11 +93,6 @@ { "datasource": "${datasource}", "description": "The duration of inbound RPCs.\n", - "fieldConfig": { - "defaults": { - "unit": "milliseconds" - } - }, "gridPos": { "h": 10, "w": 8, @@ -115,7 +121,7 @@ "yHistogram": true }, "yAxis": { - "unit": "s" + "unit": "ms" } }, "pluginVersion": "9.0.6", @@ -129,7 +135,7 @@ "range": true } ], - "title": "RPC server duration (traces)", + "title": "RPC server duration", "type": "heatmap" }, { @@ -140,12 +146,17 @@ "x": 0, "y": 10 }, - "title": "Batching [otelcol.processor.batch]", + "title": "Batching of logs, metrics, and traces [otelcol.processor.batch]", "type": "row" }, { "datasource": "${datasource}", - "description": "Number of units in the batch\n", + "description": "Number of spans, metric datapoints, or log lines in a batch\n", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, "gridPos": { "h": 10, "w": 8, @@ -174,7 +185,7 @@ "yHistogram": true }, "yAxis": { - "unit": "s" + "unit": "short" } }, "pluginVersion": "9.0.6", @@ -247,6 +258,17 @@ { "datasource": "${datasource}", "description": "Number of spans successfully sent to destination.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, "gridPos": { "h": 10, "w": 8, @@ -268,6 +290,17 @@ { "datasource": "${datasource}", "description": "Number of spans in failed attempts to send to destination.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, "gridPos": { "h": 10, "w": 8, diff --git a/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml index 20de119a..03abee20 100644 --- 
a/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml +++ b/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -1438,6 +1438,17 @@ data: { "datasource": "${datasource}", "description": "Number of spans successfully pushed into the pipeline.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, "gridPos": { "h": 10, "w": 8, @@ -1491,11 +1502,6 @@ data: { "datasource": "${datasource}", "description": "The duration of inbound RPCs.\n", - "fieldConfig": { - "defaults": { - "unit": "milliseconds" - } - }, "gridPos": { "h": 10, "w": 8, @@ -1524,7 +1530,7 @@ data: "yHistogram": true }, "yAxis": { - "unit": "s" + "unit": "ms" } }, "pluginVersion": "9.0.6", @@ -1538,7 +1544,7 @@ data: "range": true } ], - "title": "RPC server duration (traces)", + "title": "RPC server duration", "type": "heatmap" }, { @@ -1549,12 +1555,17 @@ data: "x": 0, "y": 10 }, - "title": "Batching [otelcol.processor.batch]", + "title": "Batching of logs, metrics, and traces [otelcol.processor.batch]", "type": "row" }, { "datasource": "${datasource}", - "description": "Number of units in the batch\n", + "description": "Number of spans, metric datapoints, or log lines in a batch\n", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, "gridPos": { "h": 10, "w": 8, @@ -1583,7 +1594,7 @@ data: "yHistogram": true }, "yAxis": { - "unit": "s" + "unit": "short" } }, "pluginVersion": "9.0.6", @@ -1656,6 +1667,17 @@ data: { "datasource": "${datasource}", "description": "Number of spans successfully sent to destination.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, "gridPos": { "h": 10, "w": 8, @@ -1677,6 +1699,17 @@ data: { "datasource": "${datasource}", "description": "Number of spans in failed attempts to send to destination.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, "gridPos": { "h": 10, "w": 8, @@ -2807,37 +2840,27 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: Cluster is not converging. + message: 'Cluster is not converging: nodes report different number of peers + in the cluster.' expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0 - for: 5m - - alert: ClusterSplitBrain + for: 10m + - alert: ClusterNodeCountMismatch annotations: - message: Cluster nodes have entered a split brain state. + message: Nodes report different number of peers vs. the count of observed + agent metrics. Some agent metrics may be missing or the cluster is in a + split brain state. expr: | sum without (state) (cluster_node_peers) != on (cluster, namespace) group_left count by (cluster, namespace) (cluster_node_info) - for: 5m - - alert: ClusterLamportClockDrift - annotations: - message: Cluster nodes' lamport clocks are not converging. - expr: stddev by (cluster, namespace) (cluster_node_lamport_time) > 4 * sqrt(count - by (cluster, namespace) (cluster_node_info)) - for: 5m + for: 15m - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a health score > 0. + message: Cluster node is reporting a gossip protocol health score > 0. expr: | cluster_node_gossip_health_score > 0 - for: 5m - - alert: ClusterLamportClockStuck - annotations: - message: Cluster nodes's lamport clocks is not progressing. 
- expr: | - sum by (cluster, namespace, instance) (rate(cluster_node_lamport_time[2m])) == 0 - and on (cluster, namespace, instance) (cluster_node_peers > 1) - for: 5m + for: 10m - alert: ClusterNodeNameConflict annotations: message: A node tried to join the cluster with a name conflicting with an @@ -2850,7 +2873,7 @@ spec: message: Cluster node stuck in Terminating state. expr: sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0 - for: 5m + for: 10m - alert: ClusterConfigurationDrift annotations: message: Cluster nodes are not using the same configuration file. @@ -2870,6 +2893,6 @@ spec: - alert: UnhealthyComponents annotations: message: Unhealthy Flow components detected. - expr: sum(agent_component_controller_running_components{health_type!="healthy"}) + expr: sum by (cluster, namespace) (agent_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m diff --git a/monitoring-mixins/agent-flow-mixin/deploy/prometheus-alerts.yaml b/monitoring-mixins/agent-flow-mixin/deploy/prometheus-alerts.yaml index 9eacc644..5d9facd0 100644 --- a/monitoring-mixins/agent-flow-mixin/deploy/prometheus-alerts.yaml +++ b/monitoring-mixins/agent-flow-mixin/deploy/prometheus-alerts.yaml @@ -10,35 +10,23 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: Cluster is not converging. + message: 'Cluster is not converging: nodes report different number of peers in the cluster.' expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0 - for: 5m - - alert: ClusterSplitBrain + for: 10m + - alert: ClusterNodeCountMismatch annotations: - message: Cluster nodes have entered a split brain state. + message: Nodes report different number of peers vs. the count of observed agent metrics. Some agent metrics may be missing or the cluster is in a split brain state. expr: | sum without (state) (cluster_node_peers) != on (cluster, namespace) group_left count by (cluster, namespace) (cluster_node_info) - for: 5m - - alert: ClusterLamportClockDrift - annotations: - message: Cluster nodes' lamport clocks are not converging. - expr: stddev by (cluster, namespace) (cluster_node_lamport_time) > 4 * sqrt(count by (cluster, namespace) (cluster_node_info)) - for: 5m + for: 15m - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a health score > 0. + message: Cluster node is reporting a gossip protocol health score > 0. expr: | cluster_node_gossip_health_score > 0 - for: 5m - - alert: ClusterLamportClockStuck - annotations: - message: Cluster nodes's lamport clocks is not progressing. - expr: | - sum by (cluster, namespace, instance) (rate(cluster_node_lamport_time[2m])) == 0 - and on (cluster, namespace, instance) (cluster_node_peers > 1) - for: 5m + for: 10m - alert: ClusterNodeNameConflict annotations: message: A node tried to join the cluster with a name conflicting with an existing peer. @@ -48,7 +36,7 @@ spec: annotations: message: Cluster node stuck in Terminating state. expr: sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0 - for: 5m + for: 10m - alert: ClusterConfigurationDrift annotations: message: Cluster nodes are not using the same configuration file. @@ -67,5 +55,5 @@ spec: - alert: UnhealthyComponents annotations: message: Unhealthy Flow components detected. 
- expr: sum(agent_component_controller_running_components{health_type!="healthy"}) > 0 + expr: sum by (cluster, namespace) (agent_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m diff --git a/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json b/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json index 39d31ba3..23da32df 100644 --- a/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "operations/agent-flow-mixin" } }, - "version": "c1793d12ef9a2bdcfcb6623187b35c12c2ace016", - "sum": "tFWKQ0tGxdEFMy/eb1in274ad4m/q+a2HrA9+WYtDqs=" + "version": "88780b8ee7ee74234ca5d86535a3ab1d553c098e", + "sum": "h7vP3f75DAvZzm85jh/R7n5jyC88MPuc4aV/o7A1/t4=" } ], "legacyImports": false diff --git a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts/clustering.libsonnet b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts/clustering.libsonnet index b4d5edc9..5e2ad3c0 100644 --- a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts/clustering.libsonnet +++ b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts/clustering.libsonnet @@ -7,23 +7,22 @@ alert.newGroup( alert.newRule( 'ClusterNotConverging', 'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0', - 'Cluster is not converging.', + 'Cluster is not converging: nodes report different number of peers in the cluster.', '10m', ), - // Cluster has entered a split brain state. alert.newRule( - 'ClusterSplitBrain', - // Assert that the set of known peers (regardless of state) for an - // agent matches the same number of running agents in the same cluster - // and namespace. + 'ClusterNodeCountMismatch', + // Assert that the number of known peers (regardless of state) reported by each + // agent matches the number of running agents in the same cluster + // and namespace as reported by a count of Prometheus metrics. ||| sum without (state) (cluster_node_peers) != on (cluster, namespace) group_left count by (cluster, namespace) (cluster_node_info) |||, - 'Cluster nodes have entered a split brain state.', - '10m', + 'Nodes report different number of peers vs. the count of observed agent metrics. Some agent metrics may be missing or the cluster is in a split brain state.', + '15m', ), // Nodes health score is not zero. @@ -32,7 +31,7 @@ alert.newGroup( ||| cluster_node_gossip_health_score > 0 |||, - 'Cluster node is reporting a health score > 0.', + 'Cluster node is reporting a gossip protocol health score > 0.', '10m', ), diff --git a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet index ac6125d5..ec059de9 100644 --- a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet +++ b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet @@ -276,7 +276,7 @@ local filename = 'agent-flow-controller.json'; // // This panel supports both native and classic histograms, though it only shows one at a time. 
( - panel.newNativeHistogramHeatmap('Component evaluation histogram') + + panel.newNativeHistogramHeatmap('Component evaluation histogram', 's') + panel.withDescription(||| Detailed histogram view of how long component evaluations take. @@ -301,7 +301,7 @@ local filename = 'agent-flow-controller.json'; // // This panel supports both native and classic histograms, though it only shows one at a time. ( - panel.newNativeHistogramHeatmap('Component dependency wait histogram') + + panel.newNativeHistogramHeatmap('Component dependency wait histogram', 's') + panel.withDescription(||| Detailed histogram of how long components wait to be evaluated after their dependency is updated. diff --git a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/utils/panel.jsonnet b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/utils/panel.jsonnet index a59e6a4b..9ebf39f3 100644 --- a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/utils/panel.jsonnet +++ b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/utils/panel.jsonnet @@ -30,7 +30,7 @@ }, }, - newHeatmap(title=''):: $.new(title, 'heatmap') { + newHeatmap(title='', unit=''):: $.new(title, 'heatmap') { maxDataPoints: 30, options: { calculate: false, @@ -53,13 +53,13 @@ yHistogram: true, }, yAxis: { - unit: 's', + unit: unit, }, }, pluginVersion: '9.0.6', }, - newNativeHistogramHeatmap(title=''):: $.newHeatmap(title) { + newNativeHistogramHeatmap(title='', unit=''):: $.newHeatmap(title, unit) { options+: { cellGap: 0, color: { diff --git a/monitoring-mixins/loki-mixin/jsonnetfile.lock.json b/monitoring-mixins/loki-mixin/jsonnetfile.lock.json index ab33bffd..2b21e1f2 100644 --- a/monitoring-mixins/loki-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/loki-mixin/jsonnetfile.lock.json @@ -18,8 +18,8 @@ "subdir": "grafana-builder" } }, - "version": "3f2b0ee0b6853c12d19cf059c590390f6948929b", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "mixin-utils" } }, - "version": "3f2b0ee0b6853c12d19cf059c590390f6948929b", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" }, { "source": { @@ -38,7 +38,7 @@ "subdir": "production/loki-mixin" } }, - "version": "67416270f299a38b54c9ed16f3cec2bfee84b5b7", + "version": "0b7ff4817545dd6326042e7e6d31a95681aa1cdd", "sum": "9eo8rVcHcS8nvT1Bib4Z0LQxAO9Hrbjc4mC2rA54P2I=" }, { @@ -48,8 +48,8 @@ "subdir": "operations/mimir-mixin" } }, - "version": "6794cb4a1c48ec6cd272fc9a8559b1c033034865", - "sum": "dgNk0zx57kRIPKynxUiSFMWemw6sHP7/c0Sg33lVoWE=" + "version": "cd13a1b0509f877f3d84e3de5c884680563e6ab3", + "sum": "eNgijCGmvYx1X4yXMLBsdo7jo3HbDknbzSYHGt/I/MY=" }, { "source": { @@ -58,7 +58,7 @@ "subdir": "jsonnet/kube-prometheus/lib" } }, - "version": "942b7f5a7b69a8842fe2c4a1c364a2d515fb1d68", + "version": "76f2e1ef95be0df752037baa040781c5219e1fb3", "sum": "QKRgrgEZ3k9nLmLCrDBaeIGVqQZf+AvZTcnhdLk3TrA=" } ], diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet index 
0bd0b339..cc43f483 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet @@ -1,3 +1,5 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { dashboard(title, uid='', datasource='default', datasource_regex=''):: { // Stuff that isn't materialised. @@ -70,6 +72,40 @@ }, }, + addShowNativeLatencyVariable():: self { + templating+: { + list+: [{ + current: { + selected: true, + text: 'classic', + value: '1', + }, + description: 'Choose between showing latencies based on low precision classic or high precision native histogram metrics.', + hide: 0, + includeAll: false, + label: 'Latency metrics', + multi: false, + name: 'latency_metrics', + query: 'native : -1,classic : 1', + options: [ + { + selected: false, + text: 'native', + value: '-1', + }, + { + selected: true, + text: 'classic', + value: '1', + }, + ], + skipUrlSync: false, + type: 'custom', + useTags: false, + }], + }, + }, + dashboardLinkUrl(title, url):: self { links+: [ { diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet index 622598f7..d669aa55 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet @@ -1,6 +1,93 @@ local g = import 'grafana-builder/grafana.libsonnet'; { + // The classicNativeHistogramQuantile function is used to calculate histogram quantiles from native histograms or classic histograms. + // Metric name should be provided without _bucket suffix. + nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier=''):: + local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; + local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(rate(%(metric)s_bucket{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + classicSumBy: classicSumBy, + metric: metric, + multiplierStr: multiplierStr, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(rate(%(metric)s{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + metric: metric, + multiplierStr: multiplierStr, + nativeSumBy: nativeSumBy, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // The classicNativeHistogramSumRate function is used to calculate the histogram sum of rate from native histograms or classic histograms. + // Metric name should be provided without _sum suffix. 
+ nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_sum{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_sum(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + + // The classicNativeHistogramCountRate function is used to calculate the histogram count of rate from native histograms or classic histograms. + // Metric name should be provided without _count suffix. + nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_count{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // TODO(krajorama) Switch to histogram_avg function for native histograms later. + nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier=''):: + local multiplierStr = if multiplier == '' then '' else '%s * ' % multiplier; + { + classic: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).classic, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).classic, + multiplier: multiplierStr, + }, + native: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).native, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).native, + multiplier: multiplierStr, + }, + }, + + // showClassicHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the classic query + // to dashboard variable which should take -1 or +1 as values in order to hide or show the classic query. + showClassicHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * +Inf)' % [query.classic, dashboard_variable], + // showNativeHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the native query + // to dashboard variable which should take -1 or +1 as values in order to show or hide the native query. 
+ showNativeHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * -Inf)' % [query.native, dashboard_variable], + histogramRules(metric, labels, interval='1m', record_native=false):: local vars = { metric: metric, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet index 10f936e8..03fe7d4c 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet @@ -6,5 +6,6 @@ (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') + (import 'alerts/autoscaling.libsonnet') + + (import 'alerts/ingest-storage.libsonnet') + (import 'alerts/continuous-test.libsonnet'), } diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet index 20ae34a1..06c92bb7 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet @@ -6,8 +6,8 @@ { alert: $.alertName('AlertmanagerSyncConfigsFailing'), expr: ||| - rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 - |||, + rate(cortex_alertmanager_sync_configs_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), 'for': '30m', labels: { severity: 'critical', @@ -21,8 +21,8 @@ { alert: $.alertName('AlertmanagerRingCheckFailing'), expr: ||| - rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 - |||, + rate(cortex_alertmanager_ring_check_errors_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -36,8 +36,8 @@ { alert: $.alertName('AlertmanagerPartialStateMergeFailing'), expr: ||| - rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 - |||, + rate(cortex_alertmanager_partial_state_merges_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -51,8 +51,8 @@ { alert: $.alertName('AlertmanagerReplicationFailing'), expr: ||| - rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 - |||, + rate(cortex_alertmanager_state_replication_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -66,8 +66,8 @@ { alert: $.alertName('AlertmanagerPersistStateFailing'), expr: ||| - rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 - |||, + rate(cortex_alertmanager_state_persist_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(15), 'for': '1h', labels: { severity: 'critical', @@ -81,8 +81,8 @@ { alert: $.alertName('AlertmanagerInitialSyncFailed'), expr: ||| - increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 - |||, + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[%s]) > 0 + ||| % $.alertRangeInterval(1), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet index b550937d..c586c6dc 
100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet @@ -54,4 +54,6 @@ for group in groups ], + alertRangeInterval(multiple):: + ($._config.base_alerts_range_interval_minutes * multiple) + 'm', } diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet index efe1f119..ca67c94b 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet @@ -34,14 +34,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Note if alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. expr: ||| - 100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[1m])) + 100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[%(range_interval)s])) / - sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[1m])) + sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s])) > 1 ||| % { group_by: $._config.alert_aggregation_labels, job_label: $._config.per_job_label, excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes), + range_interval: $.alertRangeInterval(1), }, 'for': '15m', labels: { @@ -81,10 +82,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('QueriesIncorrect'), expr: ||| - 100 * sum by (%s) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + 100 * sum by (%(group_by)s) (rate(test_exporter_test_case_result_total{result="fail"}[%(range_interval)s])) / - sum by (%s) (rate(test_exporter_test_case_result_total[5m])) > 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + sum by (%(group_by)s) (rate(test_exporter_test_case_result_total[%(range_interval)s])) > 1 + ||| % { + group_by: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(5), + }, 'for': '15m', labels: { severity: 'warning', @@ -130,8 +134,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('FrontendQueriesStuck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 - ||| % $._config, + sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_frontend_queue_length[%(range_interval)s])) > 0 + ||| % { + group_by: $._config.alert_aggregation_labels, + job_label: $._config.per_job_label, + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', // We don't want to block for longer. 
labels: { severity: 'critical', @@ -145,8 +153,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('SchedulerQueriesStuck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 - ||| % $._config, + sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_scheduler_queue_length[%(range_interval)s])) > 0 + ||| % { + group_by: $._config.alert_aggregation_labels, + job_label: $._config.per_job_label, + range_interval: $.alertRangeInterval(1), + }, 'for': '7m', // We don't want to block for longer. labels: { severity: 'critical', @@ -161,19 +173,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('CacheRequestErrors'), expr: ||| ( - sum by(%s, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) + sum by(%(group_by)s, name, operation) ( + rate(thanos_memcached_operation_failures_total[%(range_interval)s]) or - rate(thanos_cache_operation_failures_total[1m]) + rate(thanos_cache_operation_failures_total[%(range_interval)s]) ) / - sum by(%s, name, operation) ( - rate(thanos_memcached_operations_total[1m]) + sum by(%(group_by)s, name, operation) ( + rate(thanos_memcached_operations_total[%(range_interval)s]) or - rate(thanos_cache_operations_total[1m]) + rate(thanos_cache_operations_total[%(range_interval)s]) ) ) * 100 > 5 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % { + group_by: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'warning', @@ -215,13 +230,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('KVStoreFailure'), expr: ||| ( - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[%(range_interval)s])) / - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[%(range_interval)s])) ) # We want to get alerted only in case there's a constant failure. 
== 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -316,9 +333,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('StoreGatewayTooManyFailedOperations'), 'for': '5m', expr: ||| - sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 + sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[%(range_interval)s])) > 0 ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(1), }, labels: { severity: 'warning', @@ -502,7 +520,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; %(kube_statefulset_status_replicas_updated)s ) ) and ( - changes(%(kube_statefulset_status_replicas_updated)s[15m:1m]) + changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s]) == 0 ) @@ -513,6 +531,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'), kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'), kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), }, 'for': '30m', labels: { @@ -533,7 +552,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; != %(kube_deployment_status_replicas_updated)s ) and ( - changes(%(kube_deployment_status_replicas_updated)s[15m:1m]) + changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s]) == 0 ) @@ -542,6 +561,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; aggregation_labels: $._config.alert_aggregation_labels, kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'), kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), }, 'for': '30m', labels: { @@ -619,11 +639,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerTooManyFailedPushes'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -638,11 +660,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerTooManyFailedQueries'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[%(range_interval)s])) ) > 1 - ||| % 
$._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -657,11 +681,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerMissedEvaluations'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'warning', @@ -675,9 +701,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('RulerFailedRingCheck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[%(range_interval)s])) > 0 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -692,11 +720,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerRemoteEvaluationFailing'), expr: ||| 100 * ( - sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %s}[5m])) + sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %(job_regex)s}[%(range_interval)s])) / - sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %s}[5m])) + sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %(job_regex)s}[%(range_interval)s])) ) > 1 - ||| % [$._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend), $._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend)], + ||| % { + alert_aggregation_labels: $._config.alert_aggregation_labels, + job_regex: $.jobMatcher($._config.job_names.ruler_query_frontend), + range_interval: $.alertRangeInterval(5), + }, 'for': '5m', labels: { severity: 'warning', diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet index 31c9ea1e..28a4028a 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet @@ -40,13 +40,14 @@ expr: ||| ( # Find KEDA scalers reporting errors. - label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + label_replace(rate(keda_scaler_errors[%(range_interval)s]), "namespace", "$1", "exported_namespace", "(.*)") # Match only Mimir namespaces. 
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) ) > 0 ||| % { aggregation_labels: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(5), }, labels: { severity: 'critical', diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet index 9cfc25f3..8f6ed51d 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet @@ -79,8 +79,8 @@ alert: $.alertName('IngesterTSDBHeadCompactionFailed'), 'for': '15m', expr: ||| - rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_compactions_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -91,8 +91,8 @@ { alert: $.alertName('IngesterTSDBHeadTruncationFailed'), expr: ||| - rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_head_truncations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -103,8 +103,8 @@ { alert: $.alertName('IngesterTSDBCheckpointCreationFailed'), expr: ||| - rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -115,8 +115,8 @@ { alert: $.alertName('IngesterTSDBCheckpointDeletionFailed'), expr: ||| - rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -127,8 +127,8 @@ { alert: $.alertName('IngesterTSDBWALTruncationFailed'), expr: ||| - rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_wal_truncations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'warning', }, @@ -140,11 +140,13 @@ alert: $.alertName('IngesterTSDBWALCorrupted'), expr: ||| # alert when there are more than one corruptions - count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 + count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[%(range_interval)s]) > 0) > 1 and # and there is only one zone count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'critical', deployment: 'single-zone', @@ -157,11 +159,13 @@ alert: $.alertName('IngesterTSDBWALCorrupted'), expr: ||| # alert when there are more than one corruptions - count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 + count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[%(range_interval)s]) > 0)) > 1 and # and there are multiple zones count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) 
(cortex_ingester_tsdb_wal_corruptions_total)) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'critical', deployment: 'multi-zone', @@ -174,8 +178,8 @@ alert: $.alertName('IngesterTSDBWALWritesFailed'), 'for': '3m', expr: ||| - rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 - |||, + rate(cortex_ingester_tsdb_wal_writes_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(1), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet index e9b4c8ce..6206ff6d 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet @@ -114,8 +114,8 @@ alert: $.alertName('CompactorSkippedUnhealthyBlocks'), 'for': '1m', expr: ||| - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 - |||, + increase(cortex_compactor_blocks_marked_for_no_compaction_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'warning', }, @@ -129,8 +129,8 @@ alert: $.alertName('CompactorSkippedUnhealthyBlocks'), 'for': '30m', expr: ||| - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 - |||, + increase(cortex_compactor_blocks_marked_for_no_compaction_total[%s]) > 1 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet index 1313ee46..264f0e96 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet @@ -9,8 +9,10 @@ alert: $.alertName('ContinuousTestNotRunningOnWrites'), 'for': '1h', expr: ||| - sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 - ||| % $._config, + sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_writes_failed_total[%(range_interval)s])) > 0 + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'warning', }, @@ -24,8 +26,10 @@ alert: $.alertName('ContinuousTestNotRunningOnReads'), 'for': '1h', expr: ||| - sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 - ||| % $._config, + sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_queries_failed_total[%(range_interval)s])) > 0 + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'warning', }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet index f594bba5..d40c3a23 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet @@ -649,6 +649,9 @@ 'debug_pprof', ], + // All 
query methods from IngesterServer interface. Basically everything except Push. + ingester_read_path_routes_regex: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', + // The default datasource used for dashboards. dashboard_datasource: 'default', datasource_regex: '', @@ -658,6 +661,10 @@ // Set to four times the scrape interval to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ recording_rules_range_interval: '1m', + // Used to calculate range interval in alerts with default range selector under 10 minutes. + // Needed to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ + base_alerts_range_interval_minutes: 1, + // Used to inject rows into dashboards at specific places that support it. injectRows: {}, @@ -671,5 +678,11 @@ // Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter is // not used (default), then rejected requests are already counted as failures. show_rejected_requests_on_writes_dashboard: false, + + // Show panels that use queries for gRPC-based ingestion (distributor -> ingester) + show_grpc_ingestion_panels: true, + + // Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters) + show_ingest_storage_panels: false, }, } diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index ca75f19c..5ebf1bf8 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -74,18 +74,54 @@ query_frontend: { readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, - instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, - rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, - labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, - labelValuesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_label_name_values"}[$__rate_interval]))' % variables, - seriesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_series"}[$__rate_interval]))' % variables, - remoteReadQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_read"}[$__rate_interval]))' % variables, - metadataQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_metadata"}[$__rate_interval]))' % variables, - 
exemplarsQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_exemplars"}[$__rate_interval]))' % variables, - activeSeriesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_active_series"}[$__rate_interval])) > 0' % variables, - labelNamesCardinalityQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_label_names"}[$__rate_interval])) > 0' % variables, - labelValuesCardinalityQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_label_values"}[$__rate_interval])) > 0' % variables, - otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)"}[$__rate_interval]))' % variables, + // These query routes are used in the overview and other dashboard, everythign else is considered "other" queries. + // Has to be a list to keep the same colors as before, see overridesNonErrorColorsPalette. + local overviewRoutes = [ + { name: 'instantQuery', displayName: 'instant queries', route: '/api/v1/query', routeLabel: '_api_v1_query' }, + { name: 'rangeQuery', displayName: 'range queries', route: '/api/v1/query_range', routeLabel: '_api_v1_query_range' }, + { name: 'labelNames', displayName: '"label names" queries', route: '/api/v1/labels', routeLabel: '_api_v1_labels' }, + { name: 'labelValues', displayName: '"label values" queries', route: '/api/v1/label_name_values', routeLabel: '_api_v1_label_name_values' }, + { name: 'series', displayName: 'series queries', route: '/api/v1/series', routeLabel: '_api_v1_series' }, + { name: 'remoteRead', displayName: 'remote read queries', route: '/api/v1/read', routeLabel: '_api_v1_read' }, + { name: 'metadata', displayName: 'metadata queries', route: '/api/v1/metadata', routeLabel: '_api_v1_metadata' }, + { name: 'exemplars', displayName: 'exemplar queries', route: '/api/v1/query_exemplars', routeLabel: '_api_v1_query_exemplars' }, + { name: 'activeSeries', displayName: '"active series" queries', route: '/api/v1/cardinality_active_series', routeLabel: '_api_v1_cardinality_active_series' }, + { name: 'labelNamesCardinality', displayName: '"label name cardinality" queries', route: '/api/v1/cardinality_label_names', routeLabel: '_api_v1_cardinality_label_names' }, + { name: 'labelValuesCardinality', displayName: '"label value cardinality" queries', route: '/api/v1/cardinality_label_values', routeLabel: '_api_v1_cardinality_label_values' }, + ], + local overviewRoutesRegex = '(prometheus|api_prom)(%s)' % std.join('|', [r.routeLabel for r in overviewRoutes]), + overviewRoutesOverrides: [ + { + matcher: { + id: 'byRegexp', + // To distinguish between query and query_range, we need to match the route with a negative lookahead. 
+ options: '/.*%s($|[^_])/' % r.routeLabel, + }, + properties: [ + { + id: 'displayName', + value: r.displayName, + }, + ], + } + for r in overviewRoutes + ], + overviewRoutesPerSecond: 'sum by (route) (rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"%(overviewRoutesRegex)s"}[$__rate_interval]))' % (variables { overviewRoutesRegex: overviewRoutesRegex }), + nonOverviewRoutesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~"%(overviewRoutesRegex)s"}[$__rate_interval]))' % (variables { overviewRoutesRegex: overviewRoutesRegex }), + + local queryPerSecond(name) = 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)%(route)s"}[$__rate_interval]))' % + (variables { route: std.filter(function(r) r.name == name, overviewRoutes)[0].routeLabel }), + instantQueriesPerSecond: queryPerSecond('instantQuery'), + rangeQueriesPerSecond: queryPerSecond('rangeQuery'), + labelNamesQueriesPerSecond: queryPerSecond('labelNames'), + labelValuesQueriesPerSecond: queryPerSecond('labelValues'), + seriesQueriesPerSecond: queryPerSecond('series'), + remoteReadQueriesPerSecond: queryPerSecond('remoteRead'), + metadataQueriesPerSecond: queryPerSecond('metadata'), + exemplarsQueriesPerSecond: queryPerSecond('exemplars'), + activeSeriesQueriesPerSecond: queryPerSecond('activeSeries'), + labelNamesCardinalityQueriesPerSecond: queryPerSecond('labelNamesCardinality'), + labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'), // Read failures rate as percentage of total requests. readFailuresRate: ||| diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index a269ccc9..47a347cb 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1,22 +1,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'grafana-builder/grafana.libsonnet') { - local resourceRequestColor = '#FFC000', - local resourceLimitColor = '#E02F44', - local successColor = '#7EB26D', - local warningColor = '#EAB839', - local errorColor = '#E24D42', + _colors:: { + resourceRequest: '#FFC000', + resourceLimit: '#E02F44', + success: '#7EB26D', + clientError: '#EF843C', + warning: '#EAB839', + failed: '#E24D42', // "error" is reserved word in Jsonnet. + }, // Colors palette picked from Grafana UI, excluding red-ish colors which we want to keep reserved for errors / failures. 
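// Illustrative sketch (not part of the diff), assuming plain Jsonnet: why the shared
// colors move into a hidden '_colors::' field instead of file-local variables. Hidden
// fields can be reached through '$' from any helper in the mixin, but they are dropped
// when the object is manifested, so they never show up in the generated dashboard JSON.
local base = {
  _colors:: { success: '#7EB26D', failed: '#E24D42' },
  examplePanel: {
    // a panel helper would reference the shared palette like this (field names are hypothetical):
    successSeriesColor: $._colors.success,
    failedSeriesColor: $._colors.failed,
  },
};
base
// Evaluating this file prints only 'examplePanel'; '_colors' is omitted from the output.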
- local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE'], + local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE', '#3F6833', '#447EBC', '#967302', '#5794F2'], local resourceRequestStyle = $.overrideFieldByName('request', [ - $.overrideProperty('color', { mode: 'fixed', fixedColor: resourceRequestColor }), + $.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceRequest }), $.overrideProperty('custom.fillOpacity', 0), $.overrideProperty('custom.lineStyle', { fill: 'dash' }), ]), local resourceLimitStyle = $.overrideFieldByName('limit', [ - $.overrideProperty('color', { mode: 'fixed', fixedColor: resourceLimitColor }), + $.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceLimit }), $.overrideProperty('custom.fillOpacity', 0), $.overrideProperty('custom.lineStyle', { fill: 'dash' }), ]), @@ -196,14 +199,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; qpsPanel(selector, statusLabelName='status_code'):: super.qpsPanel(selector, statusLabelName) + $.aliasColors({ - '1xx': warningColor, - '2xx': successColor, + '1xx': $._colors.warning, + '2xx': $._colors.success, '3xx': '#6ED0E0', '4xx': '#EF843C', - '5xx': errorColor, - OK: successColor, - success: successColor, - 'error': errorColor, + '5xx': $._colors.failed, + OK: $._colors.success, + success: $._colors.success, + 'error': $._colors.failed, cancel: '#A9A9A9', }) + { fieldConfig+: { @@ -260,15 +263,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Set the failure color only if there's just 1 legend and it doesn't contain any placeholder. $.aliasColors( if (std.type(legends) == 'string' && std.length(std.findSubstr('{', legends[0])) == 0) then { - [legends]: errorColor, + [legends]: $._colors.failed, } else {} ), successFailurePanel(successMetric, failureMetric):: $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.aliasColors({ - successful: successColor, - failed: errorColor, + successful: $._colors.success, + failed: $._colors.failed, }), // successFailureCustomPanel is like successFailurePanel() but allows to customize the legends @@ -277,8 +280,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; successFailureCustomPanel(queries, legends):: $.queryPanel(queries, legends) + $.aliasColors({ - [legends[0]]: successColor, - [legends[1]]: errorColor, + [legends[0]]: $._colors.success, + [legends[1]]: $._colors.failed, }), // Displays started, completed and failed rate. @@ -288,8 +291,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack + $.aliasColors({ started: '#34CCEB', - completed: successColor, - failed: errorColor, + completed: $._colors.success, + failed: $._colors.failed, }), resourceUtilizationAndLimitLegend(resourceName):: @@ -993,9 +996,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; thresholds: { mode: 'absolute', steps: [ - { color: successColor, value: null }, - { color: warningColor, value: 0.01 }, // 1% - { color: errorColor, value: 0.05 }, // 5% + { color: $._colors.success, value: null }, + { color: $._colors.warning, value: 0.01 }, // 1% + { color: $._colors.failed, value: 0.05 }, // 5% ], }, }, @@ -1343,6 +1346,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), legends)), }, + overridesNonErrorColorsPalette(overrides):: std.mapWithIndex(function(idx, override) ( + // Do not define an override if we exhausted the colors in the palette.
+ // Grafana will automatically choose another color. + if idx >= std.length(nonErrorColorsPalette) then override else + { + matcher: override.matcher, + properties: override.properties + [ + { + id: 'color', + value: { + fixedColor: nonErrorColorsPalette[idx], + mode: 'fixed', + }, + }, + ], + } + ), overrides), + // Panel query override functions overrideField(matcherId, options, overrideProperties):: { matcher: { diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet index 5b754d73..bbd038ca 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet @@ -175,42 +175,29 @@ local filename = 'mimir-overview.json'; ) ) .addPanel( - local legends = [ - 'instant queries', - 'range queries', - '"label names" queries', - '"label values" queries', - 'series queries', - 'remote read queries', - 'metadata queries', - 'exemplar queries', - '"active series" queries', - '"label name cardinality" queries', - '"label value cardinality" queries', - 'other', - ]; - $.timeseriesPanel('Queries / sec') + - $.queryPanel( - [ - $.queries.query_frontend.instantQueriesPerSecond, - $.queries.query_frontend.rangeQueriesPerSecond, - $.queries.query_frontend.labelNamesQueriesPerSecond, - $.queries.query_frontend.labelValuesQueriesPerSecond, - $.queries.query_frontend.seriesQueriesPerSecond, - $.queries.query_frontend.remoteReadQueriesPerSecond, - $.queries.query_frontend.metadataQueriesPerSecond, - $.queries.query_frontend.exemplarsQueriesPerSecond, - $.queries.query_frontend.activeSeriesQueriesPerSecond, - $.queries.query_frontend.labelNamesCardinalityQueriesPerSecond, - $.queries.query_frontend.labelValuesCardinalityQueriesPerSecond, - $.queries.query_frontend.otherQueriesPerSecond, + { + targets: [ + { + expr: $.queries.query_frontend.overviewRoutesPerSecond, + format: 'time_series', + legendLink: null, + }, + { + expr: $.queries.query_frontend.nonOverviewRoutesPerSecond, + format: 'time_series', + legendFormat: 'other', + legendLink: null, + }, ], - legends, - ) + - $.panelSeriesNonErrorColorsPalette(legends) + - $.stack + - { fieldConfig+: { defaults+: { unit: 'reqps' } } }, + } + + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + overrides+: $.overridesNonErrorColorsPalette($.queries.query_frontend.overviewRoutesOverrides), + }, + } + + $.stack ) ) diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet index 129a1d92..93d605b0 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet @@ -167,6 +167,157 @@ local filename = 'mimir-queries.json'; { fieldConfig+: { defaults+: { unit: 'short' } } }, ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage: strong consistency)')) + .addPanel( + $.timeseriesPanel('Requests with strong read consistency / sec') + + $.panelDescription( + 'Requests with strong read consistency / sec', + ||| + 
Shows rate of requests with strong read consistency, and rate of failed requests with strong read consistency. + ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Requests with strong read consistency ratio') + + $.panelDescription( + 'Requests with strong read consistency ratio', + ||| + Ratio between requests with strong read consistency and all read requests on ingesters. + ||| + ) + + $.queryPanel( + [ + ||| + ( + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ) + / + sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + / + sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], + ], + ['successful', 'failed'], + ) + + $.aliasColors({ failed: $._colors.failed, successful: $._colors.success }) + + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + + $.stack + ) + .addPanel( + $.timeseriesPanel('Strong read consistency queries — wait latency') + + $.panelDescription( + 'Strong read consistency queries — wait latency', + ||| + How long does the request wait to guarantee strong read consistency. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage: last produced offset)')) + .addPanel( + $.timeseriesPanel('Last produced offset requests / sec') + + $.panelDescription( + 'Rate of requests to fetch last produced offset for partition', + ||| + Shows rate of requests to fetch last produced offset for partition, and rate of failed requests. + ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Last produced offset latency') + + $.panelDescription( + 'Latency', + ||| + How long does it take to fetch "last produced offset" of partition. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) .addRow( $.row('Querier') .addPanel( diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet index dd149a93..23c4d4de 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet @@ -299,16 +299,16 @@ local filename = 'mimir-reads.json'; $.row('Ingester') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"%s"}' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex]) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', $._config.ingester_read_path_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], '' ) ) ) diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet index b133f01f..b1ed99ad 100644 --- 
a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet @@ -7,7 +7,7 @@ local filename = 'mimir-slow-queries.json'; ($.dashboard('Slow queries') + { uid: std.md5(filename) }) .addClusterSelectorTemplates(false) .addRow( - $.row('Accross tenants') + $.row('Across tenants') .addPanel( $.timeseriesPanel('Response time') + $.lokiMetricsQueryPanel( diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet index 4359ee3d..96eab666 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet @@ -58,8 +58,10 @@ local filename = 'mimir-top-tenants.json'; distributor: $.jobMatcher($._config.job_names.distributor), group_by_cluster: $._config.group_by_cluster, }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -72,8 +74,10 @@ local filename = 'mimir-top-tenants.json'; $.tablePanel( [ 'topk($limit, %(in_memory_series_per_user)s)' % { in_memory_series_per_user: in_memory_series_per_user_query() }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -107,8 +111,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_distributor_received_samples_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'samples/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'samples/s' }, + } ) ), ) @@ -143,8 +149,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_discarded_samples_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.ingester + $._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'samples/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'samples/s' }, + } ) ), ) @@ -190,8 +198,10 @@ local filename = 'mimir-top-tenants.json'; distributor: $.jobMatcher($._config.job_names.distributor), group_by_cluster: $._config.group_by_cluster, }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -205,8 +215,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_distributor_received_exemplars_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'exemplars/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'exemplars/s' }, + } ) ), ) @@ -221,8 +233,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{%(job)s}))' % { job: $.jobMatcher($._config.job_names.ruler) }, - ], - { 'Value #A': { alias: 'rules' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'rules' }, + } ) ), ) @@ -236,8 +250,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (rule_group, user) 
(cortex_prometheus_rule_group_last_duration_seconds{%(job)s}))' % { job: $.jobMatcher($._config.job_names.ruler) }, - ], - { 'Value #A': { alias: 'seconds' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'seconds' }, + } ) ) ) @@ -256,8 +272,10 @@ local filename = 'mimir-top-tenants.json'; (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{%s}[$__rate_interval])) == 0) ) ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], - ], - { Value: { alias: 'Compaction Jobs', decimals: 0 } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'Compaction Jobs', decimals: 0 }, + } ) ), ), diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet index 9d80feb0..61205581 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet @@ -3,6 +3,7 @@ local filename = 'mimir-writes.json'; (import 'dashboard-utils.libsonnet') + (import 'dashboard-queries.libsonnet') { + [filename]: assert std.md5(filename) == '8280707b8f16e7b87b840fc1cc92d4c5' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Writes') + { uid: std.md5(filename) }) @@ -162,10 +163,39 @@ local filename = 'mimir-writes.json'; 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], '' ) ) + .addPanelIf( + $._config.show_ingest_storage_panels, + $.timeseriesPanel('Sync write to Kafka latency (ingest storage)') + + $.panelDescription( + 'Sync write to Kafka latency (ingest storage)', + ||| + Latency of synchronous write operation used to store data into Kafka. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) ) .addRowsIf(std.objectHasAll($._config.injectRows, 'postDistributor'), $._config.injectRows.postDistributor($)) - .addRow( - $.row('Ingester') + .addRowIf( + $._config.show_grpc_ingestion_panels, + ($.row('Ingester')) .addPanel( $.timeseriesPanel('Requests / sec') + $.panelDescription( @@ -206,6 +236,202 @@ local filename = 'mimir-writes.json'; ) ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage)')) + .addPanel( + $.timeseriesPanel('Kafka fetches / sec') + + $.panelDescription( + 'Kafka fetches / sec', + ||| + Rate of fetches received from Kafka brokers. A fetch can contain multiple records (a write request received on the write path is mapped into a single record). + Read errors are any errors reported on connection to Kafka brokers, and are separate from "failed" fetches. + ||| + ) + + $.queryPanel( + [ + ||| + sum (rate (cortex_ingest_storage_reader_fetches_total{%s}[$__rate_interval])) + - + sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + // cortex_ingest_storage_reader_read_errors_total metric is reported by Kafka client. + 'sum (rate (cortex_ingest_storage_reader_read_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + 'read errors', + ], + ) + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed, 'read errors': $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Kafka records / sec') + + $.panelDescription( + 'Kafka records / sec', + ||| + Rate of processed records from Kafka. Failed records are categorized as "client" errors (e.g. per-tenant limits) or server errors. 
+ ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_records_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s, cause="client"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s, cause="server"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed (client)', + 'failed (server)', + ], + ) + $.aliasColors({ successful: $._colors.success, 'failed (client)': $._colors.clientError, 'failed (server)': $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Kafka record processing latency') + + $.panelDescription( + 'Kafka record processing latency', + ||| + Time used to process a single record (write request). This time is spent by appending data to per-tenant TSDB. + ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage – end-to-end latency)')) + .addPanel( + $.timeseriesPanel('Kafka record end-to-end latency when ingesters are running') + + $.panelDescription( + 'Kafka record end-to-end latency when ingesters are running', + ||| + Time between writing request by distributor to Kafka and reading the record by ingester, when ingesters are running. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + .addPanel( + $.timeseriesPanel('Kafka record end-to-end latency when starting') + + $.panelDescription( + 'Kafka record end-to-end latency when starting', + ||| + Time between writing request by distributor to Kafka and reading the record by ingester during catch-up phase, when ingesters are starting. + If ingesters are not starting and catching up in the selected time range, this panel will be empty. + ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage - last consumed offset)')) + .addPanel( + $.timeseriesPanel('Last consumed offset commits / sec') + + $.panelDescription( + 'Last consumed offset commits / sec', + ||| + Rate of "last consumed offset" commits issued by ingesters to Kafka. + ||| + ) + + $.queryPanel( + [ + ||| + sum (rate (cortex_ingest_storage_reader_offset_commit_requests_total{%s}[$__rate_interval])) + - + sum (rate (cortex_ingest_storage_reader_offset_commit_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_offset_commit_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Last consumed offset commits latency') + + $.panelDescription( + 'Kafka record processing latency', + ||| + Time spent to commit "last consumed offset" by ingesters to Kafka. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) .addRowIf( $._config.gateway_enabled && $._config.autoscaling.gateway.enabled, $.cpuAndMemoryBasedAutoScalingRow('Gateway'), diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json index 2dde1800..3cad418e 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "7561fd330312538d22b00e0c7caecb4ba66321ea", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "7561fd330312538d22b00e0c7caecb4ba66321ea", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" } ], "legacyImports": false diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json index fc0f4f78..842f550f 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-overview.json @@ -763,7 +763,217 @@ }, "unit": "reqps" }, - "overrides": [ ] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "instant queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#429D48", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_range($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "range queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#F1C731", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_labels($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label names\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#2A66CF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_label_name_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label values\" queries" + }, + { + "id": "color", + 
"value": { + "fixedColor": "#9E44C1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "series queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#FFAB57", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_read($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "remote read queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C79424", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_metadata($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "metadata queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#84D586", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_exemplars($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplar queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#A1C4FC", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_active_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"active series\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C788DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_names($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label name cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#3F6833", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label value cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#447EBC", + "mode": "fixed" + } + } + ] + } + ] }, "id": 11, "links": [ ], @@ -776,114 +986,15 @@ "sort": "none" } }, - "seriesOverrides": [ - { - "alias": "instant queries", - "color": "#429D48" - }, - { - "alias": "range queries", - "color": "#F1C731" - }, - { - "alias": "\"label names\" queries", - "color": "#2A66CF" - }, - { - "alias": "\"label values\" queries", - "color": "#9E44C1" - }, - { - "alias": "series queries", - "color": "#FFAB57" - }, - { - "alias": "remote read queries", - "color": "#C79424" - }, - { - "alias": "metadata queries", - "color": "#84D586" - }, - { - "alias": "exemplar queries", - "color": "#A1C4FC" - }, - { - "alias": "\"active series\" queries", - "color": "#C788DE" - } - ], "span": 3, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "instant queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "range queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "\"label names\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "\"label values\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "series queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_read\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "remote read queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_metadata\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "metadata queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_exemplars\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "exemplar queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_active_series\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"active series\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_names\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label name cardinality\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_values\"}[$__rate_interval])) > 0", + "expr": "sum by (route) (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "\"label value cardinality\" queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "other", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json index 6eb15943..53a86d20 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-reads.json @@ -1852,7 +1852,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1901,19 +1901,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by 
(le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1963,7 +1963,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json index 2f20031e..442701c2 100644 --- a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-slow-queries.json @@ -364,7 +364,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Accross tenants", + "title": "Across tenants", "titleSize": "h6" }, { diff --git a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json index 27df82a0..2b0fea40 100644 --- 
a/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json +++ b/monitoring-mixins/mimir-mixin/deploy/dashboards_out/mimir-top-tenants.json @@ -108,11 +108,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -234,11 +249,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -420,11 +450,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -606,11 +651,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -792,11 +852,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -918,11 +993,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -1044,11 +1134,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + 
"colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -1170,11 +1275,26 @@ "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -1301,6 +1421,21 @@ "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, diff --git a/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml index 6eb450f9..ff59f4d0 100644 --- a/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml +++ b/monitoring-mixins/mimir-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -10074,7 +10074,217 @@ data: }, "unit": "reqps" }, - "overrides": [ ] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "instant queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#429D48", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_query_range($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "range queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#F1C731", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_labels($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label names\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#2A66CF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_label_name_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label values\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#9E44C1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "series queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#FFAB57", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_read($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "remote read queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C79424", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_metadata($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "metadata queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#84D586", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": 
"/.*_api_v1_query_exemplars($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "exemplar queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#A1C4FC", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_active_series($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"active series\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#C788DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_names($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label name cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#3F6833", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*_api_v1_cardinality_label_values($|[^_])/" + }, + "properties": [ + { + "id": "displayName", + "value": "\"label value cardinality\" queries" + }, + { + "id": "color", + "value": { + "fixedColor": "#447EBC", + "mode": "fixed" + } + } + ] + } + ] }, "id": 11, "links": [ ], @@ -10087,114 +10297,15 @@ data: "sort": "none" } }, - "seriesOverrides": [ - { - "alias": "instant queries", - "color": "#429D48" - }, - { - "alias": "range queries", - "color": "#F1C731" - }, - { - "alias": "\"label names\" queries", - "color": "#2A66CF" - }, - { - "alias": "\"label values\" queries", - "color": "#9E44C1" - }, - { - "alias": "series queries", - "color": "#FFAB57" - }, - { - "alias": "remote read queries", - "color": "#C79424" - }, - { - "alias": "metadata queries", - "color": "#84D586" - }, - { - "alias": "exemplar queries", - "color": "#A1C4FC" - }, - { - "alias": "\"active series\" queries", - "color": "#C788DE" - } - ], "span": 3, "targets": [ { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "instant queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_range\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "range queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_labels\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "\"label names\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_label_name_values\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "\"label values\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_series\"}[$__rate_interval]))", + "expr": "sum by (route) (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))", "format": "time_series", - "legendFormat": "series queries", "legendLink": null }, { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_read\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "remote read queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_metadata\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "metadata queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_query_exemplars\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "exemplar queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_active_series\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"active series\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_names\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label name cardinality\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=\"prometheus_api_v1_cardinality_label_values\"}[$__rate_interval])) > 0", - "format": "time_series", - "legendFormat": "\"label value cardinality\" queries", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|query-frontend.*|cortex|mimir-read.*))\",route=~\"(prometheus|api_prom)_api_v1_.*\",route!~\"(prometheus|api_prom)(_api_v1_query|_api_v1_query_range|_api_v1_labels|_api_v1_label_name_values|_api_v1_series|_api_v1_read|_api_v1_metadata|_api_v1_query_exemplars|_api_v1_cardinality_active_series|_api_v1_cardinality_label_names|_api_v1_cardinality_label_values)\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "other", "legendLink": null @@ -19188,7 +19299,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\",route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -19237,19 +19348,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"})", + "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", 
route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"})", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -19299,7 +19410,7 @@ data: "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((mimir|ingester.*|cortex|mimir-write.*))\", route=~\"/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null @@ -29006,7 +29117,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Accross tenants", + "title": "Across tenants", "titleSize": "h6" }, { @@ -32898,11 +33009,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33024,11 +33150,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33210,11 +33351,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33396,11 +33552,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + 
"colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33582,11 +33753,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33708,11 +33894,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33834,11 +34035,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -33960,11 +34176,26 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #A", + "pattern": "Value", "thresholds": [ ], "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, @@ -34091,6 +34322,21 @@ data: "type": "number", "unit": "short" }, + { + "alias": "user", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "user", + "thresholds": [ ], + "type": "number", + "unit": "string" + }, { "alias": "", "colorMode": null, diff --git a/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml b/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml index 2a14a39a..245a2829 100644 --- a/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml +++ b/monitoring-mixins/mimir-mixin/deploy/mimir-mixin-alerts.yaml @@ -913,6 +913,86 @@ groups: for: 1h labels: severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to commit the last consumed offset. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is receiving fetch errors when reading records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (10 * 60) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to consume write requests read from Kafka due to internal errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json b/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json index 7223f6cd..8cd458e0 100644 --- a/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "3f2b0ee0b6853c12d19cf059c590390f6948929b", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "3f2b0ee0b6853c12d19cf059c590390f6948929b", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "operations/mimir-mixin" } }, - "version": "6794cb4a1c48ec6cd272fc9a8559b1c033034865", - "sum": "dgNk0zx57kRIPKynxUiSFMWemw6sHP7/c0Sg33lVoWE=" + "version": "cd13a1b0509f877f3d84e3de5c884680563e6ab3", + "sum": "eNgijCGmvYx1X4yXMLBsdo7jo3HbDknbzSYHGt/I/MY=" } ], "legacyImports": false diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet index 0bd0b339..cc43f483 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet @@ -1,3 +1,5 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { dashboard(title, uid='', datasource='default', datasource_regex=''):: { // Stuff that isn't materialised. 
@@ -70,6 +72,40 @@ }, }, + addShowNativeLatencyVariable():: self { + templating+: { + list+: [{ + current: { + selected: true, + text: 'classic', + value: '1', + }, + description: 'Choose between showing latencies based on low precision classic or high precision native histogram metrics.', + hide: 0, + includeAll: false, + label: 'Latency metrics', + multi: false, + name: 'latency_metrics', + query: 'native : -1,classic : 1', + options: [ + { + selected: false, + text: 'native', + value: '-1', + }, + { + selected: true, + text: 'classic', + value: '1', + }, + ], + skipUrlSync: false, + type: 'custom', + useTags: false, + }], + }, + }, + dashboardLinkUrl(title, url):: self { links+: [ { diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet index 622598f7..d669aa55 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet @@ -1,6 +1,93 @@ local g = import 'grafana-builder/grafana.libsonnet'; { + // The classicNativeHistogramQuantile function is used to calculate histogram quantiles from native histograms or classic histograms. + // Metric name should be provided without _bucket suffix. + nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier=''):: + local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; + local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(rate(%(metric)s_bucket{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + classicSumBy: classicSumBy, + metric: metric, + multiplierStr: multiplierStr, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(rate(%(metric)s{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + metric: metric, + multiplierStr: multiplierStr, + nativeSumBy: nativeSumBy, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // The classicNativeHistogramSumRate function is used to calculate the histogram sum of rate from native histograms or classic histograms. + // Metric name should be provided without _sum suffix. + nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_sum{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_sum(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + + // The classicNativeHistogramCountRate function is used to calculate the histogram count of rate from native histograms or classic histograms. + // Metric name should be provided without _count suffix. 
+ nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_count{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // TODO(krajorama) Switch to histogram_avg function for native histograms later. + nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier=''):: + local multiplierStr = if multiplier == '' then '' else '%s * ' % multiplier; + { + classic: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).classic, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).classic, + multiplier: multiplierStr, + }, + native: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).native, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).native, + multiplier: multiplierStr, + }, + }, + + // showClassicHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the classic query + // to dashboard variable which should take -1 or +1 as values in order to hide or show the classic query. + showClassicHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * +Inf)' % [query.classic, dashboard_variable], + // showNativeHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the native query + // to dashboard variable which should take -1 or +1 as values in order to show or hide the native query. 
+ showNativeHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * -Inf)' % [query.native, dashboard_variable], + histogramRules(metric, labels, interval='1m', record_native=false):: local vars = { metric: metric, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet index 10f936e8..03fe7d4c 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet @@ -6,5 +6,6 @@ (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') + (import 'alerts/autoscaling.libsonnet') + + (import 'alerts/ingest-storage.libsonnet') + (import 'alerts/continuous-test.libsonnet'), } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet index 20ae34a1..06c92bb7 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alertmanager.libsonnet @@ -6,8 +6,8 @@ { alert: $.alertName('AlertmanagerSyncConfigsFailing'), expr: ||| - rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 - |||, + rate(cortex_alertmanager_sync_configs_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), 'for': '30m', labels: { severity: 'critical', @@ -21,8 +21,8 @@ { alert: $.alertName('AlertmanagerRingCheckFailing'), expr: ||| - rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 - |||, + rate(cortex_alertmanager_ring_check_errors_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -36,8 +36,8 @@ { alert: $.alertName('AlertmanagerPartialStateMergeFailing'), expr: ||| - rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 - |||, + rate(cortex_alertmanager_partial_state_merges_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -51,8 +51,8 @@ { alert: $.alertName('AlertmanagerReplicationFailing'), expr: ||| - rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 - |||, + rate(cortex_alertmanager_state_replication_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(2), 'for': '10m', labels: { severity: 'critical', @@ -66,8 +66,8 @@ { alert: $.alertName('AlertmanagerPersistStateFailing'), expr: ||| - rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 - |||, + rate(cortex_alertmanager_state_persist_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(15), 'for': '1h', labels: { severity: 'critical', @@ -81,8 +81,8 @@ { alert: $.alertName('AlertmanagerInitialSyncFailed'), expr: ||| - increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 - |||, + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[%s]) > 0 + ||| % $.alertRangeInterval(1), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet index 
b550937d..c586c6dc 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts-utils.libsonnet @@ -54,4 +54,6 @@ for group in groups ], + alertRangeInterval(multiple):: + ($._config.base_alerts_range_interval_minutes * multiple) + 'm', } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet index efe1f119..ca67c94b 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet @@ -34,14 +34,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Note if alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. expr: ||| - 100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[1m])) + 100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[%(range_interval)s])) / - sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[1m])) + sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s])) > 1 ||| % { group_by: $._config.alert_aggregation_labels, job_label: $._config.per_job_label, excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes), + range_interval: $.alertRangeInterval(1), }, 'for': '15m', labels: { @@ -81,10 +82,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('QueriesIncorrect'), expr: ||| - 100 * sum by (%s) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + 100 * sum by (%(group_by)s) (rate(test_exporter_test_case_result_total{result="fail"}[%(range_interval)s])) / - sum by (%s) (rate(test_exporter_test_case_result_total[5m])) > 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + sum by (%(group_by)s) (rate(test_exporter_test_case_result_total[%(range_interval)s])) > 1 + ||| % { + group_by: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(5), + }, 'for': '15m', labels: { severity: 'warning', @@ -130,8 +134,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('FrontendQueriesStuck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 - ||| % $._config, + sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_frontend_queue_length[%(range_interval)s])) > 0 + ||| % { + group_by: $._config.alert_aggregation_labels, + job_label: $._config.per_job_label, + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', // We don't want to block for longer. 
labels: { severity: 'critical', @@ -145,8 +153,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('SchedulerQueriesStuck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 - ||| % $._config, + sum by (%(group_by)s, %(job_label)s) (min_over_time(cortex_query_scheduler_queue_length[%(range_interval)s])) > 0 + ||| % { + group_by: $._config.alert_aggregation_labels, + job_label: $._config.per_job_label, + range_interval: $.alertRangeInterval(1), + }, 'for': '7m', // We don't want to block for longer. labels: { severity: 'critical', @@ -161,19 +173,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('CacheRequestErrors'), expr: ||| ( - sum by(%s, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) + sum by(%(group_by)s, name, operation) ( + rate(thanos_memcached_operation_failures_total[%(range_interval)s]) or - rate(thanos_cache_operation_failures_total[1m]) + rate(thanos_cache_operation_failures_total[%(range_interval)s]) ) / - sum by(%s, name, operation) ( - rate(thanos_memcached_operations_total[1m]) + sum by(%(group_by)s, name, operation) ( + rate(thanos_memcached_operations_total[%(range_interval)s]) or - rate(thanos_cache_operations_total[1m]) + rate(thanos_cache_operations_total[%(range_interval)s]) ) ) * 100 > 5 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % { + group_by: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'warning', @@ -215,13 +230,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('KVStoreFailure'), expr: ||| ( - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[%(range_interval)s])) / - sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[%(range_interval)s])) ) # We want to get alerted only in case there's a constant failure. 
== 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -316,9 +333,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('StoreGatewayTooManyFailedOperations'), 'for': '5m', expr: ||| - sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 + sum by(%(alert_aggregation_labels)s, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[%(range_interval)s])) > 0 ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(1), }, labels: { severity: 'warning', @@ -502,7 +520,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; %(kube_statefulset_status_replicas_updated)s ) ) and ( - changes(%(kube_statefulset_status_replicas_updated)s[15m:1m]) + changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s]) == 0 ) @@ -513,6 +531,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'), kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'), kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), }, 'for': '30m', labels: { @@ -533,7 +552,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; != %(kube_deployment_status_replicas_updated)s ) and ( - changes(%(kube_deployment_status_replicas_updated)s[15m:1m]) + changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s]) == 0 ) @@ -542,6 +561,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; aggregation_labels: $._config.alert_aggregation_labels, kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'), kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), }, 'for': '30m', labels: { @@ -619,11 +639,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerTooManyFailedPushes'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -638,11 +660,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerTooManyFailedQueries'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[%(range_interval)s])) ) > 1 - ||| % 
$._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -657,11 +681,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerMissedEvaluations'), expr: ||| 100 * ( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[%(range_interval)s])) / - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[%(range_interval)s])) ) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'warning', @@ -675,9 +701,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; { alert: $.alertName('RulerFailedRingCheck'), expr: ||| - sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ruler_ring_check_errors_total[%(range_interval)s])) > 0 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(1), + }, 'for': '5m', labels: { severity: 'critical', @@ -692,11 +720,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('RulerRemoteEvaluationFailing'), expr: ||| 100 * ( - sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %s}[5m])) + sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %(job_regex)s}[%(range_interval)s])) / - sum by (%s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %s}[5m])) + sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %(job_regex)s}[%(range_interval)s])) ) > 1 - ||| % [$._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend), $._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ruler_query_frontend)], + ||| % { + alert_aggregation_labels: $._config.alert_aggregation_labels, + job_regex: $.jobMatcher($._config.job_names.ruler_query_frontend), + range_interval: $.alertRangeInterval(5), + }, 'for': '5m', labels: { severity: 'warning', diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet index 31c9ea1e..28a4028a 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/autoscaling.libsonnet @@ -40,13 +40,14 @@ expr: ||| ( # Find KEDA scalers reporting errors. - label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + label_replace(rate(keda_scaler_errors[%(range_interval)s]), "namespace", "$1", "exported_namespace", "(.*)") # Match only Mimir namespaces. 
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) ) > 0 ||| % { aggregation_labels: $._config.alert_aggregation_labels, + range_interval: $.alertRangeInterval(5), }, labels: { severity: 'critical', diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet index 9cfc25f3..8f6ed51d 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet @@ -79,8 +79,8 @@ alert: $.alertName('IngesterTSDBHeadCompactionFailed'), 'for': '15m', expr: ||| - rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_compactions_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -91,8 +91,8 @@ { alert: $.alertName('IngesterTSDBHeadTruncationFailed'), expr: ||| - rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_head_truncations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -103,8 +103,8 @@ { alert: $.alertName('IngesterTSDBCheckpointCreationFailed'), expr: ||| - rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -115,8 +115,8 @@ { alert: $.alertName('IngesterTSDBCheckpointDeletionFailed'), expr: ||| - rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, @@ -127,8 +127,8 @@ { alert: $.alertName('IngesterTSDBWALTruncationFailed'), expr: ||| - rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 - |||, + rate(cortex_ingester_tsdb_wal_truncations_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'warning', }, @@ -140,11 +140,13 @@ alert: $.alertName('IngesterTSDBWALCorrupted'), expr: ||| # alert when there are more than one corruptions - count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 + count by (%(alert_aggregation_labels)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[%(range_interval)s]) > 0) > 1 and # and there is only one zone count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'critical', deployment: 'single-zone', @@ -157,11 +159,13 @@ alert: $.alertName('IngesterTSDBWALCorrupted'), expr: ||| # alert when there are more than one corruptions - count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 + count by (%(alert_aggregation_labels)s) (sum by (%(alert_aggregation_labels)s, %(per_job_label)s) (rate(cortex_ingester_tsdb_wal_corruptions_total[%(range_interval)s]) > 0)) > 1 and # and there are multiple zones count by (%(alert_aggregation_labels)s) (group by (%(alert_aggregation_labels)s, %(per_job_label)s) 
(cortex_ingester_tsdb_wal_corruptions_total)) > 1 - ||| % $._config, + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'critical', deployment: 'multi-zone', @@ -174,8 +178,8 @@ alert: $.alertName('IngesterTSDBWALWritesFailed'), 'for': '3m', expr: ||| - rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 - |||, + rate(cortex_ingester_tsdb_wal_writes_failed_total[%s]) > 0 + ||| % $.alertRangeInterval(1), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet index e9b4c8ce..6206ff6d 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/compactor.libsonnet @@ -114,8 +114,8 @@ alert: $.alertName('CompactorSkippedUnhealthyBlocks'), 'for': '1m', expr: ||| - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 - |||, + increase(cortex_compactor_blocks_marked_for_no_compaction_total[%s]) > 0 + ||| % $.alertRangeInterval(5), labels: { severity: 'warning', }, @@ -129,8 +129,8 @@ alert: $.alertName('CompactorSkippedUnhealthyBlocks'), 'for': '30m', expr: ||| - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 - |||, + increase(cortex_compactor_blocks_marked_for_no_compaction_total[%s]) > 1 + ||| % $.alertRangeInterval(5), labels: { severity: 'critical', }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet index 1313ee46..264f0e96 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/continuous-test.libsonnet @@ -9,8 +9,10 @@ alert: $.alertName('ContinuousTestNotRunningOnWrites'), 'for': '1h', expr: ||| - sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 - ||| % $._config, + sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_writes_failed_total[%(range_interval)s])) > 0 + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'warning', }, @@ -24,8 +26,10 @@ alert: $.alertName('ContinuousTestNotRunningOnReads'), 'for': '1h', expr: ||| - sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 - ||| % $._config, + sum by(%(alert_aggregation_labels)s, test) (rate(mimir_continuous_test_queries_failed_total[%(range_interval)s])) > 0 + ||| % $._config { + range_interval: $.alertRangeInterval(5), + }, labels: { severity: 'warning', }, diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet index f594bba5..d40c3a23 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet @@ -649,6 +649,9 @@ 'debug_pprof', 
], + // All query methods from IngesterServer interface. Basically everything except Push. + ingester_read_path_routes_regex: '/cortex.Ingester/(QueryStream|QueryExemplars|LabelValues|LabelNames|UserStats|AllUserStats|MetricsForLabelMatchers|MetricsMetadata|LabelNamesAndValues|LabelValuesCardinality|ActiveSeries)', + // The default datasource used for dashboards. dashboard_datasource: 'default', datasource_regex: '', @@ -658,6 +661,10 @@ // Set to four times the scrape interval to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ recording_rules_range_interval: '1m', + // Used to calculate range interval in alerts with default range selector under 10 minutes. + // Needed to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ + base_alerts_range_interval_minutes: 1, + // Used to inject rows into dashboards at specific places that support it. injectRows: {}, @@ -671,5 +678,11 @@ // Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter is // not used (default), then rejected requests are already counted as failures. show_rejected_requests_on_writes_dashboard: false, + + // Show panels that use queries for gRPC-based ingestion (distributor -> ingester) + show_grpc_ingestion_panels: true, + + // Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters) + show_ingest_storage_panels: false, }, } diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index ca75f19c..5ebf1bf8 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -74,18 +74,54 @@ query_frontend: { readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, - instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, - rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, - labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, - labelValuesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_label_name_values"}[$__rate_interval]))' % variables, - seriesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_series"}[$__rate_interval]))' % variables, - remoteReadQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_read"}[$__rate_interval]))' % variables, - metadataQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_metadata"}[$__rate_interval]))' % variables, 
- exemplarsQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_exemplars"}[$__rate_interval]))' % variables, - activeSeriesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_active_series"}[$__rate_interval])) > 0' % variables, - labelNamesCardinalityQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_label_names"}[$__rate_interval])) > 0' % variables, - labelValuesCardinalityQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route="prometheus_api_v1_cardinality_label_values"}[$__rate_interval])) > 0' % variables, - otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)"}[$__rate_interval]))' % variables, + // These query routes are used in the overview and other dashboard, everythign else is considered "other" queries. + // Has to be a list to keep the same colors as before, see overridesNonErrorColorsPalette. + local overviewRoutes = [ + { name: 'instantQuery', displayName: 'instant queries', route: '/api/v1/query', routeLabel: '_api_v1_query' }, + { name: 'rangeQuery', displayName: 'range queries', route: '/api/v1/query_range', routeLabel: '_api_v1_query_range' }, + { name: 'labelNames', displayName: '"label names" queries', route: '/api/v1/labels', routeLabel: '_api_v1_labels' }, + { name: 'labelValues', displayName: '"label values" queries', route: '/api/v1/label_name_values', routeLabel: '_api_v1_label_name_values' }, + { name: 'series', displayName: 'series queries', route: '/api/v1/series', routeLabel: '_api_v1_series' }, + { name: 'remoteRead', displayName: 'remote read queries', route: '/api/v1/read', routeLabel: '_api_v1_read' }, + { name: 'metadata', displayName: 'metadata queries', route: '/api/v1/metadata', routeLabel: '_api_v1_metadata' }, + { name: 'exemplars', displayName: 'exemplar queries', route: '/api/v1/query_exemplars', routeLabel: '_api_v1_query_exemplars' }, + { name: 'activeSeries', displayName: '"active series" queries', route: '/api/v1/cardinality_active_series', routeLabel: '_api_v1_cardinality_active_series' }, + { name: 'labelNamesCardinality', displayName: '"label name cardinality" queries', route: '/api/v1/cardinality_label_names', routeLabel: '_api_v1_cardinality_label_names' }, + { name: 'labelValuesCardinality', displayName: '"label value cardinality" queries', route: '/api/v1/cardinality_label_values', routeLabel: '_api_v1_cardinality_label_values' }, + ], + local overviewRoutesRegex = '(prometheus|api_prom)(%s)' % std.join('|', [r.routeLabel for r in overviewRoutes]), + overviewRoutesOverrides: [ + { + matcher: { + id: 'byRegexp', + // To distinguish between query and query_range, we need to match the route with a negative lookahead. 
+ options: '/.*%s($|[^_])/' % r.routeLabel, + }, + properties: [ + { + id: 'displayName', + value: r.displayName, + }, + ], + } + for r in overviewRoutes + ], + overviewRoutesPerSecond: 'sum by (route) (rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"%(overviewRoutesRegex)s"}[$__rate_interval]))' % (variables { overviewRoutesRegex: overviewRoutesRegex }), + nonOverviewRoutesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~"%(overviewRoutesRegex)s"}[$__rate_interval]))' % (variables { overviewRoutesRegex: overviewRoutesRegex }), + + local queryPerSecond(name) = 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)%(route)s"}[$__rate_interval]))' % + (variables { route: std.filter(function(r) r.name == name, overviewRoutes)[0].routeLabel }), + instantQueriesPerSecond: queryPerSecond('instantQuery'), + rangeQueriesPerSecond: queryPerSecond('rangeQuery'), + labelNamesQueriesPerSecond: queryPerSecond('labelNames'), + labelValuesQueriesPerSecond: queryPerSecond('labelValues'), + seriesQueriesPerSecond: queryPerSecond('series'), + remoteReadQueriesPerSecond: queryPerSecond('remoteRead'), + metadataQueriesPerSecond: queryPerSecond('metadata'), + exemplarsQueriesPerSecond: queryPerSecond('exemplars'), + activeSeriesQueriesPerSecond: queryPerSecond('activeSeries'), + labelNamesCardinalityQueriesPerSecond: queryPerSecond('labelNamesCardinality'), + labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'), // Read failures rate as percentage of total requests. readFailuresRate: ||| diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index a269ccc9..47a347cb 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1,22 +1,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'grafana-builder/grafana.libsonnet') { - local resourceRequestColor = '#FFC000', - local resourceLimitColor = '#E02F44', - local successColor = '#7EB26D', - local warningColor = '#EAB839', - local errorColor = '#E24D42', + _colors:: { + resourceRequest: '#FFC000', + resourceLimit: '#E02F44', + success: '#7EB26D', + clientError: '#EF843C', + warning: '#EAB839', + failed: '#E24D42', // "error" is reserved word in Jsonnet. + }, // Colors palette picked from Grafana UI, excluding red-ish colors which we want to keep reserved for errors / failures. 
- local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE'], + local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE', '#3F6833', '#447EBC', '#967302', '#5794F2'], local resourceRequestStyle = $.overrideFieldByName('request', [ - $.overrideProperty('color', { mode: 'fixed', fixedColor: resourceRequestColor }), + $.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceRequest }), $.overrideProperty('custom.fillOpacity', 0), $.overrideProperty('custom.lineStyle', { fill: 'dash' }), ]), local resourceLimitStyle = $.overrideFieldByName('limit', [ - $.overrideProperty('color', { mode: 'fixed', fixedColor: resourceLimitColor }), + $.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceLimit }), $.overrideProperty('custom.fillOpacity', 0), $.overrideProperty('custom.lineStyle', { fill: 'dash' }), ]), @@ -196,14 +199,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; qpsPanel(selector, statusLabelName='status_code'):: super.qpsPanel(selector, statusLabelName) + $.aliasColors({ - '1xx': warningColor, - '2xx': successColor, + '1xx': $._colors.warning, + '2xx': $._colors.success, '3xx': '#6ED0E0', '4xx': '#EF843C', - '5xx': errorColor, - OK: successColor, - success: successColor, - 'error': errorColor, + '5xx': $._colors.failed, + OK: $._colors.success, + success: $._colors.success, + 'error': $._colors.failed, cancel: '#A9A9A9', }) + { fieldConfig+: { @@ -260,15 +263,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Set the failure color only if there's just 1 legend and it doesn't contain any placeholder. $.aliasColors( if (std.type(legends) == 'string' && std.length(std.findSubstr('{', legends[0])) == 0) then { - [legends]: errorColor, + [legends]: $._colors.failed, } else {} ), successFailurePanel(successMetric, failureMetric):: $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.aliasColors({ - successful: successColor, - failed: errorColor, + successful: $._colors.success, + failed: $._colors.failed, }), // successFailureCustomPanel is like successFailurePanel() but allows to customize the legends @@ -277,8 +280,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; successFailureCustomPanel(queries, legends):: $.queryPanel(queries, legends) + $.aliasColors({ - [legends[0]]: successColor, - [legends[1]]: errorColor, + [legends[0]]: $._colors.success, + [legends[1]]: $._colors.failed, }), // Displays started, completed and failed rate. @@ -288,8 +291,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack + $.aliasColors({ started: '#34CCEB', - completed: successColor, - failed: errorColor, + completed: $._colors.success, + failed: $._colors.failed, }), resourceUtilizationAndLimitLegend(resourceName):: @@ -993,9 +996,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; thresholds: { mode: 'absolute', steps: [ - { color: successColor, value: null }, - { color: warningColor, value: 0.01 }, // 1% - { color: errorColor, value: 0.05 }, // 5% + { color: $._colors.success, value: null }, + { color: $._colors.warning, value: 0.01 }, // 1% + { color: $._colors.failed, value: 0.05 }, // 5% ], }, }, @@ -1343,6 +1346,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), legends)), }, + overridesNonErrorColorsPalette(overrides):: std.mapWithIndex(function(idx, override) ( + // Do not define an override if we exhausted the colors in the palette. 
+ // Grafana will automatically choose another color. + if idx >= std.length(nonErrorColorsPalette) then override else + { + matcher: override.matcher, + properties: override.properties + [ + { + id: 'color', + value: { + fixedColor: nonErrorColorsPalette[idx], + mode: 'fixed', + }, + }, + ], + } + ), overrides), + // Panel query override functions overrideField(matcherId, options, overrideProperties):: { matcher: { diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet index 5b754d73..bbd038ca 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet @@ -175,42 +175,29 @@ local filename = 'mimir-overview.json'; ) ) .addPanel( - local legends = [ - 'instant queries', - 'range queries', - '"label names" queries', - '"label values" queries', - 'series queries', - 'remote read queries', - 'metadata queries', - 'exemplar queries', - '"active series" queries', - '"label name cardinality" queries', - '"label value cardinality" queries', - 'other', - ]; - $.timeseriesPanel('Queries / sec') + - $.queryPanel( - [ - $.queries.query_frontend.instantQueriesPerSecond, - $.queries.query_frontend.rangeQueriesPerSecond, - $.queries.query_frontend.labelNamesQueriesPerSecond, - $.queries.query_frontend.labelValuesQueriesPerSecond, - $.queries.query_frontend.seriesQueriesPerSecond, - $.queries.query_frontend.remoteReadQueriesPerSecond, - $.queries.query_frontend.metadataQueriesPerSecond, - $.queries.query_frontend.exemplarsQueriesPerSecond, - $.queries.query_frontend.activeSeriesQueriesPerSecond, - $.queries.query_frontend.labelNamesCardinalityQueriesPerSecond, - $.queries.query_frontend.labelValuesCardinalityQueriesPerSecond, - $.queries.query_frontend.otherQueriesPerSecond, + { + targets: [ + { + expr: $.queries.query_frontend.overviewRoutesPerSecond, + format: 'time_series', + legendLink: null, + }, + { + expr: $.queries.query_frontend.nonOverviewRoutesPerSecond, + format: 'time_series', + legendFormat: 'other', + legendLink: null, + }, ], - legends, - ) + - $.panelSeriesNonErrorColorsPalette(legends) + - $.stack + - { fieldConfig+: { defaults+: { unit: 'reqps' } } }, + } + + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + overrides+: $.overridesNonErrorColorsPalette($.queries.query_frontend.overviewRoutesOverrides), + }, + } + + $.stack ) ) diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet index 129a1d92..93d605b0 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/queries.libsonnet @@ -167,6 +167,157 @@ local filename = 'mimir-queries.json'; { fieldConfig+: { defaults+: { unit: 'short' } } }, ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage: strong consistency)')) + .addPanel( + $.timeseriesPanel('Requests with strong read consistency / sec') + + $.panelDescription( + 'Requests with strong read consistency / sec', + 
||| + Shows rate of requests with strong read consistency, and rate of failed requests with strong read consistency. + ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Requests with strong read consistency ratio') + + $.panelDescription( + 'Requests with strong read consistency ratio', + ||| + Ratio between requests with strong read consistency and all read requests on ingesters. + ||| + ) + + $.queryPanel( + [ + ||| + ( + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ) + / + sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + / + sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], + ], + ['successful', 'failed'], + ) + + $.aliasColors({ failed: $._colors.failed, successful: $._colors.success }) + + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + + $.stack + ) + .addPanel( + $.timeseriesPanel('Strong read consistency queries — wait latency') + + $.panelDescription( + 'Strong read consistency queries — wait latency', + ||| + How long does the request wait to guarantee strong read consistency. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage: last produced offset)')) + .addPanel( + $.timeseriesPanel('Last produced offset requests / sec') + + $.panelDescription( + 'Rate of requests to fetch last produced offset for partition', + ||| + Shows rate of requests to fetch last produced offset for partition, and rate of failed requests. + ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Last produced offset latency') + + $.panelDescription( + 'Latency', + ||| + How long does it take to fetch "last produced offset" of partition. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) .addRow( $.row('Querier') .addPanel( diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet index dd149a93..23c4d4de 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/reads.libsonnet @@ -299,16 +299,16 @@ local filename = 'mimir-reads.json'; $.row('Ingester') .addPanel( $.timeseriesPanel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"%s"}' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex]) ) .addPanel( $.timeseriesPanel('Latency') + - $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + $.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', $._config.ingester_read_path_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex], '' ) ) ) diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet index b133f01f..b1ed99ad 100644 --- 
a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/slow-queries.libsonnet @@ -7,7 +7,7 @@ local filename = 'mimir-slow-queries.json'; ($.dashboard('Slow queries') + { uid: std.md5(filename) }) .addClusterSelectorTemplates(false) .addRow( - $.row('Accross tenants') + $.row('Across tenants') .addPanel( $.timeseriesPanel('Response time') + $.lokiMetricsQueryPanel( diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet index 4359ee3d..96eab666 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/top-tenants.libsonnet @@ -58,8 +58,10 @@ local filename = 'mimir-top-tenants.json'; distributor: $.jobMatcher($._config.job_names.distributor), group_by_cluster: $._config.group_by_cluster, }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -72,8 +74,10 @@ local filename = 'mimir-top-tenants.json'; $.tablePanel( [ 'topk($limit, %(in_memory_series_per_user)s)' % { in_memory_series_per_user: in_memory_series_per_user_query() }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -107,8 +111,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_distributor_received_samples_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'samples/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'samples/s' }, + } ) ), ) @@ -143,8 +149,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_discarded_samples_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.ingester + $._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'samples/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'samples/s' }, + } ) ), ) @@ -190,8 +198,10 @@ local filename = 'mimir-top-tenants.json'; distributor: $.jobMatcher($._config.job_names.distributor), group_by_cluster: $._config.group_by_cluster, }, - ], - { 'Value #A': { alias: 'series' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'series' }, + } ) ), ) @@ -205,8 +215,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (user) (rate(cortex_distributor_received_exemplars_total{%(job)s}[5m])))' % { job: $.jobMatcher($._config.job_names.distributor) }, - ], - { 'Value #A': { alias: 'exemplars/s' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'exemplars/s' }, + } ) ), ) @@ -221,8 +233,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (rule_group, user) (cortex_prometheus_rule_group_rules{%(job)s}))' % { job: $.jobMatcher($._config.job_names.ruler) }, - ], - { 'Value #A': { alias: 'rules' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'rules' }, + } ) ), ) @@ -236,8 +250,10 @@ local filename = 'mimir-top-tenants.json'; [ 'topk($limit, sum by (rule_group, user) 
(cortex_prometheus_rule_group_last_duration_seconds{%(job)s}))' % { job: $.jobMatcher($._config.job_names.ruler) }, - ], - { 'Value #A': { alias: 'seconds' } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'seconds' }, + } ) ) ) @@ -256,8 +272,10 @@ local filename = 'mimir-top-tenants.json'; (sum(rate(cortex_bucket_index_estimated_compaction_jobs_errors_total{%s}[$__rate_interval])) == 0) ) ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], - ], - { Value: { alias: 'Compaction Jobs', decimals: 0 } } + ], { + user: { alias: 'user', unit: 'string' }, + Value: { alias: 'Compaction Jobs', decimals: 0 }, + } ) ), ), diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet index 9d80feb0..61205581 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet @@ -3,6 +3,7 @@ local filename = 'mimir-writes.json'; (import 'dashboard-utils.libsonnet') + (import 'dashboard-queries.libsonnet') { + [filename]: assert std.md5(filename) == '8280707b8f16e7b87b840fc1cc92d4c5' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Writes') + { uid: std.md5(filename) }) @@ -162,10 +163,39 @@ local filename = 'mimir-writes.json'; 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], '' ) ) + .addPanelIf( + $._config.show_ingest_storage_panels, + $.timeseriesPanel('Sync write to Kafka latency (ingest storage)') + + $.panelDescription( + 'Sync write to Kafka latency (ingest storage)', + ||| + Latency of synchronous write operation used to store data into Kafka. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_writer_latency_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.distributor)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) ) .addRowsIf(std.objectHasAll($._config.injectRows, 'postDistributor'), $._config.injectRows.postDistributor($)) - .addRow( - $.row('Ingester') + .addRowIf( + $._config.show_grpc_ingestion_panels, + ($.row('Ingester')) .addPanel( $.timeseriesPanel('Requests / sec') + $.panelDescription( @@ -206,6 +236,202 @@ local filename = 'mimir-writes.json'; ) ) ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage)')) + .addPanel( + $.timeseriesPanel('Kafka fetches / sec') + + $.panelDescription( + 'Kafka fetches / sec', + ||| + Rate of fetches received from Kafka brokers. A fetch can contain multiple records (a write request received on the write path is mapped into a single record). + Read errors are any errors reported on connection to Kafka brokers, and are separate from "failed" fetches. + ||| + ) + + $.queryPanel( + [ + ||| + sum (rate (cortex_ingest_storage_reader_fetches_total{%s}[$__rate_interval])) + - + sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + // cortex_ingest_storage_reader_read_errors_total metric is reported by Kafka client. + 'sum (rate (cortex_ingest_storage_reader_read_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + 'read errors', + ], + ) + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed, 'read errors': $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Kafka records / sec') + + $.panelDescription( + 'Kafka records / sec', + ||| + Rate of processed records from Kafka. Failed records are categorized as "client" errors (e.g. per-tenant limits) or server errors. 
+ ||| + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_records_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s, cause="client"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s, cause="server"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed (client)', + 'failed (server)', + ], + ) + $.aliasColors({ successful: $._colors.success, 'failed (client)': $._colors.clientError, 'failed (server)': $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Kafka record processing latency') + + $.panelDescription( + 'Kafka record processing latency', + ||| + Time used to process a single record (write request). This time is spent by appending data to per-tenant TSDB. + ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_processing_time_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage – end-to-end latency)')) + .addPanel( + $.timeseriesPanel('Kafka record end-to-end latency when ingesters are running') + + $.panelDescription( + 'Kafka record end-to-end latency when ingesters are running', + ||| + Time between writing request by distributor to Kafka and reading the record by ingester, when ingesters are running. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="running"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + .addPanel( + $.timeseriesPanel('Kafka record end-to-end latency when starting') + + $.panelDescription( + 'Kafka record end-to-end latency when starting', + ||| + Time between writing request by distributor to Kafka and reading the record by ingester during catch-up phase, when ingesters are starting. + If ingesters are not starting and catching up in the selected time range, this panel will be empty. + ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_receive_delay_seconds{%s, phase="starting"}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + ($.row('Ingester (ingest storage - last consumed offset)')) + .addPanel( + $.timeseriesPanel('Last consumed offset commits / sec') + + $.panelDescription( + 'Last consumed offset commits / sec', + ||| + Rate of "last consumed offset" commits issued by ingesters to Kafka. + ||| + ) + + $.queryPanel( + [ + ||| + sum (rate (cortex_ingest_storage_reader_offset_commit_requests_total{%s}[$__rate_interval])) + - + sum (rate (cortex_ingest_storage_reader_offset_commit_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum (rate (cortex_ingest_storage_reader_offset_commit_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + 'successful', + 'failed', + ], + ) + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + ) + .addPanel( + $.timeseriesPanel('Last consumed offset commits latency') + + $.panelDescription( + 'Last consumed offset commits latency', + ||| + Time spent to commit "last consumed offset" by ingesters to Kafka. 
+ ||| + ) + + $.queryPanel( + [ + 'histogram_quantile(0.5, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_offset_commit_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], + ], + [ + '50th percentile', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + ) + ) .addRowIf( $._config.gateway_enabled && $._config.autoscaling.gateway.enabled, $.cpuAndMemoryBasedAutoScalingRow('Gateway'), diff --git a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json index 2dde1800..3cad418e 100644 --- a/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/mimir-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "7561fd330312538d22b00e0c7caecb4ba66321ea", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "7561fd330312538d22b00e0c7caecb4ba66321ea", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" } ], "legacyImports": false diff --git a/monitoring-mixins/pyroscope-mixin/jsonnetfile.lock.json b/monitoring-mixins/pyroscope-mixin/jsonnetfile.lock.json index b0614aaf..b7152af4 100644 --- a/monitoring-mixins/pyroscope-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/pyroscope-mixin/jsonnetfile.lock.json @@ -18,8 +18,8 @@ "subdir": "grafana-builder" } }, - "version": "f364a45c27c385d86e399cd29e0b63baab7ac10a", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "mixin-utils" } }, - "version": "f364a45c27c385d86e399cd29e0b63baab7ac10a", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" }, { "source": { @@ -38,7 +38,7 @@ "subdir": "operations/pyroscope/jsonnet/pyroscope-mixin/pyroscope-mixin" } }, - "version": "26b27357682530445808d5b901fff29f572b67ab", + "version": "65be0e07d548a7f6e048fb419f9a0c8f58bb9465", "sum": "8ruCWQHfhd+RHAUnFBVl6Seryd9EEqCXZUm/dsSW5Wc=" } ], diff --git a/monitoring-mixins/tempo-mixin/deploy.libsonnet b/monitoring-mixins/tempo-mixin/deploy.libsonnet index 31fe6fd5..9ffdcd84 100644 --- a/monitoring-mixins/tempo-mixin/deploy.libsonnet +++ b/monitoring-mixins/tempo-mixin/deploy.libsonnet @@ -1,3 +1,14 @@ (import 'mixin.libsonnet') + 
{ // Config overrides + _config+:: { + jobs: { + gateway: 'cortex-gw(-internal)?', + query_frontend: '(tempo|query-frontend)', + querier: '(tempo|querier)', + ingester: '(tempo|ingester)', + metrics_generator: '(tempo|metrics-generator)', + distributor: '(tempo|distributor)', + compactor: '(tempo|compactor)', + } + } } \ No newline at end of file diff --git a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-reads.json b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-reads.json index 991db79d..7eabd459 100644 --- a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-reads.json +++ b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-reads.json @@ -260,7 +260,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -335,7 +335,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -344,7 +344,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -353,7 +353,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (route)", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (route)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -452,7 +452,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", 
route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -527,7 +527,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -536,7 +536,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -545,7 +545,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route)", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -644,7 +644,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -719,7 +719,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3", + "expr": "histogram_quantile(0.99, 
sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (le,endpoint)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -728,7 +728,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (le,endpoint)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -737,7 +737,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint)", + "expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (endpoint)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -836,7 +836,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -911,7 +911,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -920,7 +920,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -929,7 +929,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", 
job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route)", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1028,7 +1028,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -1103,7 +1103,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1112,7 +1112,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1121,7 +1121,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ 
-1220,7 +1220,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -1295,7 +1295,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1304,7 +1304,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1313,7 +1313,7 @@ "step": 10 }, { - "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, diff --git a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-resources.json b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-resources.json index 6782ffa8..1b2ff2d4 100644 --- a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-resources.json +++ b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-resources.json @@ -356,21 +356,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", 
namespace=~\"$namespace\",container=~\"distributor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -456,21 +456,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -545,7 +545,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -643,21 +643,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": 
"min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -743,21 +743,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -832,7 +832,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ingester\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -930,21 +930,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1030,21 +1030,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", 
namespace=~\"$namespace\",container=~\"metrics-generator\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1119,7 +1119,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|metrics-generator)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -1217,21 +1217,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1317,21 +1317,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"} > 0)", + "expr": 
"min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1406,7 +1406,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -1504,21 +1504,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1604,21 +1604,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": 
"1m", "legendFormat": "request", @@ -1693,7 +1693,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/querier\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -1791,21 +1791,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1891,21 +1891,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -1980,7 +1980,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", diff --git a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-tenants.json b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-tenants.json index 
c3ec5f23..13fe4958 100644 --- a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-tenants.json +++ b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-tenants.json @@ -77,7 +77,7 @@ ], "targets": [ { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n) by (limit_name)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"})\n) by (limit_name)\n", "format": "table", "instant": true, "legendFormat": "", @@ -176,21 +176,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "received", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n", "format": "time_series", "interval": "1m", "legendFormat": "burst limit", @@ -265,14 +265,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", 
job=~\"($namespace)/(tempo|distributor)\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "accepted", "legendLink": null }, { - "expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)", + "expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)", "format": "time_series", "interval": "1m", "legendFormat": "refused {{ reason }}", @@ -358,21 +358,21 @@ "steppedLine": false, "targets": [ { - "expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",tenant=\"$tenant\"})", + "expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", "legendFormat": "live traces", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n", "format": "time_series", "interval": "1m", "legendFormat": "global limit", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n", "format": "time_series", "interval": "1m", "legendFormat": "local limit", @@ -459,7 +459,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)", + "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)", "format": "time_series", "interval": "1m", "legendFormat": "{{ status }}", @@ -534,7 +534,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)", + 
"expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)", "format": "time_series", "interval": "1m", "legendFormat": "{{ status }}", @@ -615,7 +615,7 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})", + "expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", "legendFormat": "length", @@ -684,7 +684,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n", + "expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"})\n", "format": "time_series", "interval": "1m", "legendFormat": "blocks", @@ -765,7 +765,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|metrics-generator)\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "rate", @@ -846,14 +846,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"})", + "expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|metrics-generator)\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{ tenant }}", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n", "format": "time_series", "interval": "1m", "legendFormat": "limit", @@ -987,7 +987,7 @@ "multi": false, "name": "tenant", "options": [ ], - "query": "label_values(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"}, tenant)", + "query": "label_values(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"}, tenant)", "refresh": 1, "regex": "", "sort": 2, diff --git a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-writes.json 
b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-writes.json index 26bbc007..8b189ebf 100644 --- a/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-writes.json +++ b/monitoring-mixins/tempo-mixin/deploy/dashboards_out/tempo-writes.json @@ -357,14 +357,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "accepted", "legendLink": null }, { - "expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "refused", @@ -439,7 +439,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -448,7 +448,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -457,7 +457,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_distributor_push_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_distributor_push_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_distributor_push_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_distributor_push_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -556,7 +556,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -631,7 +631,7 @@ "steppedLine": 
false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -640,7 +640,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -649,7 +649,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -748,7 +748,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -823,7 +823,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -832,7 +832,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", 
job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -841,7 +841,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -940,7 +940,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -1015,7 +1015,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1024,7 +1024,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1033,7 +1033,7 @@ "step": 10 }, { - "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1132,7 +1132,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -1207,7 +1207,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1216,7 +1216,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1225,7 +1225,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1324,7 +1324,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", 
\"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -1399,7 +1399,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1408,7 +1408,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1417,7 +1417,7 @@ "step": 10 }, { - "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, diff --git a/monitoring-mixins/tempo-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/tempo-mixin/deploy/manifests/k8s-all-in-one.yaml index f0bacae1..f2b02dfb 100644 --- a/monitoring-mixins/tempo-mixin/deploy/manifests/k8s-all-in-one.yaml +++ b/monitoring-mixins/tempo-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -6962,7 +6962,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -7037,7 +7037,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", 
job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7046,7 +7046,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7055,7 +7055,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=~\"api_.*\"}[$__rate_interval])) by (route)", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",route=~\"api_.*\"}[$__rate_interval])) by (route)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7154,7 +7154,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -7229,7 +7229,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7238,7 +7238,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7247,7 +7247,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) 
by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route)", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",route=~\"querier_api_.*\"}[$__rate_interval])) by (route)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7346,7 +7346,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -7421,7 +7421,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (le,endpoint)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7430,7 +7430,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (le,endpoint)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7439,7 +7439,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint)", + "expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"}[$__rate_interval])) by (endpoint)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7538,7 +7538,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", 
\"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -7613,7 +7613,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7622,7 +7622,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (le,route)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7631,7 +7631,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route)", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Querier/.*\"}[$__rate_interval])) by (route)", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7730,7 +7730,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -7805,7 +7805,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, 
sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7814,7 +7814,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7823,7 +7823,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -7922,7 +7922,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -7997,7 +7997,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -8006,7 +8006,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", 
"intervalFactor": 2, @@ -8015,7 +8015,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\",operation=\"GET\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -8544,21 +8544,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -8644,21 +8644,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|distributor)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -8733,7 +8733,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(instance) 
(go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -8831,21 +8831,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -8931,21 +8931,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|ingester)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9020,7 +9020,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ingester\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -9118,21 +9118,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"}[$__rate_interval]))", + "expr": "sum by(pod) 
(rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9218,21 +9218,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|metrics-generator)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9307,7 +9307,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|metrics-generator)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -9405,21 +9405,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"} / container_spec_cpu_period{cluster=~\"$cluster\", 
namespace=~\"$namespace\",container=~\"query-frontend\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9505,21 +9505,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|query-frontend)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9594,7 +9594,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -9692,21 +9692,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", 
namespace=~\"$namespace\",container=~\"querier\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9792,21 +9792,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"})", + "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|querier)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -9881,7 +9881,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/querier\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|querier)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -9979,21 +9979,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -10079,21 +10079,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"})", + "expr": "sum by(pod) 
(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"(tempo|compactor)\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", "legendFormat": "request", @@ -10168,7 +10168,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})", + "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{instance}}", @@ -11839,7 +11839,7 @@ data: ], "targets": [ { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n) by (limit_name)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"})\n) by (limit_name)\n", "format": "table", "instant": true, "legendFormat": "", @@ -11938,21 +11938,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "received", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n", "format": "time_series", "interval": "1m", "legendFormat": "limit", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) 
(tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n", "format": "time_series", "interval": "1m", "legendFormat": "burst limit", @@ -12027,14 +12027,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "accepted", "legendLink": null }, { - "expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)", + "expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)", "format": "time_series", "interval": "1m", "legendFormat": "refused {{ reason }}", @@ -12120,21 +12120,21 @@ data: "steppedLine": false, "targets": [ { - "expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",tenant=\"$tenant\"})", + "expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", "legendFormat": "live traces", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n", "format": "time_series", "interval": "1m", "legendFormat": "global limit", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) 
(tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n", "format": "time_series", "interval": "1m", "legendFormat": "local limit", @@ -12221,7 +12221,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)", + "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)", "format": "time_series", "interval": "1m", "legendFormat": "{{ status }}", @@ -12296,7 +12296,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)", + "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|query-frontend)\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)", "format": "time_series", "interval": "1m", "legendFormat": "{{ status }}", @@ -12377,7 +12377,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})", + "expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", "legendFormat": "length", @@ -12446,7 +12446,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n", + "expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"})\n", "format": "time_series", "interval": "1m", "legendFormat": "blocks", @@ -12527,7 +12527,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|metrics-generator)\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "rate", @@ -12608,14 +12608,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"})", + "expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|metrics-generator)\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", "legendFormat": "{{ tenant }}", "legendLink": null }, { - "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", 
job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n", + "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n", "format": "time_series", "interval": "1m", "legendFormat": "limit", @@ -12749,7 +12749,7 @@ data: "multi": false, "name": "tenant", "options": [ ], - "query": "label_values(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"}, tenant)", + "query": "label_values(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\"}, tenant)", "refresh": 1, "regex": "", "sort": 2, @@ -13166,14 +13166,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "accepted", "legendLink": null }, { - "expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval]))", + "expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", "legendFormat": "refused", @@ -13248,7 +13248,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13257,7 +13257,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_distributor_push_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13266,7 +13266,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_distributor_push_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_distributor_push_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_distributor_push_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by () * 1e3 / 
sum(rate(tempo_distributor_push_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|distributor)\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13365,7 +13365,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -13440,7 +13440,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13449,7 +13449,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13458,7 +13458,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13557,7 +13557,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval]),\n 
\"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -13632,7 +13632,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13641,7 +13641,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13650,7 +13650,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13749,7 +13749,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -13824,7 +13824,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13833,7 +13833,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, 
sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13842,7 +13842,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|ingester)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13941,7 +13941,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -14016,7 +14016,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -14025,7 +14025,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -14034,7 +14034,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempo_memcache_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",method=\"Memcache.Put\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -14133,7 +14133,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", "legendFormat": "{{status}}", @@ -14208,7 +14208,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -14217,7 +14217,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempodb_backend_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by (le,)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -14226,7 +14226,7 @@ data: "step": 10 }, { - "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", + "expr": "sum(rate(tempodb_backend_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by () * 1e3 / sum(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(tempo|compactor)\",operation=~\"(PUT|POST)\"}[$__rate_interval])) by ()", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -14418,7 +14418,7 @@ spec: message: There are {{ printf "%f" $value }} unhealthy compactor(s). 
runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy expr: | - max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="tempo", namespace=~".*"}) > 0 + max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name=~"(tempo|compactor)", namespace=~".*"}) > 0 for: 15m labels: severity: critical @@ -14427,7 +14427,7 @@ spec: message: There are {{ printf "%f" $value }} unhealthy distributor(s). runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy expr: | - max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="tempo", namespace=~".*"}) > 0 + max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name=~"(tempo|distributor)", namespace=~".*"}) > 0 for: 15m labels: severity: warning diff --git a/monitoring-mixins/tempo-mixin/deploy/prometheus-alerts.yaml b/monitoring-mixins/tempo-mixin/deploy/prometheus-alerts.yaml index f75ed1df..12ddbf48 100644 --- a/monitoring-mixins/tempo-mixin/deploy/prometheus-alerts.yaml +++ b/monitoring-mixins/tempo-mixin/deploy/prometheus-alerts.yaml @@ -23,7 +23,7 @@ spec: message: There are {{ printf "%f" $value }} unhealthy compactor(s). runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy expr: | - max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="tempo", namespace=~".*"}) > 0 + max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name=~"(tempo|compactor)", namespace=~".*"}) > 0 for: 15m labels: severity: critical @@ -32,7 +32,7 @@ spec: message: There are {{ printf "%f" $value }} unhealthy distributor(s). runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy expr: | - max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="tempo", namespace=~".*"}) > 0 + max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name=~"(tempo|distributor)", namespace=~".*"}) > 0 for: 15m labels: severity: warning diff --git a/monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml b/monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml index bb8bdfeb..ed9b58c5 100644 --- a/monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml +++ b/monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml @@ -16,7 +16,7 @@ groups: message: There are {{ printf "%f" $value }} unhealthy compactor(s). runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy expr: | - max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="compactor", namespace=~".*"}) > 0 + max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name=~"(tempo|compactor)", namespace=~".*"}) > 0 for: 15m labels: severity: critical @@ -25,7 +25,7 @@ groups: message: There are {{ printf "%f" $value }} unhealthy distributor(s).
runbook_url: https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy expr: | - max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="distributor", namespace=~".*"}) > 0 + max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name=~"(tempo|distributor)", namespace=~".*"}) > 0 for: 15m labels: severity: warning diff --git a/monitoring-mixins/tempo-mixin/jsonnetfile.lock.json b/monitoring-mixins/tempo-mixin/jsonnetfile.lock.json index d62bc79d..39cbb27e 100644 --- a/monitoring-mixins/tempo-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/tempo-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "3f2b0ee0b6853c12d19cf059c590390f6948929b", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "3f2b0ee0b6853c12d19cf059c590390f6948929b", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" }, { "source": { @@ -28,7 +28,7 @@ "subdir": "operations/tempo-mixin" } }, - "version": "ae083c34b82a07dd72bfb36792e0b2a380634e94", + "version": "89d7c9e58788547940215dec566d92f631ecd3ca", "sum": "PCP6wd28E7IJmZs0hpW6VCGFo4L0KJ9Uvf0KOtDU2iI=" } ], diff --git a/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet b/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet index 0bd0b339..cc43f483 100644 --- a/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet +++ b/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet @@ -1,3 +1,5 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { dashboard(title, uid='', datasource='default', datasource_regex=''):: { // Stuff that isn't materialised. @@ -70,6 +72,40 @@ }, }, + addShowNativeLatencyVariable():: self { + templating+: { + list+: [{ + current: { + selected: true, + text: 'classic', + value: '1', + }, + description: 'Choose between showing latencies based on low precision classic or high precision native histogram metrics.', + hide: 0, + includeAll: false, + label: 'Latency metrics', + multi: false, + name: 'latency_metrics', + query: 'native : -1,classic : 1', + options: [ + { + selected: false, + text: 'native', + value: '-1', + }, + { + selected: true, + text: 'classic', + value: '1', + }, + ], + skipUrlSync: false, + type: 'custom', + useTags: false, + }], + }, + }, + dashboardLinkUrl(title, url):: self { links+: [ { diff --git a/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet b/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet index 622598f7..d669aa55 100644 --- a/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet +++ b/monitoring-mixins/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet @@ -1,6 +1,93 @@ local g = import 'grafana-builder/grafana.libsonnet'; { + // The classicNativeHistogramQuantile function is used to calculate histogram quantiles from native histograms or classic histograms. + // Metric name should be provided without _bucket suffix.
+ nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier=''):: + local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; + local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(rate(%(metric)s_bucket{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + classicSumBy: classicSumBy, + metric: metric, + multiplierStr: multiplierStr, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(rate(%(metric)s{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + metric: metric, + multiplierStr: multiplierStr, + nativeSumBy: nativeSumBy, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // The classicNativeHistogramSumRate function is used to calculate the histogram sum of rate from native histograms or classic histograms. + // Metric name should be provided without _sum suffix. + nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_sum{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_sum(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + + // The classicNativeHistogramCountRate function is used to calculate the histogram count of rate from native histograms or classic histograms. + // Metric name should be provided without _count suffix. + nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_count{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // TODO(krajorama) Switch to histogram_avg function for native histograms later. + nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier=''):: + local multiplierStr = if multiplier == '' then '' else '%s * ' % multiplier; + { + classic: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).classic, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).classic, + multiplier: multiplierStr, + }, + native: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).native, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).native, + multiplier: multiplierStr, + }, + }, + + // showClassicHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the classic query + // to dashboard variable which should take -1 or +1 as values in order to hide or show the classic query. 
+ showClassicHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * +Inf)' % [query.classic, dashboard_variable], + // showNativeHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the native query + // to dashboard variable which should take -1 or +1 as values in order to show or hide the native query. + showNativeHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * -Inf)' % [query.native, dashboard_variable], + histogramRules(metric, labels, interval='1m', record_native=false):: local vars = { metric: metric,
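// ----------------------------------------------------------------------------
// Editor's note: illustrative Jsonnet sketch only; it is not part of this diff
// or of the vendored libraries. It shows how the helpers added above are
// typically consumed from a dashboard. The dashboard title, row title, panel
// title, metric, and selector below are hypothetical; the selector simply
// reuses the (tempo|<component>) pattern applied throughout this change so the
// query matches both monolithic and microservices deployments.
local g = import 'grafana-builder/grafana.libsonnet';
local utils = import 'mixin-utils/utils.libsonnet';

// Label selector assumed for the example; route/job values are placeholders.
local selector = 'cluster=~"$cluster", job=~"($namespace)/(tempo|ingester)", route=~"/tempopb.Pusher/Push.*"';

// p99 push latency in milliseconds, returned as a {classic: ..., native: ...} pair.
local p99 = utils.nativeClassicHistogramQuantile('0.99', 'tempo_request_duration_seconds', selector, multiplier='1e3');

g.dashboard('Tempo / Writes (sketch)')
.addShowNativeLatencyVariable()  // adds the $latency_metrics variable (classic: 1, native: -1)
.addRow(
  g.row('Write path')
  .addPanel(
    g.panel('p99 push latency (ms)') +
    g.queryPanel(
      [
        utils.showClassicHistogramQuery(p99),  // rendered when $latency_metrics = 1
        utils.showNativeHistogramQuery(p99),   // rendered when $latency_metrics = -1
      ],
      ['p99 (classic)', 'p99 (native)'],
    )
  )
)
// ----------------------------------------------------------------------------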