Skip to content

Commit

Permalink
update monitoring-mixins
Browse files: browse the repository at this point in the history
Signed-off-by: Weifeng Wang <[email protected]>

update mimir-mixin

Signed-off-by: Weifeng Wang <[email protected]>

update pyroscope-mixin

Signed-off-by: Weifeng Wang <[email protected]>

update loki-mixin

Signed-off-by: Weifeng Wang <[email protected]>

update tempo-mixin

Signed-off-by: Weifeng Wang <[email protected]>

add tempo-mixin

Signed-off-by: Weifeng Wang <[email protected]>
  • Loading branch information
qclaogui committed Apr 1, 2024
1 parent df0e697 commit 5bdb1b9
Show file tree
Hide file tree
Showing 75 changed files with 2,903 additions and 916 deletions.
6 changes: 3 additions & 3 deletions compose.override.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ services:
volumes:
- ./monitoring-mixins/pyroscope-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/pyroscope-mixin
- ./monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
# - ./monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
- ./monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
environment:
GF_LOG_LEVEL: ${GF_LOG_LEVEL:-warn}
GF_DIAGNOSTICS_PROFILING_ENABLED: true
Expand Down Expand Up @@ -83,8 +83,8 @@ services:
- ./monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
- ./monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
- ./monitoring-mixins/pyroscope-mixin/deploy/pyroscope-mixin-rules.yaml:/rules/pyroscope-mixin-rules.yaml
# - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
# - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
- ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
- ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml

# override compose.yaml included service pyroscope's labels and environment to enabled traces data collection
pyroscope:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ services:
grafana:
volumes:
- ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
# - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
- ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin

Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ services:
volumes:
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ services:
volumes:
- ../../../monitoring-mixins/pyroscope-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/pyroscope-mixin
- ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
# - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
- ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
environment:
GF_LOG_LEVEL: ${GF_LOG_LEVEL:-warn}
GF_DIAGNOSTICS_PROFILING_ENABLED: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ services:
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
- ../../../monitoring-mixins/pyroscope-mixin/deploy/pyroscope-mixin-rules.yaml:/rules/pyroscope-mixin-rules.yaml
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ services:
grafana:
volumes:
- ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
# - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
- ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin

4 changes: 2 additions & 2 deletions docker-compose/monolithic-mode/traces/mimirtool.override.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ services:
volumes:
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@ groups:
rules:
- alert: ClusterNotConverging
annotations:
message: Cluster is not converging.
message: 'Cluster is not converging: nodes report different number of peers in the cluster.'
expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0
for: 10m
- alert: ClusterSplitBrain
- alert: ClusterNodeCountMismatch
annotations:
message: Cluster nodes have entered a split brain state.
message: Nodes report different number of peers vs. the count of observed agent metrics. Some agent metrics may be missing or the cluster is in a split brain state.
expr: |
sum without (state) (cluster_node_peers) !=
on (cluster, namespace) group_left
count by (cluster, namespace) (cluster_node_info)
for: 10m
for: 15m
- alert: ClusterNodeUnhealthy
annotations:
message: Cluster node is reporting a health score > 0.
message: Cluster node is reporting a gossip protocol health score > 0.
expr: |
cluster_node_gossip_health_score > 0
for: 10m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@
{
"datasource": "${datasource}",
"description": "Number of spans successfully pushed into the pipeline.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
}
}
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -82,11 +93,6 @@
{
"datasource": "${datasource}",
"description": "The duration of inbound RPCs.\n",
"fieldConfig": {
"defaults": {
"unit": "milliseconds"
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -115,7 +121,7 @@
"yHistogram": true
},
"yAxis": {
"unit": "s"
"unit": "ms"
}
},
"pluginVersion": "9.0.6",
Expand All @@ -129,7 +135,7 @@
"range": true
}
],
"title": "RPC server duration (traces)",
"title": "RPC server duration",
"type": "heatmap"
},
{
Expand All @@ -140,12 +146,17 @@
"x": 0,
"y": 10
},
"title": "Batching [otelcol.processor.batch]",
"title": "Batching of logs, metrics, and traces [otelcol.processor.batch]",
"type": "row"
},
{
"datasource": "${datasource}",
"description": "Number of units in the batch\n",
"description": "Number of spans, metric datapoints, or log lines in a batch\n",
"fieldConfig": {
"defaults": {
"unit": "short"
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -174,7 +185,7 @@
"yHistogram": true
},
"yAxis": {
"unit": "s"
"unit": "short"
}
},
"pluginVersion": "9.0.6",
Expand Down Expand Up @@ -247,6 +258,17 @@
{
"datasource": "${datasource}",
"description": "Number of spans successfully sent to destination.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
}
}
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand All @@ -268,6 +290,17 @@
{
"datasource": "${datasource}",
"description": "Number of spans in failed attempts to send to destination.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
}
}
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1438,6 +1438,17 @@ data:
{
"datasource": "${datasource}",
"description": "Number of spans successfully pushed into the pipeline.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
}
}
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -1491,11 +1502,6 @@ data:
{
"datasource": "${datasource}",
"description": "The duration of inbound RPCs.\n",
"fieldConfig": {
"defaults": {
"unit": "milliseconds"
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -1524,7 +1530,7 @@ data:
"yHistogram": true
},
"yAxis": {
"unit": "s"
"unit": "ms"
}
},
"pluginVersion": "9.0.6",
Expand All @@ -1538,7 +1544,7 @@ data:
"range": true
}
],
"title": "RPC server duration (traces)",
"title": "RPC server duration",
"type": "heatmap"
},
{
Expand All @@ -1549,12 +1555,17 @@ data:
"x": 0,
"y": 10
},
"title": "Batching [otelcol.processor.batch]",
"title": "Batching of logs, metrics, and traces [otelcol.processor.batch]",
"type": "row"
},
{
"datasource": "${datasource}",
"description": "Number of units in the batch\n",
"description": "Number of spans, metric datapoints, or log lines in a batch\n",
"fieldConfig": {
"defaults": {
"unit": "short"
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -1583,7 +1594,7 @@ data:
"yHistogram": true
},
"yAxis": {
"unit": "s"
"unit": "short"
}
},
"pluginVersion": "9.0.6",
Expand Down Expand Up @@ -1656,6 +1667,17 @@ data:
{
"datasource": "${datasource}",
"description": "Number of spans successfully sent to destination.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
}
}
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand All @@ -1677,6 +1699,17 @@ data:
{
"datasource": "${datasource}",
"description": "Number of spans in failed attempts to send to destination.\n",
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 20,
"gradientMode": "hue",
"stacking": {
"mode": "normal"
}
}
}
},
"gridPos": {
"h": 10,
"w": 8,
Expand Down Expand Up @@ -2807,37 +2840,27 @@ spec:
rules:
- alert: ClusterNotConverging
annotations:
message: Cluster is not converging.
message: 'Cluster is not converging: nodes report different number of peers
in the cluster.'
expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers))
!= 0
for: 5m
- alert: ClusterSplitBrain
for: 10m
- alert: ClusterNodeCountMismatch
annotations:
message: Cluster nodes have entered a split brain state.
message: Nodes report different number of peers vs. the count of observed
agent metrics. Some agent metrics may be missing or the cluster is in a
split brain state.
expr: |
sum without (state) (cluster_node_peers) !=
on (cluster, namespace) group_left
count by (cluster, namespace) (cluster_node_info)
for: 5m
- alert: ClusterLamportClockDrift
annotations:
message: Cluster nodes' lamport clocks are not converging.
expr: stddev by (cluster, namespace) (cluster_node_lamport_time) > 4 * sqrt(count
by (cluster, namespace) (cluster_node_info))
for: 5m
for: 15m
- alert: ClusterNodeUnhealthy
annotations:
message: Cluster node is reporting a health score > 0.
message: Cluster node is reporting a gossip protocol health score > 0.
expr: |
cluster_node_gossip_health_score > 0
for: 5m
- alert: ClusterLamportClockStuck
annotations:
message: Cluster nodes's lamport clocks is not progressing.
expr: |
sum by (cluster, namespace, instance) (rate(cluster_node_lamport_time[2m])) == 0
and on (cluster, namespace, instance) (cluster_node_peers > 1)
for: 5m
for: 10m
- alert: ClusterNodeNameConflict
annotations:
message: A node tried to join the cluster with a name conflicting with an
Expand All @@ -2850,7 +2873,7 @@ spec:
message: Cluster node stuck in Terminating state.
expr: sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"})
> 0
for: 5m
for: 10m
- alert: ClusterConfigurationDrift
annotations:
message: Cluster nodes are not using the same configuration file.
Expand All @@ -2870,6 +2893,6 @@ spec:
- alert: UnhealthyComponents
annotations:
message: Unhealthy Flow components detected.
expr: sum(agent_component_controller_running_components{health_type!="healthy"})
expr: sum by (cluster, namespace) (agent_component_controller_running_components{health_type!="healthy"})
> 0
for: 15m
Loading

0 comments on commit 5bdb1b9

Please sign in to comment.