diff --git a/CHANGELOG.md b/CHANGELOG.md
index 08caacbfc6..64bf66306c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,10 @@ Main (unreleased)
   used as a temporary measure, since this flag will be disabled in future
   releases. (@thampiotr)
 
+- Added a new panel to Cluster Overview dashboard to show the number of peers
+  seen by each instance in the cluster. This can help diagnose cluster split
+  brain issues. (@thampiotr)
+
 ### Bugfixes
 
 - Fixed an issue which caused loss of context data in Faro exception. (@codecapitano)
diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet
index 06ad02b552..d5e4ff3fd1 100644
--- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet
+++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet
@@ -225,5 +225,30 @@ local cluster_node_filename = 'alloy-cluster-node.json';
           },
         ])
       ),
+
+      // Number of peers as seen by each instance (cluster_node_peers summed by instance); groups of instances that keep reporting different counts may be in a split brain.
+      (
+        panel.new(title='Number of peers seen by each instance', type='timeseries') +
+        panel.withUnit('peers') +
+        panel.withDescription(|||
+          The number of cluster peers seen by each instance.
+
+          When cluster is converged, every peer should see all the other instances. When we have a split brain or one
+          peer not joining the cluster, we will see two or more groups of instances that report different peer numbers
+          for an extended period of time and not converging.
+
+          This graph helps to identify which instances may be in a split brain state.
+        |||) +
+        panel.withPosition({ h: 12, w: 24, x: 0, y: 18 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr= |||
+              sum by(instance) (cluster_node_peers{%(groupSelector)s})
+            ||| % $._config,
+            legendFormat='{{instance}}',
+          ),
+        ])
+      ),
+
     ]),
 }