Add reboot queue state metrics to CKE.

Exposes two new gauges, cke_reboot_queue_enabled and cke_reboot_queue_running,
next to the existing cke_reboot_queue_entries. The collector's storage
abstraction gains IsRebootQueueDisabled; when that lookup fails, Collect logs
the error and returns without emitting any metric for the scrape. The
"running" gauge is 1 only while the queue is enabled and holds at least one
entry; "enabled" is 1 whenever the queue is not disabled.

The test table covers an empty queue, one entry, two entries, and two entries
with the queue disabled, and checks all three gauges for each case.

diff --git a/docs/metrics.md b/docs/metrics.md
index 59f315b6..86420df4 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -9,8 +9,10 @@ CKE exposes the following metrics with the Prometheus format at `/metrics` REST
 | node_reboot_status | The reboot status of a node. | Gauge | `node`, `status` |
 | operation_phase | 1 if CKE is operating in the phase specified by the `phase` label. | Gauge | `phase` |
 | operation_phase_timestamp_seconds | The Unix timestamp when `operation_phase` was last updated. | Gauge | |
+| reboot_queue_enabled | True (=1) if reboot queue is enabled. | Gauge | |
 | reboot_queue_entries | The number of reboot queue entries remaining. | Gauge | |
 | reboot_queue_items | The number reboot queue entries remaining per status. | Gauge | `status` |
+| reboot_queue_running | True (=1) if reboot queue is enabled and the queue is not empty. | Gauge | |
 | sabakan_integration_successful | True (=1) if sabakan-integration satisfies constraints. | Gauge | |
 | sabakan_integration_timestamp_seconds | The Unix timestamp when `sabakan_integration_successful` was last updated. | Gauge | |
 | sabakan_workers | The number of worker nodes for each role. | Gauge | `role` |
diff --git a/metrics/collector.go b/metrics/collector.go
index c77e0c2b..b594022c 100644
--- a/metrics/collector.go
+++ b/metrics/collector.go
@@ -39,6 +39,7 @@ type metricGroup struct {
 // This abstraction is for mock test.
 type storage interface {
 	IsSabakanDisabled(context.Context) (bool, error)
+	IsRebootQueueDisabled(ctx context.Context) (bool, error)
 	GetRebootsEntries(ctx context.Context) ([]*cke.RebootQueueEntry, error)
 	GetCluster(ctx context.Context) (*cke.Cluster, error)
 }
@@ -131,8 +132,10 @@ type nodeMetricsCollector struct {
 var _ prometheus.Collector = &nodeMetricsCollector{}
 
 func (c nodeMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- rebootQueueEnabled
 	ch <- rebootQueueEntries
 	ch <- rebootQueueItems
+	ch <- rebootQueueRunning
 	ch <- nodeRebootStatus
 }
 
@@ -140,6 +143,14 @@ func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
 	defer cancel()
 
+	rqDisabled, err := c.storage.IsRebootQueueDisabled(ctx)
+	if err != nil {
+		log.Error("failed to get if reboot queue is enabled", map[string]interface{}{
+			log.FnError: err,
+		})
+		return
+	}
+
 	rqEntries, err := c.storage.GetRebootsEntries(ctx)
 	if err != nil {
 		log.Error("failed to get reboots entries", map[string]interface{}{
@@ -148,6 +159,14 @@ func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
 		return
 	}
 
+	var rqEnabled, rqRunning float64
+	if !rqDisabled {
+		rqEnabled = 1
+	}
+	if !rqDisabled && len(rqEntries) > 0 {
+		rqRunning = 1
+	}
+
 	cluster, err := c.storage.GetCluster(ctx)
 	if err != nil {
 		log.Error("failed to get cluster", map[string]interface{}{
@@ -158,11 +177,21 @@ func (c nodeMetricsCollector) Collect(ch chan<- prometheus.Metric) {
 	itemCounts := cke.CountRebootQueueEntries(rqEntries)
 	nodeStatus := cke.BuildNodeRebootStatus(cluster.Nodes, rqEntries)
 
+	ch <- prometheus.MustNewConstMetric(
+		rebootQueueEnabled,
+		prometheus.GaugeValue,
+		rqEnabled,
+	)
 	ch <- prometheus.MustNewConstMetric(
 		rebootQueueEntries,
 		prometheus.GaugeValue,
 		float64(len(rqEntries)),
 	)
+	ch <- prometheus.MustNewConstMetric(
+		rebootQueueRunning,
+		prometheus.GaugeValue,
+		rqRunning,
+	)
 	for status, count := range itemCounts {
 		ch <- prometheus.MustNewConstMetric(
 			rebootQueueItems,
diff --git a/metrics/metrics.go b/metrics/metrics.go
index 4737a00a..dd3397a6 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -33,6 +33,13 @@ var operationPhaseTimestampSeconds = prometheus.NewGauge(
 	},
 )
 
+var rebootQueueEnabled = prometheus.NewDesc(
+	prometheus.BuildFQName(namespace, "", "reboot_queue_enabled"),
+	"1 if reboot queue is enabled.",
+	nil,
+	nil,
+)
+
 var rebootQueueEntries = prometheus.NewDesc(
 	prometheus.BuildFQName(namespace, "", "reboot_queue_entries"),
 	"The number of reboot queue entries remaining.",
@@ -47,6 +54,13 @@ var rebootQueueItems = prometheus.NewDesc(
 	nil,
 )
 
+var rebootQueueRunning = prometheus.NewDesc(
+	prometheus.BuildFQName(namespace, "", "reboot_queue_running"),
+	"1 if reboot queue is enabled and the queue is not empty.",
+	nil,
+	nil,
+)
+
 var nodeRebootStatus = prometheus.NewDesc(
 	prometheus.BuildFQName(namespace, "", "node_reboot_status"),
 	"The reboot status of a node.",
diff --git a/metrics/updater_test.go b/metrics/updater_test.go
index d824c351..2664919d 100644
--- a/metrics/updater_test.go
+++ b/metrics/updater_test.go
@@ -42,9 +42,12 @@ type updateOperationPhaseTestCase struct {
 }
 
 type updateRebootQueueEntriesTestCase struct {
-	name     string
-	input    []*cke.RebootQueueEntry
-	expected float64
+	name            string
+	enabled         bool
+	input           []*cke.RebootQueueEntry
+	expectedEnabled float64
+	expectedRunning float64
+	expectedEntries float64
 }
 
 type updateRebootQueueItemsTestCase struct {
@@ -251,24 +254,44 @@ func testUpdateOperationPhase(t *testing.T) {
 func testUpdateRebootQueueEntries(t *testing.T) {
 	testCases := []updateRebootQueueEntriesTestCase{
 		{
-			name:     "zero",
-			input:    nil,
-			expected: 0,
+			name:            "zero",
+			enabled:         true,
+			input:           nil,
+			expectedEnabled: 1,
+			expectedRunning: 0,
+			expectedEntries: 0,
 		},
 		{
-			name: "one",
+			name:    "one",
+			enabled: true,
 			input: []*cke.RebootQueueEntry{
 				{Status: cke.RebootStatusQueued},
 			},
-			expected: 1,
+			expectedEnabled: 1,
+			expectedRunning: 1,
+			expectedEntries: 1,
+		},
+		{
+			name:    "two",
+			enabled: true,
+			input: []*cke.RebootQueueEntry{
+				{Status: cke.RebootStatusQueued},
+				{Status: cke.RebootStatusRebooting},
+			},
+			expectedEnabled: 1,
+			expectedRunning: 1,
+			expectedEntries: 2,
 		},
 		{
-			name: "two",
+			name:    "two-disabled",
+			enabled: false,
 			input: []*cke.RebootQueueEntry{
 				{Status: cke.RebootStatusQueued},
 				{Status: cke.RebootStatusRebooting},
 			},
-			expected: 2,
+			expectedEnabled: 0,
+			expectedRunning: 0,
+			expectedEntries: 2,
 		},
 	}
 	for _, tt := range testCases {
@@ -277,6 +300,7 @@ func testUpdateRebootQueueEntries(t *testing.T) {
 			defer ctx.Done()
 
 			collector, storage := newTestCollector()
+			storage.enableRebootQueue(tt.enabled)
 			storage.setRebootsEntries(tt.input)
 			handler := GetHandler(collector)
 
@@ -289,19 +313,41 @@ func testUpdateRebootQueueEntries(t *testing.T) {
 				t.Fatal(err)
 			}
 
-			metricsFound := false
+			metricsEnabledFound := false
+			metricsRunningFound := false
+			metricsEntriesFound := false
 			for _, mf := range metricsFamily {
-				if *mf.Name != "cke_reboot_queue_entries" {
-					continue
-				}
-				for _, m := range mf.Metric {
-					metricsFound = true
-					if *m.Gauge.Value != tt.expected {
-						t.Errorf("value for cke_reboot_queue_entries is wrong. expected: %f, actual: %f", tt.expected, *m.Gauge.Value)
+				switch *mf.Name {
+				case "cke_reboot_queue_enabled":
+					for _, m := range mf.Metric {
+						metricsEnabledFound = true
+						if *m.Gauge.Value != tt.expectedEnabled {
+							t.Errorf("value for cke_reboot_queue_enabled is wrong. expected: %f, actual: %f", tt.expectedEnabled, *m.Gauge.Value)
+						}
+					}
+				case "cke_reboot_queue_running":
+					for _, m := range mf.Metric {
+						metricsRunningFound = true
+						if *m.Gauge.Value != tt.expectedRunning {
+							t.Errorf("value for cke_reboot_queue_running is wrong. expected: %f, actual: %f", tt.expectedRunning, *m.Gauge.Value)
+						}
+					}
+				case "cke_reboot_queue_entries":
+					for _, m := range mf.Metric {
+						metricsEntriesFound = true
+						if *m.Gauge.Value != tt.expectedEntries {
+							t.Errorf("value for cke_reboot_queue_entries is wrong. expected: %f, actual: %f", tt.expectedEntries, *m.Gauge.Value)
+						}
 					}
 				}
 			}
-			if !metricsFound {
+			if !metricsEnabledFound {
+				t.Errorf("metrics reboot_queue_enabled was not found")
+			}
+			if !metricsRunningFound {
+				t.Errorf("metrics reboot_queue_running was not found")
+			}
+			if !metricsEntriesFound {
 				t.Errorf("metrics reboot_queue_entries was not found")
 			}
 		})
@@ -623,9 +669,10 @@ func newTestCollector() (prometheus.Collector, *testStorage) {
 }
 
 type testStorage struct {
-	sabakanEnabled bool
-	rebootEntries  []*cke.RebootQueueEntry
-	cluster        *cke.Cluster
+	sabakanEnabled     bool
+	rebootQueueEnabled bool
+	rebootEntries      []*cke.RebootQueueEntry
+	cluster            *cke.Cluster
 }
 
 func (s *testStorage) enableSabakan(flag bool) {
@@ -636,6 +683,14 @@ func (s *testStorage) IsSabakanDisabled(_ context.Context) (bool, error) {
 	return !s.sabakanEnabled, nil
 }
 
+func (s *testStorage) IsRebootQueueDisabled(_ context.Context) (bool, error) {
+	return !s.rebootQueueEnabled, nil
+}
+
+func (s *testStorage) enableRebootQueue(flag bool) {
+	s.rebootQueueEnabled = flag
+}
+
 func (s *testStorage) setRebootsEntries(entries []*cke.RebootQueueEntry) {
 	s.rebootEntries = entries
 }