diff --git a/docs/metrics.md b/docs/metrics.md
index 41bc51c4550..3df1df461ce 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -15,18 +15,20 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
 |-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
 | `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `namespace`=<pipelinerun-namespace> | experimental |
 | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name> <br> `*pipelinerun`=<pipelinerun_name> <br> `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> <br> `namespace`=<pipelineruns-taskruns-namespace>| experimental |
-| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> <br> `*reason`=<reason> | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> <br> `namespace`=<pipelineruns-taskruns-namespace> | experimental |
-| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> <br> `*reason`=<reason> | experimental |
-| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_waiting_on_task_resolution_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_waiting_on_pipeline_resolution_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_waiting_on_task_resolution_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=<taskruns-namespace> <br> `pod`= < taskrun_pod_name> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> <br> | experimental |
-| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
+| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | deprecated |
+| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=<status> <br> `*task`=<task_name> <br> `*taskrun`=<taskrun_name> <br> `namespace`=<pipelineruns-taskruns-namespace> | experimental |
+| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | deprecated |
+| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecated |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | | experimental |
+| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
 
 The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics.
 
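Both the deprecated `*_count` names and their replacements are exported side by side while this change is in effect, so dashboards can be migrated gradually. A quick way to confirm a running controller exposes both sets is to scrape its Prometheus endpoint and compare names. The sketch below is illustrative only: the port-forwarded URL and the `renames` map are assumptions drawn from the table above, not part of this change.

```go
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Deprecated name -> replacement, taken from the table above.
	renames := map[string]string{
		"tekton_pipelines_controller_pipelinerun_count":                         "tekton_pipelines_controller_pipelinerun_total",
		"tekton_pipelines_controller_taskrun_count":                             "tekton_pipelines_controller_taskrun_total",
		"tekton_pipelines_controller_running_pipelineruns_count":                "tekton_pipelines_controller_running_pipelineruns",
		"tekton_pipelines_controller_running_taskruns_count":                    "tekton_pipelines_controller_running_taskruns",
		"tekton_pipelines_controller_running_taskruns_throttled_by_quota_count": "tekton_pipelines_controller_running_taskruns_throttled_by_quota",
		"tekton_pipelines_controller_running_taskruns_throttled_by_node_count":  "tekton_pipelines_controller_running_taskruns_throttled_by_node",
	}

	// Assumed: the controller's metrics port is reachable locally, e.g. via
	// `kubectl port-forward`. Adjust the URL for your environment.
	resp, err := http.Get("http://localhost:9090/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	seen := map[string]bool{}
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		if len(fields) == 0 || strings.HasPrefix(fields[0], "#") {
			continue
		}
		// Strip the label block so bare metric names can be compared.
		seen[strings.SplitN(fields[0], "{", 2)[0]] = true
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}

	for oldName, newName := range renames {
		fmt.Printf("%s: deprecated=%v replacement=%v\n", oldName, seen[oldName], seen[newName])
	}
}
```

Once every pair reports both names, alerts and dashboards can be switched to the replacements ahead of any future removal of the deprecated series.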
diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go
index 4794bc8955e..b389cf4907b 100644
--- a/pkg/pipelinerunmetrics/metrics.go
+++ b/pkg/pipelinerunmetrics/metrics.go
@@ -56,20 +56,40 @@ var (
 		stats.UnitDimensionless)
 	prCountView *view.View
 
+	prTotal = stats.Float64("pipelinerun_total",
+		"Number of pipelineruns",
+		stats.UnitDimensionless)
+	prTotalView *view.View
+
 	runningPRsCount = stats.Float64("running_pipelineruns_count",
 		"Number of pipelineruns executing currently",
 		stats.UnitDimensionless)
 	runningPRsCountView *view.View
 
+	runningPRs = stats.Float64("running_pipelineruns",
+		"Number of pipelineruns executing currently",
+		stats.UnitDimensionless)
+	runningPRsView *view.View
+
 	runningPRsWaitingOnPipelineResolutionCount = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution_count",
 		"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
 		stats.UnitDimensionless)
 	runningPRsWaitingOnPipelineResolutionCountView *view.View
 
+	runningPRsWaitingOnPipelineResolution = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution",
+		"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
+		stats.UnitDimensionless)
+	runningPRsWaitingOnPipelineResolutionView *view.View
+
 	runningPRsWaitingOnTaskResolutionCount = stats.Float64("running_pipelineruns_waiting_on_task_resolution_count",
 		"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
 		stats.UnitDimensionless)
 	runningPRsWaitingOnTaskResolutionCountView *view.View
+
+	runningPRsWaitingOnTaskResolution = stats.Float64("running_pipelineruns_waiting_on_task_resolution",
+		"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
+		stats.UnitDimensionless)
+	runningPRsWaitingOnTaskResolutionView *view.View
 )
 
 const (
@@ -171,33 +191,69 @@ func viewRegister(cfg *config.Metrics) error {
 		Aggregation: view.Count(),
 		TagKeys:     prCountViewTags,
 	}
+	prTotalView = &view.View{
+		Description: prTotal.Description(),
+		Measure:     prTotal,
+		Aggregation: view.Count(),
+		TagKeys:     []tag.Key{statusTag},
+	}
+
 	runningPRsCountView = &view.View{
 		Description: runningPRsCount.Description(),
 		Measure:     runningPRsCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsView = &view.View{
+		Description: runningPRs.Description(),
+		Measure:     runningPRs,
+		Aggregation: view.LastValue(),
+	}
+
 	runningPRsWaitingOnPipelineResolutionCountView = &view.View{
 		Description: runningPRsWaitingOnPipelineResolutionCount.Description(),
 		Measure:     runningPRsWaitingOnPipelineResolutionCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsWaitingOnPipelineResolutionView = &view.View{
+		Description: runningPRsWaitingOnPipelineResolution.Description(),
+		Measure:     runningPRsWaitingOnPipelineResolution,
+		Aggregation: view.LastValue(),
+	}
+
 	runningPRsWaitingOnTaskResolutionCountView = &view.View{
 		Description: runningPRsWaitingOnTaskResolutionCount.Description(),
 		Measure:     runningPRsWaitingOnTaskResolutionCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsWaitingOnTaskResolutionView = &view.View{
+		Description: runningPRsWaitingOnTaskResolution.Description(),
+		Measure:     runningPRsWaitingOnTaskResolution,
+		Aggregation: view.LastValue(),
+	}
 
 	return view.Register(
 		prDurationView,
 		prCountView,
+		prTotalView,
 		runningPRsCountView,
+		runningPRsView,
 		runningPRsWaitingOnPipelineResolutionCountView,
+		runningPRsWaitingOnPipelineResolutionView,
 		runningPRsWaitingOnTaskResolutionCountView,
+		runningPRsWaitingOnTaskResolutionView,
 	)
 }
 
 func viewUnregister() {
-	view.Unregister(prDurationView, prCountView, runningPRsCountView, runningPRsWaitingOnPipelineResolutionCountView, runningPRsWaitingOnTaskResolutionCountView)
+	view.Unregister(prDurationView,
+		prCountView,
+		prTotalView,
+		runningPRsCountView,
+		runningPRsView,
+		runningPRsWaitingOnPipelineResolutionCountView,
+		runningPRsWaitingOnPipelineResolutionView,
+		runningPRsWaitingOnTaskResolutionCountView,
+		runningPRsWaitingOnTaskResolutionView)
 }
 
 // MetricsOnStore returns a function that checks if metrics are configured for a config.Store, and registers it if so
@@ -282,6 +338,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
 
 	metrics.Record(ctx, prDuration.M(duration.Seconds()))
 	metrics.Record(ctx, prCount.M(1))
+	metrics.Record(ctx, prTotal.M(1))
 
 	return nil
 }
@@ -301,13 +358,13 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 		return fmt.Errorf("failed to list pipelineruns while generating metrics : %w", err)
 	}
 
-	var runningPRs int
+	var runningPipelineRuns int
 	var trsWaitResolvingTaskRef int
 	var prsWaitResolvingPipelineRef int
 
 	for _, pr := range prs {
 		if !pr.IsDone() {
-			runningPRs++
+			runningPipelineRuns++
 			succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
 			if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
 				switch succeedCondition.Reason {
@@ -324,9 +381,12 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	if err != nil {
 		return err
 	}
-	metrics.Record(ctx, runningPRsCount.M(float64(runningPRs)))
 	metrics.Record(ctx, runningPRsWaitingOnPipelineResolutionCount.M(float64(prsWaitResolvingPipelineRef)))
+	metrics.Record(ctx, runningPRsWaitingOnPipelineResolution.M(float64(prsWaitResolvingPipelineRef)))
 	metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
+	metrics.Record(ctx, runningPRsWaitingOnTaskResolution.M(float64(trsWaitResolvingTaskRef)))
+	metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
+	metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))
 
 	return nil
 }
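Every addition in this file follows the same OpenCensus shape: a `stats.Float64` measure, a `view.View` that aggregates it (`view.Count()` for the new total, `view.LastValue()` for the running gauges), registration in `viewRegister`/`viewUnregister`, and a `metrics.Record` call where the value is computed. The local variable rename from `runningPRs` to `runningPipelineRuns` keeps the loop counter from colliding with the new package-level `runningPRs` measure. A condensed, self-contained sketch of that shape, using placeholder names (`demoRunning`, `demoRunningView`) rather than the package's real recorder wiring:

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// A measure plus a view, mirroring the running_pipelineruns addition above.
	demoRunning = stats.Float64("running_pipelineruns",
		"Number of pipelineruns executing currently",
		stats.UnitDimensionless)

	demoRunningView = &view.View{
		Description: demoRunning.Description(),
		Measure:     demoRunning,
		Aggregation: view.LastValue(), // gauge-like: only the latest recording is kept
	}
)

func main() {
	if err := view.Register(demoRunningView); err != nil {
		log.Fatal(err)
	}
	defer view.Unregister(demoRunningView)

	// A reconcile-style pass records the current number of running PipelineRuns;
	// with LastValue aggregation, scrapes observe the most recent value.
	stats.Record(context.Background(), demoRunning.M(3))
}
```

`view.LastValue()` makes the exported series behave like a gauge: each scrape reflects the most recent `Record` call, which is why `RunningPipelineRuns` can simply re-record the full count on every pass.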
diff --git a/pkg/pipelinerunmetrics/metrics_test.go b/pkg/pipelinerunmetrics/metrics_test.go
index 6e96fe84006..5a7c757e512 100644
--- a/pkg/pipelinerunmetrics/metrics_test.go
+++ b/pkg/pipelinerunmetrics/metrics_test.go
@@ -404,8 +404,11 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 			}
 			if test.expectedCountTags != nil {
 				metricstest.CheckCountData(t, "pipelinerun_count", test.expectedCountTags, test.expectedCount)
+				delete(test.expectedCountTags, "reason")
+				metricstest.CheckCountData(t, "pipelinerun_total", test.expectedCountTags, test.expectedCount)
 			} else {
 				metricstest.CheckStatsNotReported(t, "pipelinerun_count")
+				metricstest.CheckStatsNotReported(t, "pipelinerun_total")
 			}
 		})
 	}
@@ -451,6 +454,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
 		t.Errorf("RunningPipelineRuns: %v", err)
 	}
 	metricstest.CheckLastValueData(t, "running_pipelineruns_count", map[string]string{}, 1)
+	metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{}, 1)
 }
 
 func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
@@ -532,11 +536,13 @@ func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
 		}
 		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution_count", map[string]string{}, tc.prWaitCount)
 		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution_count", map[string]string{}, tc.trWaitCount)
+		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution", map[string]string{}, tc.prWaitCount)
+		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution", map[string]string{}, tc.trWaitCount)
 	}
 }
 
 func unregisterMetrics() {
-	metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_count")
+	metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "pipelinerun_total", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_pipeline_resolution", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_waiting_on_task_resolution", "running_pipelineruns_count", "running_pipelineruns")
 
 	// Allow the recorder singleton to be recreated.
 	once = sync.Once{}
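One detail worth noting in these tests: `pipelinerun_total` is registered with only the `status` tag, while the deprecated `pipelinerun_count` also carried the optional `reason` tag, so the expected tag map has to drop `reason` before the second `CheckCountData` assertion. A hypothetical helper that captures the same pattern without mutating the shared `expectedCountTags` map (`checkBothCounts` is not part of the Tekton test suite):

```go
package pipelinerunmetrics

import (
	"testing"

	"knative.dev/pkg/metrics/metricstest"
)

// checkBothCounts asserts the deprecated counter with its full tag set, then
// asserts the replacement counter with the "reason" tag removed, since the
// new *_total views are keyed by "status" only.
func checkBothCounts(t *testing.T, deprecated, replacement string, tags map[string]string, want int64) {
	t.Helper()
	metricstest.CheckCountData(t, deprecated, tags, want)

	reduced := make(map[string]string, len(tags))
	for k, v := range tags {
		if k != "reason" {
			reduced[k] = v
		}
	}
	metricstest.CheckCountData(t, replacement, reduced, want)
}
```

Copying the map rather than calling `delete` on it leaves the original expectations intact for any assertions that run later in the same test case.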
diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go
index e5b251b497d..d60b5e567f9 100644
--- a/pkg/taskrunmetrics/metrics.go
+++ b/pkg/taskrunmetrics/metrics.go
@@ -56,9 +56,13 @@ var (
 	trDurationView *view.View
 	prTRDurationView *view.View
 	trCountView *view.View
+	trTotalView *view.View
 	runningTRsCountView *view.View
+	runningTRsView *view.View
 	runningTRsThrottledByQuotaCountView *view.View
 	runningTRsThrottledByNodeCountView *view.View
+	runningTRsThrottledByQuotaView *view.View
+	runningTRsThrottledByNodeView *view.View
 	runningTRsWaitingOnTaskResolutionCountView *view.View
 	podLatencyView *view.View
 
@@ -76,10 +80,18 @@ var (
 		"number of taskruns",
 		stats.UnitDimensionless)
 
+	trTotal = stats.Float64("taskrun_total",
+		"Number of taskruns",
+		stats.UnitDimensionless)
+
 	runningTRsCount = stats.Float64("running_taskruns_count",
 		"Number of taskruns executing currently",
 		stats.UnitDimensionless)
 
+	runningTRs = stats.Float64("running_taskruns",
+		"Number of taskruns executing currently",
+		stats.UnitDimensionless)
+
 	runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
 		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
 		stats.UnitDimensionless)
@@ -92,6 +104,14 @@ var (
 		"Number of taskruns executing currently that are waiting on resolution requests for their task references.",
 		stats.UnitDimensionless)
 
+	runningTRsThrottledByQuota = stats.Float64("running_taskruns_throttled_by_quota",
+		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
+		stats.UnitDimensionless)
+
+	runningTRsThrottledByNode = stats.Float64("running_taskruns_throttled_by_node",
+		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
+		stats.UnitDimensionless)
+
 	podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
 		"scheduling latency for the taskruns pods",
 		stats.UnitMilliseconds)
@@ -215,11 +235,23 @@ func viewRegister(cfg *config.Metrics) error {
 		Aggregation: view.Count(),
 		TagKeys:     trCountViewTags,
 	}
+	trTotalView = &view.View{
+		Description: trTotal.Description(),
+		Measure:     trTotal,
+		Aggregation: view.Count(),
+		TagKeys:     []tag.Key{statusTag},
+	}
 	runningTRsCountView = &view.View{
 		Description: runningTRsCount.Description(),
 		Measure:     runningTRsCount,
 		Aggregation: view.LastValue(),
 	}
+
+	runningTRsView = &view.View{
+		Description: runningTRs.Description(),
+		Measure:     runningTRs,
+		Aggregation: view.LastValue(),
+	}
 	runningTRsThrottledByQuotaCountView = &view.View{
 		Description: runningTRsThrottledByQuotaCount.Description(),
 		Measure:     runningTRsThrottledByQuotaCount,
@@ -235,6 +267,17 @@ func viewRegister(cfg *config.Metrics) error {
 		Measure:     runningTRsWaitingOnTaskResolutionCount,
 		Aggregation: view.LastValue(),
 	}
+
+	runningTRsThrottledByQuotaView = &view.View{
+		Description: runningTRsThrottledByQuota.Description(),
+		Measure:     runningTRsThrottledByQuota,
+		Aggregation: view.LastValue(),
+	}
+	runningTRsThrottledByNodeView = &view.View{
+		Description: runningTRsThrottledByNode.Description(),
+		Measure:     runningTRsThrottledByNode,
+		Aggregation: view.LastValue(),
+	}
 	podLatencyView = &view.View{
 		Description: podLatency.Description(),
 		Measure:     podLatency,
@@ -245,10 +288,14 @@
 		trDurationView,
 		prTRDurationView,
 		trCountView,
+		trTotalView,
 		runningTRsCountView,
+		runningTRsView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
 		runningTRsWaitingOnTaskResolutionCountView,
+		runningTRsThrottledByQuotaView,
+		runningTRsThrottledByNodeView,
 		podLatencyView,
 	)
 }
@@ -258,10 +305,14 @@ func viewUnregister() {
 		trDurationView,
 		prTRDurationView,
 		trCountView,
+		trTotalView,
 		runningTRsCountView,
+		runningTRsView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
 		runningTRsWaitingOnTaskResolutionCountView,
+		runningTRsThrottledByQuotaView,
+		runningTRsThrottledByNodeView,
 		podLatencyView,
 	)
 }
@@ -356,6 +407,7 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC
 
 	metrics.Record(ctx, durationStat.M(duration.Seconds()))
 	metrics.Record(ctx, trCount.M(1))
+	metrics.Record(ctx, trTotal.M(1))
 
 	return nil
 }
@@ -402,9 +454,12 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 		return err
 	}
 	metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
+	metrics.Record(ctx, runningTRs.M(float64(runningTrs)))
 	metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
 	metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
 	metrics.Record(ctx, runningTRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
+	metrics.Record(ctx, runningTRsThrottledByNode.M(float64(trsThrottledByNode)))
+	metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(trsThrottledByQuota)))
 
 	return nil
 }
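`RunningTaskRuns` now records each computed value twice, once under the deprecated `*_count` measure and once under its replacement, so both series stay in lockstep during the deprecation window. A stripped-down sketch of that dual-write pattern for the throttled-by-node pair; the `recordBoth` helper and the shortened descriptions are illustrative, not code from this package:

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// Deprecated name and its replacement, mirroring the change above.
	throttledByNodeCount = stats.Float64("running_taskruns_throttled_by_node_count",
		"Deprecated: use running_taskruns_throttled_by_node", stats.UnitDimensionless)
	throttledByNode = stats.Float64("running_taskruns_throttled_by_node",
		"Number of taskruns throttled by node-level constraints", stats.UnitDimensionless)
)

// recordBoth writes the same value under both names so dashboards built on
// either keep working during the migration window.
func recordBoth(ctx context.Context, v float64) {
	stats.Record(ctx, throttledByNodeCount.M(v), throttledByNode.M(v))
}

func main() {
	err := view.Register(
		&view.View{Measure: throttledByNodeCount, Aggregation: view.LastValue()},
		&view.View{Measure: throttledByNode, Aggregation: view.LastValue()},
	)
	if err != nil {
		log.Fatal(err)
	}
	recordBoth(context.Background(), 2)
}
```

`stats.Record` is variadic, so the second measurement in the sketch rides along on an existing call rather than requiring another pass over the lister.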
diff --git a/pkg/taskrunmetrics/metrics_test.go b/pkg/taskrunmetrics/metrics_test.go
index 437fd032846..c7e581d8962 100644
--- a/pkg/taskrunmetrics/metrics_test.go
+++ b/pkg/taskrunmetrics/metrics_test.go
@@ -415,8 +415,11 @@ func TestRecordTaskRunDurationCount(t *testing.T) {
 			}
 			if c.expectedCountTags != nil {
 				metricstest.CheckCountData(t, "taskrun_count", c.expectedCountTags, c.expectedCount)
+				delete(c.expectedCountTags, "reason")
+				metricstest.CheckCountData(t, "taskrun_total", c.expectedCountTags, c.expectedCount)
 			} else {
 				metricstest.CheckStatsNotReported(t, "taskrun_count")
+				metricstest.CheckStatsNotReported(t, "taskrun_total")
 			}
 			if c.expectedDurationTags != nil {
 				metricstest.CheckLastValueData(t, c.metricName, c.expectedDurationTags, c.expectedDuration)
@@ -680,7 +683,7 @@ func TestTaskRunIsOfPipelinerun(t *testing.T) {
 }
 
 func unregisterMetrics() {
-	metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "running_taskruns_waiting_on_task_resolution_count", "taskruns_pod_latency_milliseconds")
+	metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "running_taskruns_waiting_on_task_resolution_count", "taskruns_pod_latency_milliseconds", "taskrun_total", "running_taskruns", "running_taskruns_throttled_by_quota", "running_taskruns_throttled_by_node", "running_taskruns_waiting_on_task_resolution")
 
 	// Allow the recorder singleton to be recreated.
 	once = sync.Once{}
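The growing `metricstest.Unregister` list reflects the fact that OpenCensus views are process-global: every view a test registers must be unregistered, and the `sync.Once` guard reset, before the next test can rebuild the recorder under the same metric names. A generic sketch of that register/unregister/reset cycle, with stand-in names (`demo`, `register`, `unregister`) rather than the package's actual recorder:

```go
package main

import (
	"sync"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	once     sync.Once
	demo     = stats.Float64("taskrun_total", "Number of taskruns", stats.UnitDimensionless)
	demoView *view.View
)

// register builds and registers the view exactly once, mirroring the
// sync.Once-guarded recorder construction in the packages above.
func register() error {
	var err error
	once.Do(func() {
		demoView = &view.View{Measure: demo, Aggregation: view.Count()}
		err = view.Register(demoView)
	})
	return err
}

// unregister removes the view and resets the guard so a later test can
// register a fresh set of views under the same names.
func unregister() {
	view.Unregister(demoView)
	once = sync.Once{}
}

func main() {
	if err := register(); err != nil {
		panic(err)
	}
	unregister()

	if err := register(); err != nil { // safe to register again after the reset
		panic(err)
	}
}
```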