Fix the naming for Metrics as per convention
Most of the metrics don't follow the Prometheus naming convention. This change fixes that in a backward-compatible way: we introduce new metrics with compliant names alongside the existing ones.
Gauge metrics: a gauge shouldn't end in `count`, because that suffix implies a counter.
Counter metrics: a counter shouldn't end in `count` either, because that suffix implies the `_count` series of a histogram. Counters should use the `total` suffix instead.
https://prometheus.io/docs/practices/naming/
https://www.robustperception.io/on-the-naming-of-things/
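
The pattern used throughout this change is to keep the old measure and add a compliantly named one beside it, then register and record both, so existing dashboards keep working while new ones can adopt the compliant names. Here is a minimal, self-contained sketch of that pattern, assuming the OpenCensus API is used directly (the controller itself records through Knative's metrics helpers); the metric names mirror the diff below, while the package and function names (`metricsexample`, `registerViews`, `record`) are purely illustrative.

```go
package metricsexample

import (
	"context"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

var (
	// Deprecated name: a gauge ending in "_count" reads like a counter.
	runningPRsCount = stats.Float64("running_pipelineruns_count",
		"Number of pipelineruns executing currently",
		stats.UnitDimensionless)

	// Compliant name: a plain gauge name without the "_count" suffix.
	runningPRs = stats.Float64("running_pipelineruns",
		"Number of pipelineruns executing currently",
		stats.UnitDimensionless)
)

// registerViews registers a LastValue (gauge) view for both the deprecated
// and the compliant measure, so both series are exported.
func registerViews() error {
	return view.Register(
		&view.View{Description: runningPRsCount.Description(), Measure: runningPRsCount, Aggregation: view.LastValue()},
		&view.View{Description: runningPRs.Description(), Measure: runningPRs, Aggregation: view.LastValue()},
	)
}

// record writes the same value to the deprecated and the compliant metric.
func record(ctx context.Context, running int) {
	stats.Record(ctx, runningPRsCount.M(float64(running)), runningPRs.M(float64(running)))
}
```

Exporting both series for a while gives dashboards and alerts time to migrate from the deprecated `_count` names to the compliant ones.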
khrm authored and tekton-robot committed Apr 3, 2024
1 parent b7abe37 commit 4203fbd
Showing 5 changed files with 144 additions and 18 deletions.
26 changes: 14 additions & 12 deletions docs/metrics.md
@@ -15,18 +15,20 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
|-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_waiting_on_task_resolution_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_pipelineruns_waiting_on_pipeline_resolution_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_pipelineruns_waiting_on_task_resolution_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`= &lt; taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | deprecated |
| `tekton_pipelines_controller_pipelinerun_total` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_pipelineruns` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | deprecated |
| `tekton_pipelines_controller_taskrun_total` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_taskruns` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | deprecated |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node` | Gauge | | experimental |
| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |

Labels/Tags marked with "*" are optional. There is also a choice between Histogram and LastValue (Gauge) aggregation for the pipelinerun and taskrun duration metrics.
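
To make that choice concrete, here is a small sketch of how a duration measure can be exported with either aggregation; the `useLastValue` flag, the bucket boundaries, and the measure description are assumptions for illustration, not the controller's actual configuration plumbing (in practice the choice comes from the metrics configuration passed to `viewRegister`).

```go
package metricsexample

import (
	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

// prDuration is an illustrative duration measure; the exported name picks up
// the tekton_pipelines_controller_ prefix at export time.
var prDuration = stats.Float64("pipelinerun_duration_seconds",
	"The pipelinerun execution time in seconds",
	stats.UnitDimensionless)

// durationView builds the duration view with either a histogram or a
// last-value (gauge) aggregation, matching the choice described above.
func durationView(useLastValue bool) *view.View {
	agg := view.Distribution(10, 30, 60, 300, 900, 1800, 3600) // illustrative bucket bounds
	if useLastValue {
		agg = view.LastValue()
	}
	return &view.View{
		Description: prDuration.Description(),
		Measure:     prDuration,
		Aggregation: agg,
	}
}
```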

68 changes: 64 additions & 4 deletions pkg/pipelinerunmetrics/metrics.go
@@ -56,20 +56,40 @@ var (
stats.UnitDimensionless)
prCountView *view.View

prTotal = stats.Float64("pipelinerun_total",
"Number of pipelineruns",
stats.UnitDimensionless)
prTotalView *view.View

runningPRsCount = stats.Float64("running_pipelineruns_count",
"Number of pipelineruns executing currently",
stats.UnitDimensionless)
runningPRsCountView *view.View

runningPRs = stats.Float64("running_pipelineruns",
"Number of pipelineruns executing currently",
stats.UnitDimensionless)
runningPRsView *view.View

runningPRsWaitingOnPipelineResolutionCount = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution_count",
"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
stats.UnitDimensionless)
runningPRsWaitingOnPipelineResolutionCountView *view.View

runningPRsWaitingOnPipelineResolution = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution",
"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
stats.UnitDimensionless)
runningPRsWaitingOnPipelineResolutionView *view.View

runningPRsWaitingOnTaskResolutionCount = stats.Float64("running_pipelineruns_waiting_on_task_resolution_count",
"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
stats.UnitDimensionless)
runningPRsWaitingOnTaskResolutionCountView *view.View

runningPRsWaitingOnTaskResolution = stats.Float64("running_pipelineruns_waiting_on_task_resolution",
"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
stats.UnitDimensionless)
runningPRsWaitingOnTaskResolutionView *view.View
)

const (
@@ -171,33 +191,69 @@ func viewRegister(cfg *config.Metrics) error {
Aggregation: view.Count(),
TagKeys: prCountViewTags,
}
prTotalView = &view.View{
Description: prTotal.Description(),
Measure: prTotal,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
}

runningPRsCountView = &view.View{
Description: runningPRsCount.Description(),
Measure: runningPRsCount,
Aggregation: view.LastValue(),
}
runningPRsView = &view.View{
Description: runningPRs.Description(),
Measure: runningPRs,
Aggregation: view.LastValue(),
}

runningPRsWaitingOnPipelineResolutionCountView = &view.View{
Description: runningPRsWaitingOnPipelineResolutionCount.Description(),
Measure: runningPRsWaitingOnPipelineResolutionCount,
Aggregation: view.LastValue(),
}
runningPRsWaitingOnPipelineResolutionView = &view.View{
Description: runningPRsWaitingOnPipelineResolution.Description(),
Measure: runningPRsWaitingOnPipelineResolution,
Aggregation: view.LastValue(),
}

runningPRsWaitingOnTaskResolutionCountView = &view.View{
Description: runningPRsWaitingOnTaskResolutionCount.Description(),
Measure: runningPRsWaitingOnTaskResolutionCount,
Aggregation: view.LastValue(),
}
runningPRsWaitingOnTaskResolutionView = &view.View{
Description: runningPRsWaitingOnTaskResolution.Description(),
Measure: runningPRsWaitingOnTaskResolution,
Aggregation: view.LastValue(),
}

return view.Register(
prDurationView,
prCountView,
prTotalView,
runningPRsCountView,
runningPRsView,
runningPRsWaitingOnPipelineResolutionCountView,
runningPRsWaitingOnPipelineResolutionView,
runningPRsWaitingOnTaskResolutionCountView,
runningPRsWaitingOnTaskResolutionView,
)
}

func viewUnregister() {
view.Unregister(prDurationView, prCountView, runningPRsCountView, runningPRsWaitingOnPipelineResolutionCountView, runningPRsWaitingOnTaskResolutionCountView)
view.Unregister(prDurationView,
prCountView,
prTotalView,
runningPRsCountView,
runningPRsView,
runningPRsWaitingOnPipelineResolutionCountView,
runningPRsWaitingOnPipelineResolutionView,
runningPRsWaitingOnTaskResolutionCountView,
runningPRsWaitingOnTaskResolutionView)
}

// MetricsOnStore returns a function that checks if metrics are configured for a config.Store, and registers it if so
@@ -282,6 +338,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co

metrics.Record(ctx, prDuration.M(duration.Seconds()))
metrics.Record(ctx, prCount.M(1))
metrics.Record(ctx, prTotal.M(1))

return nil
}
@@ -301,13 +358,13 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
return fmt.Errorf("failed to list pipelineruns while generating metrics : %w", err)
}

var runningPRs int
var runningPipelineRuns int
var trsWaitResolvingTaskRef int
var prsWaitResolvingPipelineRef int

for _, pr := range prs {
if !pr.IsDone() {
runningPRs++
runningPipelineRuns++
succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
switch succeedCondition.Reason {
Expand All @@ -324,9 +381,12 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
if err != nil {
return err
}
metrics.Record(ctx, runningPRsCount.M(float64(runningPRs)))
metrics.Record(ctx, runningPRsWaitingOnPipelineResolutionCount.M(float64(prsWaitResolvingPipelineRef)))
metrics.Record(ctx, runningPRsWaitingOnPipelineResolution.M(float64(prsWaitResolvingPipelineRef)))
metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
metrics.Record(ctx, runningPRsWaitingOnTaskResolution.M(float64(trsWaitResolvingTaskRef)))
metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))

return nil
}
8 changes: 7 additions & 1 deletion pkg/pipelinerunmetrics/metrics_test.go
@@ -404,8 +404,11 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
}
if test.expectedCountTags != nil {
metricstest.CheckCountData(t, "pipelinerun_count", test.expectedCountTags, test.expectedCount)
delete(test.expectedCountTags, "reason")
metricstest.CheckCountData(t, "pipelinerun_total", test.expectedCountTags, test.expectedCount)
} else {
metricstest.CheckStatsNotReported(t, "pipelinerun_count")
metricstest.CheckStatsNotReported(t, "pipelinerun_total")
}
})
}
@@ -451,6 +454,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
t.Errorf("RunningPipelineRuns: %v", err)
}
metricstest.CheckLastValueData(t, "running_pipelineruns_count", map[string]string{}, 1)
metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{}, 1)
}

func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
@@ -532,11 +536,13 @@ func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
}
metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution_count", map[string]string{}, tc.prWaitCount)
metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution_count", map[string]string{}, tc.trWaitCount)
metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution", map[string]string{}, tc.prWaitCount)
metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution", map[string]string{}, tc.trWaitCount)
}
}

func unregisterMetrics() {
metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_count")
metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "pipelinerun_total", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_pipeline_resolution", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_waiting_on_task_resolution", "running_pipelineruns_count", "running_pipelineruns")

// Allow the recorder singleton to be recreated.
once = sync.Once{}
55 changes: 55 additions & 0 deletions pkg/taskrunmetrics/metrics.go
@@ -56,9 +56,13 @@ var (
trDurationView *view.View
prTRDurationView *view.View
trCountView *view.View
trTotalView *view.View
runningTRsCountView *view.View
runningTRsView *view.View
runningTRsThrottledByQuotaCountView *view.View
runningTRsThrottledByNodeCountView *view.View
runningTRsThrottledByQuotaView *view.View
runningTRsThrottledByNodeView *view.View
runningTRsWaitingOnTaskResolutionCountView *view.View
podLatencyView *view.View

@@ -76,10 +80,18 @@ var (
"number of taskruns",
stats.UnitDimensionless)

trTotal = stats.Float64("taskrun_total",
"Number of taskruns",
stats.UnitDimensionless)

runningTRsCount = stats.Float64("running_taskruns_count",
"Number of taskruns executing currently",
stats.UnitDimensionless)

runningTRs = stats.Float64("running_taskruns",
"Number of taskruns executing currently",
stats.UnitDimensionless)

runningTRsThrottledByQuotaCount = stats.Float64("running_taskruns_throttled_by_quota_count",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)
@@ -92,6 +104,14 @@ var (
"Number of taskruns executing currently that are waiting on resolution requests for their task references.",
stats.UnitDimensionless)

runningTRsThrottledByQuota = stats.Float64("running_taskruns_throttled_by_quota",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of defined ResourceQuotas. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

runningTRsThrottledByNode = stats.Float64("running_taskruns_throttled_by_node",
"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
stats.UnitDimensionless)

podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
"scheduling latency for the taskruns pods",
stats.UnitMilliseconds)
@@ -215,11 +235,23 @@ func viewRegister(cfg *config.Metrics) error {
Aggregation: view.Count(),
TagKeys: trCountViewTags,
}
trTotalView = &view.View{
Description: trTotal.Description(),
Measure: trTotal,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
}
runningTRsCountView = &view.View{
Description: runningTRsCount.Description(),
Measure: runningTRsCount,
Aggregation: view.LastValue(),
}

runningTRsView = &view.View{
Description: runningTRs.Description(),
Measure: runningTRs,
Aggregation: view.LastValue(),
}
runningTRsThrottledByQuotaCountView = &view.View{
Description: runningTRsThrottledByQuotaCount.Description(),
Measure: runningTRsThrottledByQuotaCount,
@@ -235,6 +267,17 @@ func viewRegister(cfg *config.Metrics) error {
Measure: runningTRsWaitingOnTaskResolutionCount,
Aggregation: view.LastValue(),
}

runningTRsThrottledByQuotaView = &view.View{
Description: runningTRsThrottledByQuota.Description(),
Measure: runningTRsThrottledByQuota,
Aggregation: view.LastValue(),
}
runningTRsThrottledByNodeView = &view.View{
Description: runningTRsThrottledByNode.Description(),
Measure: runningTRsThrottledByNode,
Aggregation: view.LastValue(),
}
podLatencyView = &view.View{
Description: podLatency.Description(),
Measure: podLatency,
@@ -245,10 +288,14 @@ func viewRegister(cfg *config.Metrics) error {
trDurationView,
prTRDurationView,
trCountView,
trTotalView,
runningTRsCountView,
runningTRsView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
runningTRsWaitingOnTaskResolutionCountView,
runningTRsThrottledByQuotaView,
runningTRsThrottledByNodeView,
podLatencyView,
)
}
@@ -258,10 +305,14 @@ func viewUnregister() {
trDurationView,
prTRDurationView,
trCountView,
trTotalView,
runningTRsCountView,
runningTRsView,
runningTRsThrottledByQuotaCountView,
runningTRsThrottledByNodeCountView,
runningTRsWaitingOnTaskResolutionCountView,
runningTRsThrottledByQuotaView,
runningTRsThrottledByNodeView,
podLatencyView,
)
}
@@ -356,6 +407,7 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC

metrics.Record(ctx, durationStat.M(duration.Seconds()))
metrics.Record(ctx, trCount.M(1))
metrics.Record(ctx, trTotal.M(1))

return nil
}
@@ -402,9 +454,12 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
return err
}
metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
metrics.Record(ctx, runningTRs.M(float64(runningTrs)))
metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
metrics.Record(ctx, runningTRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
metrics.Record(ctx, runningTRsThrottledByNode.M(float64(trsThrottledByNode)))
metrics.Record(ctx, runningTRsThrottledByQuota.M(float64(trsThrottledByQuota)))

return nil
}
5 changes: 4 additions & 1 deletion pkg/taskrunmetrics/metrics_test.go
@@ -415,8 +415,11 @@ func TestRecordTaskRunDurationCount(t *testing.T) {
}
if c.expectedCountTags != nil {
metricstest.CheckCountData(t, "taskrun_count", c.expectedCountTags, c.expectedCount)
delete(c.expectedCountTags, "reason")
metricstest.CheckCountData(t, "taskrun_total", c.expectedCountTags, c.expectedCount)
} else {
metricstest.CheckStatsNotReported(t, "taskrun_count")
metricstest.CheckStatsNotReported(t, "taskrun_total")
}
if c.expectedDurationTags != nil {
metricstest.CheckLastValueData(t, c.metricName, c.expectedDurationTags, c.expectedDuration)
@@ -680,7 +683,7 @@ func TestTaskRunIsOfPipelinerun(t *testing.T) {
}

func unregisterMetrics() {
metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "running_taskruns_waiting_on_task_resolution_count", "taskruns_pod_latency_milliseconds")
metricstest.Unregister("taskrun_duration_seconds", "pipelinerun_taskrun_duration_seconds", "taskrun_count", "running_taskruns_count", "running_taskruns_throttled_by_quota_count", "running_taskruns_throttled_by_node_count", "running_taskruns_waiting_on_task_resolution_count", "taskruns_pod_latency_milliseconds", "taskrun_total", "running_taskruns", "running_taskruns_throttled_by_quota", "running_taskruns_throttled_by_node", "running_taskruns_waiting_on_task_resolution")

// Allow the recorder singleton to be recreated.
once = sync.Once{}
