diff --git a/config/config-observability.yaml b/config/config-observability.yaml
index 1f97697ba8e..6417b405fb7 100644
--- a/config/config-observability.yaml
+++ b/config/config-observability.yaml
@@ -58,3 +58,4 @@ data:
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
+ metrics.count.reason: "false"
diff --git a/docs/metrics.md b/docs/metrics.md
index b0b8ab75e2f..51177ae08c8 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| ---------- | ----------- | ----------- | ----------- |
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name>
`*pipelinerun`=<pipelinerun_name>
`status`=<status>
`namespace`=<pipelinerun-namespace> | experimental |
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name>
`*pipelinerun`=<pipelinerun_name>
`status`=<status>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
`namespace`=<pipelineruns-taskruns-namespace>| experimental |
-| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status>
`*reason`=<reason> | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=<status>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
`namespace`=<pipelineruns-taskruns-namespace> | experimental |
-| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | experimental |
+| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status>
`*reason`=<reason> | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
@@ -37,6 +37,7 @@ A sample config-map has been provided as [config-observability](./../config/conf
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
+ metrics.count.reason: "false"
```
Following values are available in the configmap:
@@ -53,6 +54,7 @@ Following values are available in the configmap:
| metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type gauge or lastvalue |
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
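+| metrics.count.reason | `false` | Sets whether the `reason` label is included on the `tekton_pipelines_controller_pipelinerun_count` and `tekton_pipelines_controller_taskrun_count` metrics. Any value other than `false` enables the label |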
Histogram value isn't available when pipelinerun or taskrun labels are selected. The Lastvalue or Gauge will be provided. Histogram would serve no purpose because it would generate a single bar. TaskRun and PipelineRun level metrics aren't recommended because they lead to an unbounded cardinality which degrades the observability database.
diff --git a/pkg/apis/config/metrics.go b/pkg/apis/config/metrics.go
index 9422ec5eb43..d7f89bbb1fc 100644
--- a/pkg/apis/config/metrics.go
+++ b/pkg/apis/config/metrics.go
@@ -36,6 +36,9 @@ const (
// metrics to use for aggregating duration for pipelinerun
metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type"
+ // countWithReasonKey is the configmap key that controls whether the reason label is included on count metrics
+ countWithReasonKey = "metrics.count.reason"
+
// DefaultTaskrunLevel determines to what level to aggregate metrics
// when it isn't specified in configmap
DefaultTaskrunLevel = TaskrunLevelAtTask
@@ -92,6 +95,7 @@ type Metrics struct {
PipelinerunLevel string
DurationTaskrunType string
DurationPipelinerunType string
+ CountWithReason bool
}
// GetMetricsConfigName returns the name of the configmap containing all
@@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool {
return other.TaskrunLevel == cfg.TaskrunLevel &&
other.PipelinerunLevel == cfg.PipelinerunLevel &&
other.DurationTaskrunType == cfg.DurationTaskrunType &&
- other.DurationPipelinerunType == cfg.DurationPipelinerunType
+ other.DurationPipelinerunType == cfg.DurationPipelinerunType &&
+ other.CountWithReason == cfg.CountWithReason
}
// newMetricsFromMap returns a Config given a map corresponding to a ConfigMap
@@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
PipelinerunLevel: DefaultPipelinerunLevel,
DurationTaskrunType: DefaultDurationTaskrunType,
DurationPipelinerunType: DefaultDurationPipelinerunType,
+ CountWithReason: false,
}
if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok {
@@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok {
tc.DurationPipelinerunType = durationPipelinerun
}
+
+ if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" {
+ tc.CountWithReason = true
+ }
+
return &tc, nil
}
diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go
index f89a5b46e06..3768598a5a3 100644
--- a/pkg/pipelinerunmetrics/metrics.go
+++ b/pkg/pipelinerunmetrics/metrics.go
@@ -43,6 +43,7 @@ var (
pipelineTag = tag.MustNewKey("pipeline")
namespaceTag = tag.MustNewKey("namespace")
statusTag = tag.MustNewKey("status")
+ reasonTag = tag.MustNewKey("reason")
prDuration = stats.Float64(
"pipelinerun_duration_seconds",
@@ -149,11 +150,15 @@ func viewRegister(cfg *config.Metrics) error {
TagKeys: append([]tag.Key{statusTag, namespaceTag}, prunTag...),
}
+ prCountViewTags := []tag.Key{statusTag}
+ if cfg.CountWithReason {
+ prCountViewTags = append(prCountViewTags, reasonTag)
+ }
prCountView = &view.View{
Description: prCount.Description(),
Measure: prCount,
Aggregation: view.Count(),
- TagKeys: []tag.Key{statusTag},
+ TagKeys: prCountViewTags,
}
runningPRsCountView = &view.View{
Description: runningPRsCount.Description(),
@@ -230,13 +235,15 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
}
}
+ cond := pr.Status.GetCondition(apis.ConditionSucceeded)
status := "success"
- if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
+ if cond.Status == corev1.ConditionFalse {
status = "failed"
if cond.Reason == ReasonCancelled {
status = "cancelled"
}
}
+ reason := cond.Reason
pipelineName := "anonymous"
if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" {
@@ -245,7 +252,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
ctx, err := tag.New(
context.Background(),
append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace),
- tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...)
+ tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...)
if err != nil {
return err
}
diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go
index 473b8f4c604..d42e6db8c48 100644
--- a/pkg/taskrunmetrics/metrics.go
+++ b/pkg/taskrunmetrics/metrics.go
@@ -50,6 +50,7 @@ var (
taskTag = tag.MustNewKey("task")
namespaceTag = tag.MustNewKey("namespace")
statusTag = tag.MustNewKey("status")
+ reasonTag = tag.MustNewKey("reason")
podTag = tag.MustNewKey("pod")
trDurationView *view.View
@@ -198,11 +199,16 @@ func viewRegister(cfg *config.Metrics) error {
Aggregation: distribution,
TagKeys: append([]tag.Key{statusTag, namespaceTag}, append(trunTag, prunTag...)...),
}
+
+ trCountViewTags := []tag.Key{statusTag}
+ if cfg.CountWithReason {
+ trCountViewTags = append(trCountViewTags, reasonTag)
+ }
trCountView = &view.View{
Description: trCount.Description(),
Measure: trCount,
Aggregation: view.Count(),
- TagKeys: []tag.Key{statusTag},
+ TagKeys: trCountViewTags,
}
runningTRsCountView = &view.View{
Description: runningTRsCount.Description(),
@@ -316,13 +322,15 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC
taskName = tr.Spec.TaskRef.Name
}
+ cond := tr.Status.GetCondition(apis.ConditionSucceeded)
status := "success"
- if cond := tr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
+ if cond.Status == corev1.ConditionFalse {
status = "failed"
}
+ reason := cond.Reason
durationStat := trDuration
- tags := []tag.Mutator{tag.Insert(namespaceTag, tr.Namespace), tag.Insert(statusTag, status)}
+ tags := []tag.Mutator{tag.Insert(namespaceTag, tr.Namespace), tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}
if ok, pipeline, pipelinerun := IsPartOfPipeline(tr); ok {
durationStat = prTRDuration
tags = append(tags, r.insertPipelineTag(pipeline, pipelinerun)...)