Skip to content

Commit

Permalink
feat: add ability to include reason in count metrics
Browse files Browse the repository at this point in the history
Adds a configuration flag to enable including the `reason` for a TaskRun or PipelineRun status on their count metrics. This allows for more fine-grained monitoring and alerting of run failures.

Signed-off-by: Marcus Noble <[email protected]>
  • Loading branch information
AverageMarcus committed Aug 22, 2023
1 parent 445734d commit d490073
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 9 deletions.
1 change: 1 addition & 0 deletions config/config-observability.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ data:
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.reason: "false"
6 changes: 4 additions & 2 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| ---------- | ----------- | ----------- | ----------- |
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
Expand All @@ -37,6 +37,7 @@ A sample config-map has been provided as [config-observability](./../config/conf
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.reason: "false"
```
The following values are available in the configmap:
Expand All @@ -53,6 +54,7 @@ Following values are available in the configmap:
| metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` are of type gauge or lastvalue |
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
| metrics.count.reason | `false` | The `reason` label is not included on count metrics (default); any value other than `false` enables it |

Histogram values aren't available when the pipelinerun or taskrun level is selected; a LastValue (gauge) is provided instead, because a histogram would contain only a single bar and serve no purpose. TaskRun- and PipelineRun-level metrics aren't recommended because they lead to unbounded cardinality, which degrades the observability database.

Expand Down
13 changes: 12 additions & 1 deletion pkg/apis/config/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ const (
// metrics to use for aggregating duration for pipelinerun
metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type"

// countWithReasonKey is the configmap key that controls whether the reason label is included on count metrics
countWithReasonKey = "metrics.count.reason"

// DefaultTaskrunLevel determines to what level to aggregate metrics
// when it isn't specified in configmap
DefaultTaskrunLevel = TaskrunLevelAtTask
Expand Down Expand Up @@ -92,6 +95,7 @@ type Metrics struct {
PipelinerunLevel string
DurationTaskrunType string
DurationPipelinerunType string
CountWithReason bool
}

// GetMetricsConfigName returns the name of the configmap containing all
Expand All @@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool {
return other.TaskrunLevel == cfg.TaskrunLevel &&
other.PipelinerunLevel == cfg.PipelinerunLevel &&
other.DurationTaskrunType == cfg.DurationTaskrunType &&
other.DurationPipelinerunType == cfg.DurationPipelinerunType
other.DurationPipelinerunType == cfg.DurationPipelinerunType &&
other.CountWithReason == cfg.CountWithReason
}

// newMetricsFromMap returns a Config given a map corresponding to a ConfigMap
Expand All @@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
PipelinerunLevel: DefaultPipelinerunLevel,
DurationTaskrunType: DefaultDurationTaskrunType,
DurationPipelinerunType: DefaultDurationPipelinerunType,
CountWithReason: false,
}

if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok {
Expand All @@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok {
tc.DurationPipelinerunType = durationPipelinerun
}

if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" {
tc.CountWithReason = true
}

return &tc, nil
}

Expand Down
13 changes: 10 additions & 3 deletions pkg/pipelinerunmetrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ var (
pipelineTag = tag.MustNewKey("pipeline")
namespaceTag = tag.MustNewKey("namespace")
statusTag = tag.MustNewKey("status")
reasonTag = tag.MustNewKey("reason")

prDuration = stats.Float64(
"pipelinerun_duration_seconds",
Expand Down Expand Up @@ -149,11 +150,15 @@ func viewRegister(cfg *config.Metrics) error {
TagKeys: append([]tag.Key{statusTag, namespaceTag}, prunTag...),
}

prCountViewTags := []tag.Key{statusTag}
if cfg.CountWithReason {
prCountViewTags = append(prCountViewTags, reasonTag)
}
prCountView = &view.View{
Description: prCount.Description(),
Measure: prCount,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
TagKeys: prCountViewTags,
}
runningPRsCountView = &view.View{
Description: runningPRsCount.Description(),
Expand Down Expand Up @@ -230,13 +235,15 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
}
}

cond := pr.Status.GetCondition(apis.ConditionSucceeded)
status := "success"
if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
if cond.Status == corev1.ConditionFalse {
status = "failed"
if cond.Reason == ReasonCancelled {
status = "cancelled"
}
}
reason := cond.Reason

pipelineName := "anonymous"
if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" {
Expand All @@ -245,7 +252,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
ctx, err := tag.New(
context.Background(),
append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace),
tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...)
tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...)
if err != nil {
return err
}
Expand Down
14 changes: 11 additions & 3 deletions pkg/taskrunmetrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ var (
taskTag = tag.MustNewKey("task")
namespaceTag = tag.MustNewKey("namespace")
statusTag = tag.MustNewKey("status")
reasonTag = tag.MustNewKey("reason")
podTag = tag.MustNewKey("pod")

trDurationView *view.View
Expand Down Expand Up @@ -198,11 +199,16 @@ func viewRegister(cfg *config.Metrics) error {
Aggregation: distribution,
TagKeys: append([]tag.Key{statusTag, namespaceTag}, append(trunTag, prunTag...)...),
}

trCountViewTags := []tag.Key{statusTag}
if cfg.CountWithReason {
trCountViewTags = append(trCountViewTags, reasonTag)
}
trCountView = &view.View{
Description: trCount.Description(),
Measure: trCount,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
TagKeys: trCountViewTags,
}
runningTRsCountView = &view.View{
Description: runningTRsCount.Description(),
Expand Down Expand Up @@ -316,13 +322,15 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC
taskName = tr.Spec.TaskRef.Name
}

cond := tr.Status.GetCondition(apis.ConditionSucceeded)
status := "success"
if cond := tr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
if cond.Status == corev1.ConditionFalse {
status = "failed"
}
reason := cond.Reason

durationStat := trDuration
tags := []tag.Mutator{tag.Insert(namespaceTag, tr.Namespace), tag.Insert(statusTag, status)}
tags := []tag.Mutator{tag.Insert(namespaceTag, tr.Namespace), tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}
if ok, pipeline, pipelinerun := IsPartOfPipeline(tr); ok {
durationStat = prTRDuration
tags = append(tags, r.insertPipelineTag(pipeline, pipelinerun)...)
Expand Down

0 comments on commit d490073

Please sign in to comment.