Skip to content

Commit

Permalink
feat: add ability to include reason in count metrics
Browse files Browse the repository at this point in the history
Adds a configuration flag to enable including the `reason` for a TaskRun or PipelineRun status on their count metrics. This allows for more fine-grained monitoring and alerting of run failures.

Signed-off-by: Marcus Noble <[email protected]>
  • Loading branch information
AverageMarcus committed Aug 22, 2023
1 parent 445734d commit 17a7fe3
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 19 deletions.
1 change: 1 addition & 0 deletions config/config-observability.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ data:
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.reason: "false"
6 changes: 4 additions & 2 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
| ---------- | ----------- | ----------- | ----------- |
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
Expand All @@ -37,6 +37,7 @@ A sample config-map has been provided as [config-observability](./../config/conf
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.reason: "false"
```
Following values are available in the configmap:
Expand All @@ -53,6 +54,7 @@ Following values are available in the configmap:
| metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` are of type gauge or lastvalue |
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
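| metrics.count.reason | `false` | Controls whether the `reason` label is included on count metrics (`tekton_pipelines_controller_pipelinerun_count` and `tekton_pipelines_controller_taskrun_count`). With `false` (the default) the label is omitted; set it to `true` to include it |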

Histogram values aren't available when pipelinerun or taskrun labels are selected; a LastValue (gauge) is provided instead. A histogram would serve no purpose at that level because it would contain only a single bar. TaskRun- and PipelineRun-level metrics aren't recommended because they lead to unbounded cardinality, which degrades the observability database.

Expand Down
13 changes: 12 additions & 1 deletion pkg/apis/config/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ const (
// metrics to use for aggregating duration for pipelinerun
metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type"

// countWithReasonKey is the configmap key that controls whether the reason label is included on count metrics
countWithReasonKey = "metrics.count.reason"

// DefaultTaskrunLevel determines to what level to aggregate metrics
// when it isn't specified in configmap
DefaultTaskrunLevel = TaskrunLevelAtTask
Expand Down Expand Up @@ -92,6 +95,7 @@ type Metrics struct {
PipelinerunLevel string
DurationTaskrunType string
DurationPipelinerunType string
CountWithReason bool
}

// GetMetricsConfigName returns the name of the configmap containing all
Expand All @@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool {
return other.TaskrunLevel == cfg.TaskrunLevel &&
other.PipelinerunLevel == cfg.PipelinerunLevel &&
other.DurationTaskrunType == cfg.DurationTaskrunType &&
other.DurationPipelinerunType == cfg.DurationPipelinerunType
other.DurationPipelinerunType == cfg.DurationPipelinerunType &&
other.CountWithReason == cfg.CountWithReason
}

// newMetricsFromMap returns a Config given a map corresponding to a ConfigMap
Expand All @@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
PipelinerunLevel: DefaultPipelinerunLevel,
DurationTaskrunType: DefaultDurationTaskrunType,
DurationPipelinerunType: DefaultDurationPipelinerunType,
CountWithReason: false,
}

if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok {
Expand All @@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok {
tc.DurationPipelinerunType = durationPipelinerun
}

if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" {
tc.CountWithReason = true
}

return &tc, nil
}

Expand Down
13 changes: 13 additions & 0 deletions pkg/apis/config/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
CountWithReason: false,
},
fileName: config.GetMetricsConfigName(),
},
Expand All @@ -47,9 +48,20 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
PipelinerunLevel: config.PipelinerunLevelAtNS,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: false,
},
fileName: "config-observability-namespacelevel",
},
{
expectedConfig: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtNS,
PipelinerunLevel: config.PipelinerunLevelAtNS,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: true,
},
fileName: "config-observability-reason",
},
}

for _, tc := range testCases {
Expand All @@ -64,6 +76,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) {
PipelinerunLevel: config.PipelinerunLevelAtPipeline,
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
CountWithReason: false,
}
verifyConfigFileWithExpectedMetricsConfig(t, MetricsConfigEmptyName, expectedConfig)
}
Expand Down
31 changes: 31 additions & 0 deletions pkg/apis/config/testdata/config-observability-reason.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2019 The Tekton Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test fixture: an observability ConfigMap that uses namespace-level
# metrics and turns on the `reason` label for count metrics.
apiVersion: v1
kind: ConfigMap
metadata:
name: config-observability
namespace: tekton-pipelines
labels:
app.kubernetes.io/instance: default
app.kubernetes.io/part-of: tekton-pipelines
data:
metrics.backend-destination: prometheus
metrics.stackdriver-project-id: "<your stackdriver project id>"
metrics.allow-stackdriver-custom-metrics: "false"
metrics.taskrun.level: "namespace"
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "namespace"
metrics.pipelinerun.duration-type: "lastvalue"
# Include the `reason` label on count metrics (the shipped default is "false").
metrics.count.reason: "true"
13 changes: 10 additions & 3 deletions pkg/pipelinerunmetrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ var (
pipelineTag = tag.MustNewKey("pipeline")
namespaceTag = tag.MustNewKey("namespace")
statusTag = tag.MustNewKey("status")
reasonTag = tag.MustNewKey("reason")

prDuration = stats.Float64(
"pipelinerun_duration_seconds",
Expand Down Expand Up @@ -149,11 +150,15 @@ func viewRegister(cfg *config.Metrics) error {
TagKeys: append([]tag.Key{statusTag, namespaceTag}, prunTag...),
}

prCountViewTags := []tag.Key{statusTag}
if cfg.CountWithReason {
prCountViewTags = append(prCountViewTags, reasonTag)
}
prCountView = &view.View{
Description: prCount.Description(),
Measure: prCount,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
TagKeys: prCountViewTags,
}
runningPRsCountView = &view.View{
Description: runningPRsCount.Description(),
Expand Down Expand Up @@ -230,13 +235,15 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
}
}

cond := pr.Status.GetCondition(apis.ConditionSucceeded)
status := "success"
if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
if cond.Status == corev1.ConditionFalse {
status = "failed"
if cond.Reason == ReasonCancelled {
status = "cancelled"
}
}
reason := cond.Reason

pipelineName := "anonymous"
if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" {
Expand All @@ -245,7 +252,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
ctx, err := tag.New(
context.Background(),
append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace),
tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...)
tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...)
if err != nil {
return err
}
Expand Down
86 changes: 82 additions & 4 deletions pkg/pipelinerunmetrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,15 @@ var (
completionTime = metav1.NewTime(startTime.Time.Add(time.Minute))
)

func getConfigContext() context.Context {
func getConfigContext(countWithReason bool) context.Context {
ctx := context.Background()
cfg := &config.Config{
Metrics: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtTaskrun,
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
DurationTaskrunType: config.DefaultDurationTaskrunType,
DurationPipelinerunType: config.DefaultDurationPipelinerunType,
CountWithReason: countWithReason,
},
}
return config.ToContext(ctx, cfg)
Expand All @@ -71,7 +72,7 @@ func TestMetricsOnStore(t *testing.T) {
defer log.Sync()
logger := log.Sugar()

ctx := getConfigContext()
ctx := getConfigContext(false)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down Expand Up @@ -117,6 +118,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration float64
expectedCount int64
beforeCondition *apis.Condition
countWithReason bool
}{{
name: "for succeeded pipeline",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -149,6 +151,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for succeeded pipeline different condition",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -184,6 +187,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
Type: apis.ConditionReady,
Status: corev1.ConditionUnknown,
},
countWithReason: false,
}, {
name: "for succeeded pipeline recount",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -212,6 +216,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
Type: apis.ConditionSucceeded,
Status: corev1.ConditionTrue,
},
countWithReason: false,
}, {
name: "for cancelled pipeline",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -245,6 +250,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for failed pipeline",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -277,6 +283,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for pipeline without start or completion time",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -306,11 +313,82 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 0,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for failed pipeline with reason",
pipelineRun: &v1.PipelineRun{
ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
Spec: v1.PipelineRunSpec{
PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
},
Status: v1.PipelineRunStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
Reason: "Failed",
}},
},
PipelineRunStatusFields: v1.PipelineRunStatusFields{
StartTime: &startTime,
CompletionTime: &completionTime,
},
},
},
expectedDurationTags: map[string]string{
"pipeline": "pipeline-1",
"pipelinerun": "pipelinerun-1",
"namespace": "ns",
"status": "failed",
},
expectedCountTags: map[string]string{
"status": "failed",
"reason": "Failed",
},
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: true,
}, {
name: "for cancelled pipeline with reason",
pipelineRun: &v1.PipelineRun{
ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
Spec: v1.PipelineRunSpec{
PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
},
Status: v1.PipelineRunStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
Reason: ReasonCancelled,
}},
},
PipelineRunStatusFields: v1.PipelineRunStatusFields{
StartTime: &startTime,
CompletionTime: &completionTime,
},
},
},
expectedDurationTags: map[string]string{
"pipeline": "pipeline-1",
"pipelinerun": "pipelinerun-1",
"namespace": "ns",
"status": "cancelled",
},
expectedCountTags: map[string]string{
"status": "cancelled",
"reason": ReasonCancelled,
},
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: true,
}} {
t.Run(test.name, func(t *testing.T) {
unregisterMetrics()

ctx := getConfigContext()
ctx := getConfigContext(test.countWithReason)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down Expand Up @@ -363,7 +441,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
}
}

ctx = getConfigContext()
ctx = getConfigContext(false)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down
Loading

0 comments on commit 17a7fe3

Please sign in to comment.