diff --git a/config/config-observability.yaml b/config/config-observability.yaml index 1f97697ba8e..6417b405fb7 100644 --- a/config/config-observability.yaml +++ b/config/config-observability.yaml @@ -58,3 +58,4 @@ data: metrics.taskrun.duration-type: "histogram" metrics.pipelinerun.level: "pipeline" metrics.pipelinerun.duration-type: "histogram" + metrics.count.reason: "false" diff --git a/docs/metrics.md b/docs/metrics.md index b0b8ab75e2f..51177ae08c8 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver, | ---------- | ----------- | ----------- | ----------- | | `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name>
`*pipelinerun`=<pipelinerun_name>
`status`=<status>
`namespace`=<pipelinerun-namespace> | experimental | | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=<pipeline_name>
`*pipelinerun`=<pipelinerun_name>
`status`=<status>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
`namespace`=<pipelineruns-taskruns-namespace>| experimental | -| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status> | experimental | +| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=<status>
`*reason`=<reason> | experimental | | `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental | | `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=<status>
`*task`=<task_name>
`*taskrun`=<taskrun_name>
`namespace`=<pipelineruns-taskruns-namespace> | experimental | -| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status> | experimental | +| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=<status>
`*reason`=<reason> | experimental | | `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental | | `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental | | `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental | @@ -37,6 +37,7 @@ A sample config-map has been provided as [config-observability](./../config/conf metrics.taskrun.duration-type: "histogram" metrics.pipelinerun.level: "pipeline" metrics.pipelinerun.duration-type: "histogram" + metrics.count.reason: "false" ``` Following values are available in the configmap: @@ -53,6 +54,7 @@ Following values are available in the configmap: | metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type gauge or lastvalue | | metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram | | metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue | +| metrics.count.reason | `false` | Controls whether the `reason` label is included on count metrics; it is omitted by default and added when set to `true` | Histogram value isn't available when pipelinerun or taskrun labels are selected. The Lastvalue or Gauge will be provided. Histogram would serve no purpose because it would generate a single bar. TaskRun and PipelineRun level metrics aren't recommended because they lead to an unbounded cardinality which degrades the observability database. 
diff --git a/pkg/apis/config/metrics.go b/pkg/apis/config/metrics.go index 9422ec5eb43..d7f89bbb1fc 100644 --- a/pkg/apis/config/metrics.go +++ b/pkg/apis/config/metrics.go @@ -36,6 +36,9 @@ const ( // metrics to use for aggregating duration for pipelinerun metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type" + // countWithReasonKey sets if the reason label should be included on count metrics + countWithReasonKey = "metrics.count.reason" + // DefaultTaskrunLevel determines to what level to aggregate metrics // when it isn't specified in configmap DefaultTaskrunLevel = TaskrunLevelAtTask @@ -92,6 +95,7 @@ type Metrics struct { PipelinerunLevel string DurationTaskrunType string DurationPipelinerunType string + CountWithReason bool } // GetMetricsConfigName returns the name of the configmap containing all @@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool { return other.TaskrunLevel == cfg.TaskrunLevel && other.PipelinerunLevel == cfg.PipelinerunLevel && other.DurationTaskrunType == cfg.DurationTaskrunType && - other.DurationPipelinerunType == cfg.DurationPipelinerunType + other.DurationPipelinerunType == cfg.DurationPipelinerunType && + other.CountWithReason == cfg.CountWithReason } // newMetricsFromMap returns a Config given a map corresponding to a ConfigMap @@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) { PipelinerunLevel: DefaultPipelinerunLevel, DurationTaskrunType: DefaultDurationTaskrunType, DurationPipelinerunType: DefaultDurationPipelinerunType, + CountWithReason: false, } if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok { @@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) { if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok { tc.DurationPipelinerunType = durationPipelinerun } + + if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" { + tc.CountWithReason = true + } + return &tc, nil } 
diff --git a/pkg/apis/config/metrics_test.go b/pkg/apis/config/metrics_test.go index e61fa7a5863..bf9795de667 100644 --- a/pkg/apis/config/metrics_test.go +++ b/pkg/apis/config/metrics_test.go @@ -38,6 +38,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) { PipelinerunLevel: config.PipelinerunLevelAtPipelinerun, DurationTaskrunType: config.DurationPipelinerunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeHistogram, + CountWithReason: false, }, fileName: config.GetMetricsConfigName(), }, @@ -47,9 +48,20 @@ func TestNewMetricsFromConfigMap(t *testing.T) { PipelinerunLevel: config.PipelinerunLevelAtNS, DurationTaskrunType: config.DurationTaskrunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeLastValue, + CountWithReason: false, }, fileName: "config-observability-namespacelevel", }, + { + expectedConfig: &config.Metrics{ + TaskrunLevel: config.TaskrunLevelAtNS, + PipelinerunLevel: config.PipelinerunLevelAtNS, + DurationTaskrunType: config.DurationTaskrunTypeHistogram, + DurationPipelinerunType: config.DurationPipelinerunTypeLastValue, + CountWithReason: true, + }, + fileName: "config-observability-reason", + }, } for _, tc := range testCases { @@ -64,6 +76,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) { PipelinerunLevel: config.PipelinerunLevelAtPipeline, DurationTaskrunType: config.DurationPipelinerunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeHistogram, + CountWithReason: false, } verifyConfigFileWithExpectedMetricsConfig(t, MetricsConfigEmptyName, expectedConfig) } diff --git a/pkg/apis/config/testdata/config-observability-reason.yaml b/pkg/apis/config/testdata/config-observability-reason.yaml new file mode 100644 index 00000000000..b8a47846b77 --- /dev/null +++ b/pkg/apis/config/testdata/config-observability-reason.yaml @@ -0,0 +1,31 @@ +# Copyright 2019 The Tekton Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: tekton-pipelines + labels: + app.kubernetes.io/instance: default + app.kubernetes.io/part-of: tekton-pipelines +data: + metrics.backend-destination: prometheus + metrics.stackdriver-project-id: "" + metrics.allow-stackdriver-custom-metrics: "false" + metrics.taskrun.level: "namespace" + metrics.taskrun.duration-type: "histogram" + metrics.pipelinerun.level: "namespace" + metrics.pipelinerun.duration-type: "lastvalue" + metrics.count.reason: "true" diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go index f89a5b46e06..3768598a5a3 100644 --- a/pkg/pipelinerunmetrics/metrics.go +++ b/pkg/pipelinerunmetrics/metrics.go @@ -43,6 +43,7 @@ var ( pipelineTag = tag.MustNewKey("pipeline") namespaceTag = tag.MustNewKey("namespace") statusTag = tag.MustNewKey("status") + reasonTag = tag.MustNewKey("reason") prDuration = stats.Float64( "pipelinerun_duration_seconds", @@ -149,11 +150,15 @@ func viewRegister(cfg *config.Metrics) error { TagKeys: append([]tag.Key{statusTag, namespaceTag}, prunTag...), } + prCountViewTags := []tag.Key{statusTag} + if cfg.CountWithReason { + prCountViewTags = append(prCountViewTags, reasonTag) + } prCountView = &view.View{ Description: prCount.Description(), Measure: prCount, Aggregation: view.Count(), - TagKeys: []tag.Key{statusTag}, + TagKeys: prCountViewTags, } runningPRsCountView = &view.View{ Description: runningPRsCount.Description(), @@ -230,13 +235,15 @@ func (r *Recorder) 
DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co } } + cond := pr.Status.GetCondition(apis.ConditionSucceeded) status := "success" - if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse { + if cond.Status == corev1.ConditionFalse { status = "failed" if cond.Reason == ReasonCancelled { status = "cancelled" } } + reason := cond.Reason pipelineName := "anonymous" if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" { @@ -245,7 +252,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co ctx, err := tag.New( context.Background(), append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace), - tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...) + tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...) if err != nil { return err } diff --git a/pkg/taskrunmetrics/metrics.go b/pkg/taskrunmetrics/metrics.go index 473b8f4c604..d42e6db8c48 100644 --- a/pkg/taskrunmetrics/metrics.go +++ b/pkg/taskrunmetrics/metrics.go @@ -50,6 +50,7 @@ var ( taskTag = tag.MustNewKey("task") namespaceTag = tag.MustNewKey("namespace") statusTag = tag.MustNewKey("status") + reasonTag = tag.MustNewKey("reason") podTag = tag.MustNewKey("pod") trDurationView *view.View @@ -198,11 +199,16 @@ func viewRegister(cfg *config.Metrics) error { Aggregation: distribution, TagKeys: append([]tag.Key{statusTag, namespaceTag}, append(trunTag, prunTag...)...), } + + trCountViewTags := []tag.Key{statusTag} + if cfg.CountWithReason { + trCountViewTags = append(trCountViewTags, reasonTag) + } trCountView = &view.View{ Description: trCount.Description(), Measure: trCount, Aggregation: view.Count(), - TagKeys: []tag.Key{statusTag}, + TagKeys: trCountViewTags, } runningTRsCountView = &view.View{ Description: runningTRsCount.Description(), @@ -316,13 +322,15 @@ func (r *Recorder) DurationAndCount(ctx context.Context, tr *v1.TaskRun, beforeC taskName = 
tr.Spec.TaskRef.Name } + cond := tr.Status.GetCondition(apis.ConditionSucceeded) status := "success" - if cond := tr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse { + if cond.Status == corev1.ConditionFalse { status = "failed" } + reason := cond.Reason durationStat := trDuration - tags := []tag.Mutator{tag.Insert(namespaceTag, tr.Namespace), tag.Insert(statusTag, status)} + tags := []tag.Mutator{tag.Insert(namespaceTag, tr.Namespace), tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)} if ok, pipeline, pipelinerun := IsPartOfPipeline(tr); ok { durationStat = prTRDuration tags = append(tags, r.insertPipelineTag(pipeline, pipelinerun)...)