diff --git a/config/config-observability.yaml b/config/config-observability.yaml index f1f800beb06..facda374d83 100644 --- a/config/config-observability.yaml +++ b/config/config-observability.yaml @@ -59,3 +59,4 @@ data: metrics.pipelinerun.level: "pipeline" metrics.pipelinerun.duration-type: "histogram" metrics.count.enable-reason: "false" + metrics.running-pipelinerun.level: "" diff --git a/docs/metrics.md b/docs/metrics.md index 68441559ead..ca6e5cc3801 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -41,26 +41,30 @@ A sample config-map has been provided as [config-observability](./../config/conf metrics.taskrun.level: "task" metrics.taskrun.duration-type: "histogram" metrics.pipelinerun.level: "pipeline" + metrics.running-pipelinerun.level: "" metrics.pipelinerun.duration-type: "histogram" metrics.count.enable-reason: "false" ``` Following values are available in the configmap: -| configmap data | value | description | -| -- | ----------- | ----------- | -| metrics.taskrun.level | `taskrun` | Level of metrics is taskrun | -| metrics.taskrun.level | `task` | Level of metrics is task and taskrun label isn't present in the metrics | -| metrics.taskrun.level | `namespace` | Level of metrics is namespace, and task and taskrun label isn't present in the metrics -| metrics.pipelinerun.level | `pipelinerun` | Level of metrics is pipelinerun | -| metrics.pipelinerun.level | `pipeline` | Level of metrics is pipeline and pipelinerun label isn't present in the metrics | -| metrics.pipelinerun.level | `namespace` | Level of metrics is namespace, pipeline and pipelinerun label isn't present in the metrics | -| metrics.taskrun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type histogram | +| configmap data | value | description | +| -- | ----------- 
|--------------------------------------------------------------------------------------------------------------------------------------------------------------| +| metrics.taskrun.level | `taskrun` | Level of metrics is taskrun | +| metrics.taskrun.level | `task` | Level of metrics is task and taskrun label isn't present in the metrics | +| metrics.taskrun.level | `namespace` | Level of metrics is namespace, and task and taskrun label isn't present in the metrics +| metrics.pipelinerun.level | `pipelinerun` | Level of metrics is pipelinerun | +| metrics.pipelinerun.level | `pipeline` | Level of metrics is pipeline and pipelinerun label isn't present in the metrics | +| metrics.pipelinerun.level | `namespace` | Level of metrics is namespace, pipeline and pipelinerun label isn't present in the metrics | +| metrics.running-pipelinerun.level | `pipelinerun` | Level of running-pipelinerun metrics is pipelinerun | +| metrics.running-pipelinerun.level | `pipeline` | Level of running-pipelinerun metrics is pipeline and pipelinerun label isn't present in the metrics | +| metrics.running-pipelinerun.level | `namespace` | Level of running-pipelinerun metrics is namespace, pipeline and pipelinerun label isn't present in the metrics | +| metrics.taskrun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type histogram | | metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type gauge or lastvalue | -| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram | -| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue | -| metrics.count.enable-reason | `false` | Sets if the `reason` label 
should be included on count metrics | -| metrics.taskrun.throttle.enable-namespace | `false` | Sets if the `namespace` label should be included on the `tekton_pipelines_controller_running_taskruns_throttled_by_quota` metric | +| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram | +| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue | +| metrics.count.enable-reason | `false` | Sets if the `reason` label should be included on count metrics | +| metrics.taskrun.throttle.enable-namespace | `false` | Sets if the `namespace` label should be included on the `tekton_pipelines_controller_running_taskruns_throttled_by_quota` metric | Histogram value isn't available when pipelinerun or taskrun labels are selected. The Lastvalue or Gauge will be provided. Histogram would serve no purpose because it would generate a single bar. TaskRun and PipelineRun level metrics aren't recommended because they lead to an unbounded cardinality which degrades the observability database. 
diff --git a/pkg/apis/config/metrics.go b/pkg/apis/config/metrics.go index 0df91805db0..f86d4a136a2 100644 --- a/pkg/apis/config/metrics.go +++ b/pkg/apis/config/metrics.go @@ -29,6 +29,9 @@ const ( // metricsPipelinerunLevel determines to what level to aggregate metrics // for pipelinerun metricsPipelinerunLevelKey = "metrics.pipelinerun.level" + // metricsRunningPipelinerunLevelKey determines to what level to aggregate metrics + // for running pipelineruns + metricsRunningPipelinerunLevelKey = "metrics.running-pipelinerun.level" // metricsDurationTaskrunType determines what type of // metrics to use for aggregating duration for taskrun metricsDurationTaskrunType = "metrics.taskrun.duration-type" @@ -55,6 +58,9 @@ const ( // DefaultPipelinerunLevel determines to what level to aggregate metrics // when it isn't specified in configmap DefaultPipelinerunLevel = PipelinerunLevelAtPipeline + // DefaultRunningPipelinerunLevel determines to what level to aggregate metrics + // when it isn't specified in configmap + DefaultRunningPipelinerunLevel = "" // PipelinerunLevelAtPipelinerun specify that aggregation will be done at // pipelinerun level PipelinerunLevelAtPipelinerun = "pipelinerun" @@ -96,6 +102,7 @@ var DefaultMetrics, _ = newMetricsFromMap(map[string]string{}) type Metrics struct { TaskrunLevel string PipelinerunLevel string + RunningPipelinerunLevel string DurationTaskrunType string DurationPipelinerunType string CountWithReason bool @@ -130,6 +137,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) { tc := Metrics{ TaskrunLevel: DefaultTaskrunLevel, PipelinerunLevel: DefaultPipelinerunLevel, + RunningPipelinerunLevel: DefaultRunningPipelinerunLevel, DurationTaskrunType: DefaultDurationTaskrunType, DurationPipelinerunType: DefaultDurationPipelinerunType, CountWithReason: false, @@ -143,6 +151,9 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) { if pipelinerunLevel, ok := cfgMap[metricsPipelinerunLevelKey]; ok { 
tc.PipelinerunLevel = pipelinerunLevel } + if runningPipelinerunLevel, ok := cfgMap[metricsRunningPipelinerunLevelKey]; ok { + tc.RunningPipelinerunLevel = runningPipelinerunLevel + } if durationTaskrun, ok := cfgMap[metricsDurationTaskrunType]; ok { tc.DurationTaskrunType = durationTaskrun } diff --git a/pkg/apis/config/metrics_test.go b/pkg/apis/config/metrics_test.go index c273ed6b76e..62872897af1 100644 --- a/pkg/apis/config/metrics_test.go +++ b/pkg/apis/config/metrics_test.go @@ -36,6 +36,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) { expectedConfig: &config.Metrics{ TaskrunLevel: config.TaskrunLevelAtTaskrun, PipelinerunLevel: config.PipelinerunLevelAtPipelinerun, + RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel, DurationTaskrunType: config.DurationPipelinerunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeHistogram, CountWithReason: false, @@ -47,6 +48,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) { expectedConfig: &config.Metrics{ TaskrunLevel: config.TaskrunLevelAtNS, PipelinerunLevel: config.PipelinerunLevelAtNS, + RunningPipelinerunLevel: config.PipelinerunLevelAtNS, DurationTaskrunType: config.DurationTaskrunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeLastValue, CountWithReason: false, @@ -58,6 +60,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) { expectedConfig: &config.Metrics{ TaskrunLevel: config.TaskrunLevelAtNS, PipelinerunLevel: config.PipelinerunLevelAtNS, + RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel, DurationTaskrunType: config.DurationTaskrunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeLastValue, CountWithReason: true, @@ -69,6 +72,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) { expectedConfig: &config.Metrics{ TaskrunLevel: config.TaskrunLevelAtNS, PipelinerunLevel: config.PipelinerunLevelAtNS, + RunningPipelinerunLevel: config.PipelinerunLevelAtPipeline, DurationTaskrunType: config.DurationTaskrunTypeHistogram, 
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue, CountWithReason: true, @@ -88,6 +92,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) { expectedConfig := &config.Metrics{ TaskrunLevel: config.TaskrunLevelAtTask, PipelinerunLevel: config.PipelinerunLevelAtPipeline, + RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel, DurationTaskrunType: config.DurationPipelinerunTypeHistogram, DurationPipelinerunType: config.DurationPipelinerunTypeHistogram, CountWithReason: false, diff --git a/pkg/apis/config/testdata/config-observability-namespacelevel.yaml b/pkg/apis/config/testdata/config-observability-namespacelevel.yaml index 5029ee0099f..65a72ede515 100644 --- a/pkg/apis/config/testdata/config-observability-namespacelevel.yaml +++ b/pkg/apis/config/testdata/config-observability-namespacelevel.yaml @@ -27,4 +27,5 @@ data: metrics.taskrun.level: "namespace" metrics.taskrun.duration-type: "histogram" metrics.pipelinerun.level: "namespace" + metrics.running-pipelinerun.level: "namespace" metrics.pipelinerun.duration-type: "lastvalue" diff --git a/pkg/apis/config/testdata/config-observability-throttle.yaml b/pkg/apis/config/testdata/config-observability-throttle.yaml index 2b418e176cd..08fe6ac9d5a 100644 --- a/pkg/apis/config/testdata/config-observability-throttle.yaml +++ b/pkg/apis/config/testdata/config-observability-throttle.yaml @@ -27,6 +27,7 @@ data: metrics.taskrun.level: "namespace" metrics.taskrun.duration-type: "histogram" metrics.pipelinerun.level: "namespace" + metrics.running-pipelinerun.level: "pipeline" metrics.pipelinerun.duration-type: "lastvalue" metrics.count.enable-reason: "true" metrics.taskrun.throttle.enable-namespace: "true" diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go index d528681db9f..0879f0d1377 100644 --- a/pkg/pipelinerunmetrics/metrics.go +++ b/pkg/pipelinerunmetrics/metrics.go @@ -149,7 +149,6 @@ func viewRegister(cfg *config.Metrics) error { defer r.mutex.Unlock() var 
prunTag []tag.Key - switch cfg.PipelinerunLevel { case config.PipelinerunLevelAtPipelinerun: prunTag = []tag.Key{pipelinerunTag, pipelineTag} @@ -164,6 +163,18 @@ func viewRegister(cfg *config.Metrics) error { return errors.New("invalid config for PipelinerunLevel: " + cfg.PipelinerunLevel) } + var runningPRTag []tag.Key + switch cfg.RunningPipelinerunLevel { + case config.PipelinerunLevelAtPipelinerun: + runningPRTag = []tag.Key{pipelinerunTag, pipelineTag, namespaceTag} + case config.PipelinerunLevelAtPipeline: + runningPRTag = []tag.Key{pipelineTag, namespaceTag} + case config.PipelinerunLevelAtNS: + runningPRTag = []tag.Key{namespaceTag} + default: + runningPRTag = []tag.Key{} + } + distribution := view.Distribution(10, 30, 60, 300, 900, 1800, 3600, 5400, 10800, 21600, 43200, 86400) if cfg.PipelinerunLevel == config.PipelinerunLevelAtPipelinerun { @@ -213,6 +224,7 @@ func viewRegister(cfg *config.Metrics) error { Description: runningPRs.Description(), Measure: runningPRs, Aggregation: view.LastValue(), + TagKeys: runningPRTag, } runningPRsWaitingOnPipelineResolutionCountView = &view.View{ @@ -326,7 +338,7 @@ func (r *Recorder) updateConfig(cfg *config.Metrics) { // DurationAndCount logs the duration of PipelineRun execution and // count for number of PipelineRuns succeed or failed -// returns an error if its failed to log the metrics +// returns an error if it's failed to log the metrics func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Condition) error { if !r.initialized { return fmt.Errorf("ignoring the metrics recording for %s , failed to initialize the metrics recorder", pr.Name) @@ -379,11 +391,10 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co } // RunningPipelineRuns logs the number of PipelineRuns running right now -// returns an error if its failed to log the metrics +// returns an error if it's failed to log the metrics func (r *Recorder) RunningPipelineRuns(lister 
listers.PipelineRunLister) error { r.mutex.Lock() defer r.mutex.Unlock() - if !r.initialized { return errors.New("ignoring the metrics recording, failed to initialize the metrics recorder") } @@ -396,9 +407,36 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error { var runningPipelineRuns int var trsWaitResolvingTaskRef int var prsWaitResolvingPipelineRef int + countMap := map[string]int{} for _, pr := range prs { + pipelineName := getPipelineTagName(pr) + pipelineRunKey := "" + // Insert every candidate tag; the TagKeys of the running_pipelineruns view decide which ones are kept. + mutators := []tag.Mutator{ + tag.Insert(namespaceTag, pr.Namespace), + tag.Insert(pipelineTag, pipelineName), + tag.Insert(pipelinerunTag, pr.Name), + } + if r.cfg != nil { + switch r.cfg.RunningPipelinerunLevel { + case config.PipelinerunLevelAtPipelinerun: + pipelineRunKey = pipelineRunKey + "#" + pr.Name + fallthrough + case config.PipelinerunLevelAtPipeline: + pipelineRunKey = pipelineRunKey + "#" + pipelineName + fallthrough + case config.PipelinerunLevelAtNS: + pipelineRunKey = pipelineRunKey + "#" + pr.Namespace + } + } + prCtx, tagErr := tag.New(context.Background(), mutators...)
+ if tagErr != nil { + return tagErr + } if !pr.IsDone() { + countMap[pipelineRunKey]++ + metrics.Record(prCtx, runningPRs.M(float64(countMap[pipelineRunKey]))) runningPipelineRuns++ succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded) if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown { @@ -409,6 +447,13 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error { prsWaitResolvingPipelineRef++ } } + } else { + // If all PipelineRuns for a Pipeline are completed then record it with 0 + // set the key to 0 so that we do not record further completed PipelineRuns for same Pipeline + if _, exists := countMap[pipelineRunKey]; !exists { + countMap[pipelineRunKey] = 0 + metrics.Record(prCtx, runningPRs.M(0)) + } } } @@ -421,7 +466,6 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error { metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef))) metrics.Record(ctx, runningPRsWaitingOnTaskResolution.M(float64(trsWaitResolvingTaskRef))) metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns))) - metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns))) return nil } diff --git a/pkg/pipelinerunmetrics/metrics_test.go b/pkg/pipelinerunmetrics/metrics_test.go index 23703e64539..d03c3958fe8 100644 --- a/pkg/pipelinerunmetrics/metrics_test.go +++ b/pkg/pipelinerunmetrics/metrics_test.go @@ -50,6 +50,7 @@ func getConfigContext(countWithReason bool) context.Context { Metrics: &config.Metrics{ TaskrunLevel: config.TaskrunLevelAtTaskrun, PipelinerunLevel: config.PipelinerunLevelAtPipelinerun, + RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel, DurationTaskrunType: config.DefaultDurationTaskrunType, DurationPipelinerunType: config.DefaultDurationPipelinerunType, CountWithReason: countWithReason, @@ -58,6 +59,21 @@ func getConfigContext(countWithReason bool) context.Context { return config.ToContext(ctx, cfg) } +func
getConfigContextRunningPRLevel(runningPipelinerunLevel string) context.Context { + ctx := context.Background() + cfg := &config.Config{ + Metrics: &config.Metrics{ + TaskrunLevel: config.TaskrunLevelAtTaskrun, + PipelinerunLevel: config.PipelinerunLevelAtPipelinerun, + DurationTaskrunType: config.DefaultDurationTaskrunType, + DurationPipelinerunType: config.DefaultDurationPipelinerunType, + CountWithReason: false, + RunningPipelinerunLevel: runningPipelinerunLevel, + }, + } + return config.ToContext(ctx, cfg) +} + func TestUninitializedMetrics(t *testing.T) { metrics := Recorder{} @@ -504,6 +520,48 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) { metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{}, 1) } +func TestRecordRunningPipelineRunsCountAtPipelineLevel(t *testing.T) { + unregisterMetrics() + + newPipelineRun := func(status corev1.ConditionStatus, namespace string) *v1.PipelineRun { + return &v1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("pipelinerun-"), Namespace: namespace}, + Status: v1.PipelineRunStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: apis.ConditionSucceeded, + Status: status, + }}, + }, + }, + } + } + + ctx, _ := ttesting.SetupFakeContext(t) + informer := fakepipelineruninformer.Get(ctx) + // Add N randomly-named PipelineRuns with differently-succeeded statuses. 
+ for _, pipelineRun := range []*v1.PipelineRun{ + newPipelineRun(corev1.ConditionTrue, "testns"), + newPipelineRun(corev1.ConditionUnknown, "testns"), + newPipelineRun(corev1.ConditionFalse, "testns"), + } { + if err := informer.Informer().GetIndexer().Add(pipelineRun); err != nil { + t.Fatalf("Adding PipelineRun to informer: %v", err) + } + } + + ctx = getConfigContextRunningPRLevel("pipeline") + recorder, err := NewRecorder(ctx) + if err != nil { + t.Fatalf("NewRecorder: %v", err) + } + + if err := recorder.RunningPipelineRuns(informer.Lister()); err != nil { + t.Errorf("RunningPipelineRuns: %v", err) + } + metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{"namespace": "testns", "pipeline": "anonymous"}, 1) +} + func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) { multiplier := 3 for _, tc := range []struct {