Adding metrics for Running PipelineRuns

Currently the running PipelineRun count metric is reported only at cluster level, so there is no way to get it at namespace or pipeline level. This change allows the running PipelineRun metric to be reported at pipelinerun, pipeline, or namespace level (configurable via the `metrics.running-pipelinerun.level` ConfigMap key; the default behaviour is the same as before).
tektoncd · Sep 27, 2024 · a73f04d · a73f04d
1 parent 0649270
commit a73f04d
Show file tree

Hide file tree

Showing 8 changed files with 141 additions and 18 deletions.
diff --git a/config/config-observability.yaml b/config/config-observability.yaml
@@ -59,3 +59,4 @@ data:
     metrics.pipelinerun.level: "pipeline"
     metrics.pipelinerun.duration-type: "histogram"
     metrics.count.enable-reason: "false"
+    metrics.running-pipelinerun.level: ""
diff --git a/docs/metrics.md b/docs/metrics.md
@@ -41,26 +41,30 @@ A sample config-map has been provided as [config-observability](./../config/conf
     metrics.taskrun.level: "task"
     metrics.taskrun.duration-type: "histogram"
     metrics.pipelinerun.level: "pipeline"
+    metrics.running-pipelinerun.level: ""
     metrics.pipelinerun.duration-type: "histogram"
     metrics.count.enable-reason: "false"
 ```
 
 Following values are available in the configmap:
 
-| configmap data | value | description |
-| -- | ----------- | ----------- |
-| metrics.taskrun.level | `taskrun` | Level of metrics is taskrun |
-| metrics.taskrun.level | `task` | Level of metrics is task and taskrun label isn't present in the metrics |
-| metrics.taskrun.level | `namespace` | Level of metrics is namespace, and task and taskrun label isn't present in the metrics
-| metrics.pipelinerun.level | `pipelinerun` | Level of metrics is pipelinerun |
-| metrics.pipelinerun.level | `pipeline` | Level of metrics is pipeline and pipelinerun label isn't present in the metrics |
-| metrics.pipelinerun.level | `namespace` | Level of metrics is namespace, pipeline and pipelinerun label isn't present in the metrics |
-| metrics.taskrun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type histogram |
+| configmap data | value | description                                                                                                                                                  |
+| -- | ----------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| metrics.taskrun.level | `taskrun` | Level of metrics is taskrun                                                                                                                                  |
+| metrics.taskrun.level | `task` | Level of metrics is task and taskrun label isn't present in the metrics                                                                                      |
+| metrics.taskrun.level | `namespace` | Level of metrics is namespace, and task and taskrun label isn't present in the metrics                                                                       |
+| metrics.pipelinerun.level | `pipelinerun` | Level of metrics is pipelinerun                                                                                                                              |
+| metrics.pipelinerun.level | `pipeline` | Level of metrics is pipeline and pipelinerun label isn't present in the metrics                                                                              |
+| metrics.pipelinerun.level | `namespace` | Level of metrics is namespace, pipeline and pipelinerun label isn't present in the metrics                                                                   |
+| metrics.running-pipelinerun.level | `pipelinerun` | Level of running-pipelinerun metrics is pipelinerun                                                                                                          |
+| metrics.running-pipelinerun.level | `pipeline` | Level of running-pipelinerun metrics is pipeline and pipelinerun label isn't present in the metrics                                                          |
+| metrics.running-pipelinerun.level | `namespace` | Level of running-pipelinerun metrics is namespace, pipeline and pipelinerun label isn't present in the metrics                                               |
+| metrics.taskrun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type histogram           |
 | metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and  `tekton_pipelines_controller_taskrun_duration_seconds` is of type gauge or lastvalue |
-| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
-| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
-| metrics.count.enable-reason | `false` | Sets if the `reason` label should be included on count metrics |
-| metrics.taskrun.throttle.enable-namespace | `false` | Sets if the `namespace` label should be included on the `tekton_pipelines_controller_running_taskruns_throttled_by_quota` metric |
+| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram                                                                              |
+| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue                                                                     |
+| metrics.count.enable-reason | `false` | Sets if the `reason` label should be included on count metrics                                                                                               |
+| metrics.taskrun.throttle.enable-namespace | `false` | Sets if the `namespace` label should be included on the `tekton_pipelines_controller_running_taskruns_throttled_by_quota` metric                             |
 
 Histogram value isn't available when pipelinerun or taskrun labels are selected. The Lastvalue or Gauge will be provided. Histogram would serve no purpose because it would generate a single bar. TaskRun and PipelineRun level metrics aren't recommended because they lead to an unbounded cardinality which degrades the observability database.
 

diff --git a/pkg/apis/config/metrics.go b/pkg/apis/config/metrics.go
@@ -29,6 +29,9 @@ const (
 	// metricsPipelinerunLevel determines to what level to aggregate metrics
 	// for pipelinerun
 	metricsPipelinerunLevelKey = "metrics.pipelinerun.level"
+	// metricsRunningPipelinerunLevelKey determines to what level to aggregate metrics
+	// for running pipelineruns
+	metricsRunningPipelinerunLevelKey = "metrics.running-pipelinerun.level"
 	// metricsDurationTaskrunType determines what type of
 	// metrics to use for aggregating duration for taskrun
 	metricsDurationTaskrunType = "metrics.taskrun.duration-type"
@@ -55,6 +58,9 @@ const (
 	// DefaultPipelinerunLevel determines to what level to aggregate metrics
 	// when it isn't specified in configmap
 	DefaultPipelinerunLevel = PipelinerunLevelAtPipeline
+	// DefaultRunningPipelinerunLevel determines to what level to aggregate metrics
+	// when it isn't specified in configmap
+	DefaultRunningPipelinerunLevel = ""
 	// PipelinerunLevelAtPipelinerun specify that aggregation will be done at
 	// pipelinerun level
 	PipelinerunLevelAtPipelinerun = "pipelinerun"
@@ -96,6 +102,7 @@ var DefaultMetrics, _ = newMetricsFromMap(map[string]string{})
 type Metrics struct {
 	TaskrunLevel            string
 	PipelinerunLevel        string
+	RunningPipelinerunLevel string
 	DurationTaskrunType     string
 	DurationPipelinerunType string
 	CountWithReason         bool
@@ -130,6 +137,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
 	tc := Metrics{
 		TaskrunLevel:            DefaultTaskrunLevel,
 		PipelinerunLevel:        DefaultPipelinerunLevel,
+		RunningPipelinerunLevel: DefaultRunningPipelinerunLevel,
 		DurationTaskrunType:     DefaultDurationTaskrunType,
 		DurationPipelinerunType: DefaultDurationPipelinerunType,
 		CountWithReason:         false,
@@ -143,6 +151,9 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
 	if pipelinerunLevel, ok := cfgMap[metricsPipelinerunLevelKey]; ok {
 		tc.PipelinerunLevel = pipelinerunLevel
 	}
+	if runningPipelinerunLevel, ok := cfgMap[metricsRunningPipelinerunLevelKey]; ok {
+		tc.RunningPipelinerunLevel = runningPipelinerunLevel
+	}
 	if durationTaskrun, ok := cfgMap[metricsDurationTaskrunType]; ok {
 		tc.DurationTaskrunType = durationTaskrun
 	}

diff --git a/pkg/apis/config/metrics_test.go b/pkg/apis/config/metrics_test.go
@@ -36,6 +36,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
 			expectedConfig: &config.Metrics{
 				TaskrunLevel:            config.TaskrunLevelAtTaskrun,
 				PipelinerunLevel:        config.PipelinerunLevelAtPipelinerun,
+				RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
 				DurationTaskrunType:     config.DurationPipelinerunTypeHistogram,
 				DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
 				CountWithReason:         false,
@@ -47,6 +48,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
 			expectedConfig: &config.Metrics{
 				TaskrunLevel:            config.TaskrunLevelAtNS,
 				PipelinerunLevel:        config.PipelinerunLevelAtNS,
+				RunningPipelinerunLevel: config.PipelinerunLevelAtNS,
 				DurationTaskrunType:     config.DurationTaskrunTypeHistogram,
 				DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
 				CountWithReason:         false,
@@ -58,6 +60,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
 			expectedConfig: &config.Metrics{
 				TaskrunLevel:            config.TaskrunLevelAtNS,
 				PipelinerunLevel:        config.PipelinerunLevelAtNS,
+				RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
 				DurationTaskrunType:     config.DurationTaskrunTypeHistogram,
 				DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
 				CountWithReason:         true,
@@ -69,6 +72,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
 			expectedConfig: &config.Metrics{
 				TaskrunLevel:            config.TaskrunLevelAtNS,
 				PipelinerunLevel:        config.PipelinerunLevelAtNS,
+				RunningPipelinerunLevel: config.PipelinerunLevelAtPipeline,
 				DurationTaskrunType:     config.DurationTaskrunTypeHistogram,
 				DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
 				CountWithReason:         true,
@@ -88,6 +92,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) {
 	expectedConfig := &config.Metrics{
 		TaskrunLevel:            config.TaskrunLevelAtTask,
 		PipelinerunLevel:        config.PipelinerunLevelAtPipeline,
+		RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
 		DurationTaskrunType:     config.DurationPipelinerunTypeHistogram,
 		DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
 		CountWithReason:         false,

diff --git a/pkg/apis/config/testdata/config-observability-namespacelevel.yaml b/pkg/apis/config/testdata/config-observability-namespacelevel.yaml
@@ -27,4 +27,5 @@ data:
   metrics.taskrun.level: "namespace"
   metrics.taskrun.duration-type: "histogram"
   metrics.pipelinerun.level: "namespace"
+  metrics.running-pipelinerun.level: "namespace"
   metrics.pipelinerun.duration-type: "lastvalue"
diff --git a/pkg/apis/config/testdata/config-observability-throttle.yaml b/pkg/apis/config/testdata/config-observability-throttle.yaml
@@ -27,6 +27,7 @@ data:
   metrics.taskrun.level: "namespace"
   metrics.taskrun.duration-type: "histogram"
   metrics.pipelinerun.level: "namespace"
+  metrics.running-pipelinerun.level: "pipeline"
   metrics.pipelinerun.duration-type: "lastvalue"
   metrics.count.enable-reason: "true"
   metrics.taskrun.throttle.enable-namespace: "true"
diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go
@@ -149,7 +149,6 @@ func viewRegister(cfg *config.Metrics) error {
 	defer r.mutex.Unlock()
 
 	var prunTag []tag.Key
-
 	switch cfg.PipelinerunLevel {
 	case config.PipelinerunLevelAtPipelinerun:
 		prunTag = []tag.Key{pipelinerunTag, pipelineTag}
@@ -164,6 +163,18 @@ func viewRegister(cfg *config.Metrics) error {
 		return errors.New("invalid config for PipelinerunLevel: " + cfg.PipelinerunLevel)
 	}
 
+	var runningPRTag []tag.Key
+	switch cfg.RunningPipelinerunLevel {
+	case config.PipelinerunLevelAtPipelinerun:
+		runningPRTag = []tag.Key{pipelinerunTag, pipelineTag, namespaceTag}
+	case config.PipelinerunLevelAtPipeline:
+		runningPRTag = []tag.Key{pipelineTag, namespaceTag}
+	case config.PipelinerunLevelAtNS:
+		runningPRTag = []tag.Key{namespaceTag}
+	default:
+		runningPRTag = []tag.Key{}
+	}
+
 	distribution := view.Distribution(10, 30, 60, 300, 900, 1800, 3600, 5400, 10800, 21600, 43200, 86400)
 
 	if cfg.PipelinerunLevel == config.PipelinerunLevelAtPipelinerun {
@@ -213,6 +224,7 @@ func viewRegister(cfg *config.Metrics) error {
 		Description: runningPRs.Description(),
 		Measure:     runningPRs,
 		Aggregation: view.LastValue(),
+		TagKeys:     runningPRTag,
 	}
 
 	runningPRsWaitingOnPipelineResolutionCountView = &view.View{
@@ -326,7 +338,7 @@ func (r *Recorder) updateConfig(cfg *config.Metrics) {
 
 // DurationAndCount logs the duration of PipelineRun execution and
 // count for number of PipelineRuns succeed or failed
-// returns an error if its failed to log the metrics
+// returns an error if it fails to log the metrics
 func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Condition) error {
 	if !r.initialized {
 		return fmt.Errorf("ignoring the metrics recording for %s , failed to initialize the metrics recorder", pr.Name)
@@ -379,11 +391,10 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
 }
 
 // RunningPipelineRuns logs the number of PipelineRuns running right now
-// returns an error if its failed to log the metrics
+// returns an error if it fails to log the metrics
 func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	r.mutex.Lock()
 	defer r.mutex.Unlock()
-
 	if !r.initialized {
 		return errors.New("ignoring the metrics recording, failed to initialize the metrics recorder")
 	}
@@ -396,9 +407,34 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	var runningPipelineRuns int
 	var trsWaitResolvingTaskRef int
 	var prsWaitResolvingPipelineRef int
+	countMap := map[string]int{}
 
 	for _, pr := range prs {
+		pipelineName := getPipelineTagName(pr)
+		pipelineRunKey := ""
+		mutators := []tag.Mutator{
+			tag.Insert(namespaceTag, pr.Namespace),
+			tag.Insert(pipelineTag, pipelineName),
+		}
+		if r.cfg != nil {
+			switch r.cfg.RunningPipelinerunLevel {
+			case "pipelinerun":
+				pipelineRunKey = pipelineRunKey + "#" + pr.Name
+				fallthrough
+			case "pipeline":
+				pipelineRunKey = pipelineRunKey + "#" + pipelineName
+				fallthrough
+			case "namespace":
+				pipelineRunKey = pipelineRunKey + "#" + pr.Namespace
+			}
+		}
+		ctx_, err_ := tag.New(context.Background(), mutators...)
+		if err_ != nil {
+			return err_ // return the tag.New failure itself, not the separate outer err
+		}
 		if !pr.IsDone() {
+			countMap[pipelineRunKey]++
+			metrics.Record(ctx_, runningPRs.M(float64(countMap[pipelineRunKey])))
 			runningPipelineRuns++
 			succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
 			if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
@@ -409,6 +445,13 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 					prsWaitResolvingPipelineRef++
 				}
 			}
+		} else {
+			// No running PipelineRun has been counted for this key so far: record 0 once,
+			// and store the key so further completed PipelineRuns for the same key are not recorded again.
+			if _, exists := countMap[pipelineRunKey]; !exists {
+				countMap[pipelineRunKey] = 0
+				metrics.Record(ctx_, runningPRs.M(0))
+			}
 		}
 	}
 
@@ -421,7 +464,6 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
 	metrics.Record(ctx, runningPRsWaitingOnTaskResolution.M(float64(trsWaitResolvingTaskRef)))
 	metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
-	metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))
 
 	return nil
 }

diff --git a/pkg/pipelinerunmetrics/metrics_test.go b/pkg/pipelinerunmetrics/metrics_test.go
@@ -50,6 +50,7 @@ func getConfigContext(countWithReason bool) context.Context {
 		Metrics: &config.Metrics{
 			TaskrunLevel:            config.TaskrunLevelAtTaskrun,
 			PipelinerunLevel:        config.PipelinerunLevelAtPipelinerun,
+			RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
 			DurationTaskrunType:     config.DefaultDurationTaskrunType,
 			DurationPipelinerunType: config.DefaultDurationPipelinerunType,
 			CountWithReason:         countWithReason,
@@ -58,6 +59,21 @@ func getConfigContext(countWithReason bool) context.Context {
 	return config.ToContext(ctx, cfg)
 }
 
+// getConfigContextRunningPRLevel returns a background context carrying a
+// config whose Metrics.RunningPipelinerunLevel is runningPipelinerunLevel;
+// the remaining metrics fields are set to fixed values.
+func getConfigContextRunningPRLevel(runningPipelinerunLevel string) context.Context {
+	metricsCfg := &config.Metrics{
+		TaskrunLevel:            config.TaskrunLevelAtTaskrun,
+		PipelinerunLevel:        config.PipelinerunLevelAtPipelinerun,
+		RunningPipelinerunLevel: runningPipelinerunLevel,
+		DurationTaskrunType:     config.DefaultDurationTaskrunType,
+		DurationPipelinerunType: config.DefaultDurationPipelinerunType,
+		CountWithReason:         false,
+	}
+	return config.ToContext(context.Background(), &config.Config{Metrics: metricsCfg})
+}
+
 func TestUninitializedMetrics(t *testing.T) {
 	metrics := Recorder{}
 
@@ -504,6 +520,48 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
 	metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{}, 1)
 }
 
+// TestRecordRunningPipelineRunsCountAtPipelineLevel checks that, with the
+// running-PipelineRun level set to "pipeline", running_pipelineruns is recorded
+// with namespace and pipeline tags and counts only the PipelineRun that is not done.
+func TestRecordRunningPipelineRunsCountAtPipelineLevel(t *testing.T) {
+	unregisterMetrics()
+
+	newPipelineRun := func(status corev1.ConditionStatus, namespace string) *v1.PipelineRun {
+		return &v1.PipelineRun{
+			ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("pipelinerun-"), Namespace: namespace},
+			Status: v1.PipelineRunStatus{
+				Status: duckv1.Status{
+					Conditions: duckv1.Conditions{{
+						Type:   apis.ConditionSucceeded,
+						Status: status,
+					}},
+				},
+			},
+		}
+	}
+
+	ctx, _ := ttesting.SetupFakeContext(t)
+	informer := fakepipelineruninformer.Get(ctx)
+	// Add three randomly-named PipelineRuns in the same namespace: one succeeded,
+	// one still unknown (running), and one failed.
+	for _, pipelineRun := range []*v1.PipelineRun{
+		newPipelineRun(corev1.ConditionTrue, "testns"),
+		newPipelineRun(corev1.ConditionUnknown, "testns"),
+		newPipelineRun(corev1.ConditionFalse, "testns"),
+	} {
+		if err := informer.Informer().GetIndexer().Add(pipelineRun); err != nil {
+			t.Fatalf("Adding PipelineRun to informer: %v", err)
+		}
+	}
+
+	ctx = getConfigContextRunningPRLevel(config.PipelinerunLevelAtPipeline)
+	recorder, err := NewRecorder(ctx)
+	if err != nil {
+		t.Fatalf("NewRecorder: %v", err)
+	}
+
+	if err := recorder.RunningPipelineRuns(informer.Lister()); err != nil {
+		t.Errorf("RunningPipelineRuns: %v", err)
+	}
+	// Only the PipelineRun with ConditionUnknown is still running.
+	metricstest.CheckLastValueData(t, "running_pipelineruns", map[string]string{"namespace": "testns", "pipeline": "anonymous"}, 1)
+}
+
 func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
 	multiplier := 3
 	for _, tc := range []struct {