Fix calculating metrics from different services in metrics collector (#…

…4297) # What this PR does Fix calculating metric values per integration from different services ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
grafana · Apr 29, 2024 · 0790d45 · 0790d45
1 parent d1085b7
commit 0790d45
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 19 deletions.
diff --git a/engine/apps/metrics_exporter/metrics_collectors.py b/engine/apps/metrics_exporter/metrics_collectors.py
@@ -102,14 +102,19 @@ def _get_alert_groups_total_metric(self, org_ids):
                 labels_values = list(map(str, labels_values))
                 # clause below is needed for compatibility with old metric cache during rollout metrics with services
                 if "services" in integration_data:
+                    count_per_state = {state.value: 0 for state in AlertGroupState}
                     for service_name in integration_data["services"]:
                         for state in AlertGroupState:
-                            alert_groups_total.add_metric(
-                                labels_values + [state.value],
-                                # todo:metrics: replace [state.value] when all metric cache is updated
-                                # + [service_name, state.value],
-                                integration_data["services"][service_name][state.value],
-                            )
+                            count_per_state[state.value] += integration_data["services"][service_name][state.value]
+                    # todo:metrics: with enabling service_name label move "add_metric" under
+                    #  "for service_name..." iteration
+                    for state_name, counter in count_per_state.items():
+                        alert_groups_total.add_metric(
+                            labels_values + [state_name],
+                            # todo:metrics: replace [state.value] when all metric cache is updated
+                            # + [service_name, state.value],
+                            counter,
+                        )
                 else:
                     for state in AlertGroupState:
                         alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
@@ -143,24 +148,25 @@ def _get_response_time_metric(self, org_ids):
 
                 # clause below is needed for compatibility with old metric cache during rollout metrics with services
                 if "services" in integration_data:
+                    response_time_values = []
                     # todo:metrics: for service_name, response_time
                     for _, response_time in integration_data["services"].items():
                         if not response_time:
                             continue
-                        buckets, sum_value = self.get_buckets_with_sum(response_time)
-                        buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
-                        alert_groups_response_time_seconds.add_metric(
-                            labels_values,  # + [service_name]  todo:metrics: uncomment when all metric cache is updated
-                            buckets=buckets,
-                            sum_value=sum_value,
-                        )
+                        response_time_values.extend(response_time)
                 else:
                     response_time_values = integration_data["response_time"]
                     if not response_time_values:
                         continue
-                    buckets, sum_value = self.get_buckets_with_sum(response_time_values)
-                    buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
-                    alert_groups_response_time_seconds.add_metric(labels_values, buckets=buckets, sum_value=sum_value)
+                # todo:metrics: with enabling service_name label move "add_metric" under
+                #  "for service_name, response_time..." iteration
+                buckets, sum_value = self.get_buckets_with_sum(response_time_values)
+                buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
+                alert_groups_response_time_seconds.add_metric(
+                    labels_values,  # + [service_name]  todo:metrics: uncomment when all metric cache is updated
+                    buckets=buckets,
+                    sum_value=sum_value,
+                )
             org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0]
             processed_org_ids.add(int(org_id_from_key))
         missing_org_ids = org_ids - processed_org_ids

diff --git a/engine/apps/metrics_exporter/tests/conftest.py b/engine/apps/metrics_exporter/tests/conftest.py
@@ -118,6 +118,12 @@ def _mock_cache_get(key, *args, **kwargs):
                             "acknowledged": 3,
                             "resolved": 5,
                         },
+                        "test_service": {
+                            "firing": 10,
+                            "silenced": 10,
+                            "acknowledged": 10,
+                            "resolved": 10,
+                        },
                     },
                 },
             },
@@ -138,9 +144,7 @@ def _mock_cache_get(key, *args, **kwargs):
                     "org_id": 1,
                     "slug": "Test stack",
                     "id": 1,
-                    "services": {
-                        NO_SERVICE_VALUE: [2, 10, 200, 650],
-                    },
+                    "services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]},
                 },
             },
             USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {

diff --git a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py
@@ -62,9 +62,13 @@ def test_application_metrics_collector_mixed_cache(
         if metric.name == ALERT_GROUPS_TOTAL:
             # integration with labels for each alert group state
             assert len(metric.samples) == len(AlertGroupState) * 2
+            # check that values from different services were combined to one sample
+            assert {2, 3, 4, 5, 12, 13, 14, 15} == set(sample.value for sample in metric.samples)
         elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
             # integration with labels for each value in collector's bucket + _count and _sum histogram values
             assert len(metric.samples) == (len(collector._buckets) + 2) * 2
+            # check that values from different services were combined to one sample
+            assert 7.0 in set(sample.value for sample in metric.samples)
         elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
             # metric with labels for each notified user
             assert len(metric.samples) == 1