Skip to content

Commit

Permalink
Fix calculating metrics from different services in metrics collector (#…
Browse files Browse the repository at this point in the history
…4297)

# What this PR does
Fix calculating metric values per integration from different services

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
  • Loading branch information
Ferril authored Apr 29, 2024
1 parent d1085b7 commit 0790d45
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 19 deletions.
38 changes: 22 additions & 16 deletions engine/apps/metrics_exporter/metrics_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,19 @@ def _get_alert_groups_total_metric(self, org_ids):
labels_values = list(map(str, labels_values))
# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
count_per_state = {state.value: 0 for state in AlertGroupState}
for service_name in integration_data["services"]:
for state in AlertGroupState:
alert_groups_total.add_metric(
labels_values + [state.value],
# todo:metrics: replace [state.value] when all metric cache is updated
# + [service_name, state.value],
integration_data["services"][service_name][state.value],
)
count_per_state[state.value] += integration_data["services"][service_name][state.value]
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name..." iteration
for state_name, counter in count_per_state.items():
alert_groups_total.add_metric(
labels_values + [state_name],
# todo:metrics: replace [state.value] when all metric cache is updated
# + [service_name, state.value],
counter,
)
else:
for state in AlertGroupState:
alert_groups_total.add_metric(labels_values + [state.value], integration_data[state.value])
Expand Down Expand Up @@ -143,24 +148,25 @@ def _get_response_time_metric(self, org_ids):

# clause below is needed for compatibility with old metric cache during rollout metrics with services
if "services" in integration_data:
response_time_values = []
# todo:metrics: for service_name, response_time
for _, response_time in integration_data["services"].items():
if not response_time:
continue
buckets, sum_value = self.get_buckets_with_sum(response_time)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated
buckets=buckets,
sum_value=sum_value,
)
response_time_values.extend(response_time)
else:
response_time_values = integration_data["response_time"]
if not response_time_values:
continue
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(labels_values, buckets=buckets, sum_value=sum_value)
# todo:metrics: with enabling service_name label move "add_metric" under
# "for service_name, response_time..." iteration
buckets, sum_value = self.get_buckets_with_sum(response_time_values)
buckets = sorted(list(buckets.items()), key=lambda x: float(x[0]))
alert_groups_response_time_seconds.add_metric(
labels_values, # + [service_name] todo:metrics: uncomment when all metric cache is updated
buckets=buckets,
sum_value=sum_value,
)
org_id_from_key = RE_ALERT_GROUPS_RESPONSE_TIME.match(org_key).groups()[0]
processed_org_ids.add(int(org_id_from_key))
missing_org_ids = org_ids - processed_org_ids
Expand Down
10 changes: 7 additions & 3 deletions engine/apps/metrics_exporter/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ def _mock_cache_get(key, *args, **kwargs):
"acknowledged": 3,
"resolved": 5,
},
"test_service": {
"firing": 10,
"silenced": 10,
"acknowledged": 10,
"resolved": 10,
},
},
},
},
Expand All @@ -138,9 +144,7 @@ def _mock_cache_get(key, *args, **kwargs):
"org_id": 1,
"slug": "Test stack",
"id": 1,
"services": {
NO_SERVICE_VALUE: [2, 10, 200, 650],
},
"services": {NO_SERVICE_VALUE: [2, 10, 200, 650], "test_service": [4, 8, 12]},
},
},
USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
Expand Down
4 changes: 4 additions & 0 deletions engine/apps/metrics_exporter/tests/test_metrics_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,13 @@ def test_application_metrics_collector_mixed_cache(
if metric.name == ALERT_GROUPS_TOTAL:
# integration with labels for each alert group state
assert len(metric.samples) == len(AlertGroupState) * 2
# check that values from different services were combined to one sample
assert {2, 3, 4, 5, 12, 13, 14, 15} == set(sample.value for sample in metric.samples)
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
# integration with labels for each value in collector's bucket + _count and _sum histogram values
assert len(metric.samples) == (len(collector._buckets) + 2) * 2
# check that values from different services were combined to one sample
assert 7.0 in set(sample.value for sample in metric.samples)
elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
# metric with labels for each notified user
assert len(metric.samples) == 1
Expand Down

0 comments on commit 0790d45

Please sign in to comment.