Skip to content

Commit

Permalink
Drop stale metrics cache (#4695)
Browse files Browse the repository at this point in the history
# What this PR does
based on this PR - #4517

## Which issue(s) this PR closes

Related to #4455

## Checklist

- [x] Unit, integration, and e2e (if applicable) tests updated
- [x] Documentation added (or `pr:no public docs` PR label added if not
required)
- [x] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.
  • Loading branch information
Ferril authored Jul 17, 2024
1 parent 49d1127 commit 4e03732
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 0 deletions.
10 changes: 10 additions & 0 deletions engine/apps/metrics_exporter/metrics_collectors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import re
import typing

Expand Down Expand Up @@ -28,6 +29,7 @@

application_metrics_registry = CollectorRegistry()

logger = logging.getLogger(__name__)

# _RE_BASE_PATTERN allows for optional curly-brackets around the metric name as in some cases this may occur
# see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details regarding this
Expand Down Expand Up @@ -92,6 +94,10 @@ def _get_alert_groups_total_metric(self, org_ids):
)
for org_key, ag_states in org_ag_states.items():
for _, integration_data in ag_states.items():
if "services" not in integration_data:
logger.warning(f"Deleting stale metrics cache for {org_key}")
cache.delete(org_key)
break
# Labels values should have the same order as _integration_labels_with_state
labels_values = [
integration_data["integration_name"], # integration
Expand Down Expand Up @@ -125,6 +131,10 @@ def _get_response_time_metric(self, org_ids):
)
for org_key, ag_response_time in org_ag_response_times.items():
for _, integration_data in ag_response_time.items():
if "services" not in integration_data:
logger.warning(f"Deleting stale metrics cache for {org_key}")
cache.delete(org_key)
break
# Labels values should have the same order as _integration_labels
labels_values = [
integration_data["integration_name"], # integration
Expand Down
54 changes: 54 additions & 0 deletions engine/apps/metrics_exporter/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,60 @@ def _mock_cache_get_many(keys, *args, **kwargs):
monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)


@pytest.fixture()
def mock_cache_get_old_metrics_for_collector(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
if ALERT_GROUPS_TOTAL in key:
key = ALERT_GROUPS_TOTAL
elif ALERT_GROUPS_RESPONSE_TIME in key:
key = ALERT_GROUPS_RESPONSE_TIME
elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key:
key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS
test_metrics = {
ALERT_GROUPS_TOTAL: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"firing": 2,
"silenced": 4,
"acknowledged": 3,
"resolved": 5,
},
},
ALERT_GROUPS_RESPONSE_TIME: {
1: {
"integration_name": "Test metrics integration",
"team_name": "Test team",
"team_id": 1,
"org_id": 1,
"slug": "Test stack",
"id": 1,
"response_time": [2, 10, 200, 650],
},
},
USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
1: {
"org_id": 1,
"slug": "Test stack",
"id": 1,
"user_username": "Alex",
"counter": 4,
}
},
}
return test_metrics.get(key)

def _mock_cache_get_many(keys, *args, **kwargs):
return {key: _mock_cache_get(key) for key in keys if _mock_cache_get(key)}

monkeypatch.setattr(cache, "get", _mock_cache_get)
monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)


@pytest.fixture()
def mock_get_metrics_cache(monkeypatch):
def _mock_cache_get(key, *args, **kwargs):
Expand Down
37 changes: 37 additions & 0 deletions engine/apps/metrics_exporter/tests/test_metrics_collectors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest.mock import patch

import pytest
from django.core.cache import cache
from django.test import override_settings
from prometheus_client import CollectorRegistry, generate_latest

Expand All @@ -11,6 +12,7 @@
NO_SERVICE_VALUE,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
)
from apps.metrics_exporter.helpers import get_metric_alert_groups_response_time_key, get_metric_alert_groups_total_key
from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector
from apps.metrics_exporter.tests.conftest import METRICS_TEST_SERVICE_NAME

Expand Down Expand Up @@ -75,3 +77,38 @@ def get_expected_labels(service_name=NO_SERVICE_VALUE, **kwargs):
# Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
assert mocked_start_calculate_and_cache_metrics.called
test_metrics_registry.unregister(collector)


@patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1])
@patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async")
@pytest.mark.django_db
def test_application_metrics_collector_with_old_metrics_without_services(
mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_old_metrics_for_collector
):
"""Test that ApplicationMetricsCollector generates expected metrics from cache"""

org_id = 1
collector = ApplicationMetricsCollector()
test_metrics_registry = CollectorRegistry()
test_metrics_registry.register(collector)
for metric in test_metrics_registry.collect():
if metric.name == ALERT_GROUPS_TOTAL:
alert_groups_total_metrics_cache = cache.get(get_metric_alert_groups_total_key(org_id))
assert alert_groups_total_metrics_cache and "services" not in alert_groups_total_metrics_cache[1]
assert len(metric.samples) == 0
elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
alert_groups_response_time_metrics_cache = cache.get(get_metric_alert_groups_response_time_key(org_id))
assert (
alert_groups_response_time_metrics_cache
and "services" not in alert_groups_response_time_metrics_cache[1]
)
assert len(metric.samples) == 0
elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
# metric with labels for each notified user
assert len(metric.samples) == 1
result = generate_latest(test_metrics_registry).decode("utf-8")
assert result is not None
assert mocked_org_ids.called
# Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
assert mocked_start_calculate_and_cache_metrics.called
test_metrics_registry.unregister(collector)

0 comments on commit 4e03732

Please sign in to comment.