Drop stale metrics cache (#4695)

# What this PR does based on this PR - #4517 ## Which issue(s) this PR closes Related to #4455 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes.
grafana · Jul 17, 2024 · 4e03732 · 4e03732
1 parent 49d1127
commit 4e03732
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 0 deletions.
diff --git a/engine/apps/metrics_exporter/metrics_collectors.py b/engine/apps/metrics_exporter/metrics_collectors.py
@@ -1,3 +1,4 @@
+import logging
 import re
 import typing
 
@@ -28,6 +29,7 @@
 
 application_metrics_registry = CollectorRegistry()
 
+logger = logging.getLogger(__name__)
 
 # _RE_BASE_PATTERN allows for optional curly-brackets around the metric name as in some cases this may occur
 # see common.cache.ensure_cache_key_allocates_to_the_same_hash_slot for more details regarding this
@@ -92,6 +94,10 @@ def _get_alert_groups_total_metric(self, org_ids):
         )
         for org_key, ag_states in org_ag_states.items():
             for _, integration_data in ag_states.items():
+                if "services" not in integration_data:
+                    logger.warning(f"Deleting stale metrics cache for {org_key}")
+                    cache.delete(org_key)
+                    break
                 # Labels values should have the same order as _integration_labels_with_state
                 labels_values = [
                     integration_data["integration_name"],  # integration
@@ -125,6 +131,10 @@ def _get_response_time_metric(self, org_ids):
         )
         for org_key, ag_response_time in org_ag_response_times.items():
             for _, integration_data in ag_response_time.items():
+                if "services" not in integration_data:
+                    logger.warning(f"Deleting stale metrics cache for {org_key}")
+                    cache.delete(org_key)
+                    break
                 # Labels values should have the same order as _integration_labels
                 labels_values = [
                     integration_data["integration_name"],  # integration

diff --git a/engine/apps/metrics_exporter/tests/conftest.py b/engine/apps/metrics_exporter/tests/conftest.py
@@ -113,6 +113,60 @@ def _mock_cache_get_many(keys, *args, **kwargs):
     monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)
 
 
+@pytest.fixture()
+def mock_cache_get_old_metrics_for_collector(monkeypatch):
+    def _mock_cache_get(key, *args, **kwargs):
+        if ALERT_GROUPS_TOTAL in key:
+            key = ALERT_GROUPS_TOTAL
+        elif ALERT_GROUPS_RESPONSE_TIME in key:
+            key = ALERT_GROUPS_RESPONSE_TIME
+        elif USER_WAS_NOTIFIED_OF_ALERT_GROUPS in key:
+            key = USER_WAS_NOTIFIED_OF_ALERT_GROUPS
+        test_metrics = {
+            ALERT_GROUPS_TOTAL: {
+                1: {
+                    "integration_name": "Test metrics integration",
+                    "team_name": "Test team",
+                    "team_id": 1,
+                    "org_id": 1,
+                    "slug": "Test stack",
+                    "id": 1,
+                    "firing": 2,
+                    "silenced": 4,
+                    "acknowledged": 3,
+                    "resolved": 5,
+                },
+            },
+            ALERT_GROUPS_RESPONSE_TIME: {
+                1: {
+                    "integration_name": "Test metrics integration",
+                    "team_name": "Test team",
+                    "team_id": 1,
+                    "org_id": 1,
+                    "slug": "Test stack",
+                    "id": 1,
+                    "response_time": [2, 10, 200, 650],
+                },
+            },
+            USER_WAS_NOTIFIED_OF_ALERT_GROUPS: {
+                1: {
+                    "org_id": 1,
+                    "slug": "Test stack",
+                    "id": 1,
+                    "user_username": "Alex",
+                    "counter": 4,
+                }
+            },
+        }
+        return test_metrics.get(key)
+
+    def _mock_cache_get_many(keys, *args, **kwargs):
+        return {key: _mock_cache_get(key) for key in keys if _mock_cache_get(key)}
+
+    monkeypatch.setattr(cache, "get", _mock_cache_get)
+    monkeypatch.setattr(cache, "get_many", _mock_cache_get_many)
+
+
 @pytest.fixture()
 def mock_get_metrics_cache(monkeypatch):
     def _mock_cache_get(key, *args, **kwargs):

diff --git a/engine/apps/metrics_exporter/tests/test_metrics_collectors.py b/engine/apps/metrics_exporter/tests/test_metrics_collectors.py
@@ -1,6 +1,7 @@
 from unittest.mock import patch
 
 import pytest
+from django.core.cache import cache
 from django.test import override_settings
 from prometheus_client import CollectorRegistry, generate_latest
 
@@ -11,6 +12,7 @@
     NO_SERVICE_VALUE,
     USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
 )
+from apps.metrics_exporter.helpers import get_metric_alert_groups_response_time_key, get_metric_alert_groups_total_key
 from apps.metrics_exporter.metrics_collectors import ApplicationMetricsCollector
 from apps.metrics_exporter.tests.conftest import METRICS_TEST_SERVICE_NAME
 
@@ -75,3 +77,38 @@ def get_expected_labels(service_name=NO_SERVICE_VALUE, **kwargs):
         # Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
         assert mocked_start_calculate_and_cache_metrics.called
         test_metrics_registry.unregister(collector)
+
+
+@patch("apps.metrics_exporter.metrics_collectors.get_organization_ids", return_value=[1])
+@patch("apps.metrics_exporter.metrics_collectors.start_calculate_and_cache_metrics.apply_async")
+@pytest.mark.django_db
+def test_application_metrics_collector_with_old_metrics_without_services(
+    mocked_org_ids, mocked_start_calculate_and_cache_metrics, mock_cache_get_old_metrics_for_collector
+):
+    """Test that ApplicationMetricsCollector generates expected metrics from cache"""
+
+    org_id = 1
+    collector = ApplicationMetricsCollector()
+    test_metrics_registry = CollectorRegistry()
+    test_metrics_registry.register(collector)
+    for metric in test_metrics_registry.collect():
+        if metric.name == ALERT_GROUPS_TOTAL:
+            alert_groups_total_metrics_cache = cache.get(get_metric_alert_groups_total_key(org_id))
+            assert alert_groups_total_metrics_cache and "services" not in alert_groups_total_metrics_cache[1]
+            assert len(metric.samples) == 0
+        elif metric.name == ALERT_GROUPS_RESPONSE_TIME:
+            alert_groups_response_time_metrics_cache = cache.get(get_metric_alert_groups_response_time_key(org_id))
+            assert (
+                alert_groups_response_time_metrics_cache
+                and "services" not in alert_groups_response_time_metrics_cache[1]
+            )
+            assert len(metric.samples) == 0
+        elif metric.name == USER_WAS_NOTIFIED_OF_ALERT_GROUPS:
+            # metric with labels for each notified user
+            assert len(metric.samples) == 1
+    result = generate_latest(test_metrics_registry).decode("utf-8")
+    assert result is not None
+    assert mocked_org_ids.called
+    # Since there is no recalculation timer for test org in cache, start_calculate_and_cache_metrics must be called
+    assert mocked_start_calculate_and_cache_metrics.called
+    test_metrics_registry.unregister(collector)