-
-
Notifications
You must be signed in to change notification settings - Fork 4.5k
feat(deletions): Retry deleting from Nodestore for certain Snuba errors #102391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,14 @@ | |
| from sentry.taskworker.namespaces import deletion_tasks | ||
| from sentry.taskworker.retry import Retry | ||
| from sentry.utils import metrics | ||
| from sentry.utils.snuba import UnqualifiedQueryError, bulk_snuba_queries | ||
| from sentry.utils.snuba import ( | ||
| QueryExecutionTimeMaximum, | ||
| QueryMemoryLimitExceeded, | ||
| QueryTooManySimultaneous, | ||
| RateLimitExceeded, | ||
| UnqualifiedQueryError, | ||
| bulk_snuba_queries, | ||
| ) | ||
|
|
||
| EVENT_CHUNK_SIZE = 10000 | ||
| # https://github.com/getsentry/snuba/blob/54feb15b7575142d4b3af7f50d2c2c865329f2db/snuba/datasets/configuration/issues/storages/search_issues.yaml#L139 | ||
|
|
@@ -125,17 +132,38 @@ def delete_events_for_groups_from_nodestore_and_eventstore( | |
| ) | ||
| except UnqualifiedQueryError as error: | ||
| if error.args[0] == "All project_ids from the filter no longer exist": | ||
| # We currently don't have a way to handle this error, so we just track it and don't retry the task | ||
| metrics.incr(f"{prefix}.warning", tags={"type": "all-projects-deleted"}, sample_rate=1) | ||
| # When deleting groups, if the project gets deleted concurrently (e.g., by another deletion task), | ||
| # Snuba raises UnqualifiedQueryError with the message "All project_ids from the filter no longer exist". | ||
| # This happens because the task tries to fetch event IDs from Snuba for a project that no longer exists. | ||
| # This is not a transient error - retrying won't help since the project is permanently gone. | ||
| logger.info("All project_ids from the filter no longer exist") | ||
| # There may be no value to track this metric, but it's better to be safe than sorry. | ||
| metrics.incr(f"{prefix}.info", tags={"type": "all-projects-deleted"}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reducing the metric from |
||
| else: | ||
| metrics.incr(f"{prefix}.error", tags={"type": "unqualified-query-error"}, sample_rate=1) | ||
| # Report to Sentry to investigate | ||
| raise DeleteAborted(f"{error.args[0]}. We won't retry this task.") from error | ||
| metrics.incr(f"{prefix}.error", tags={"type": type(error).__name__}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
| # Report to Sentry to investigate and abort the task | ||
| raise DeleteAborted(f"{error.args[0]}. Aborting this task.") from error | ||
|
|
||
| # TODO: Add specific error handling for retryable errors and raise RetryTask when appropriate | ||
| except Exception: | ||
| metrics.incr(f"{prefix}.error", tags={"type": "unhandled-exception"}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This metric let me see that something changed in the last couple of days and to this Sentry issue.
|
||
| raise DeleteAborted("Failed to delete events from nodestore. We won't retry this task.") | ||
| # These are transient Snuba errors that should be retried | ||
| except ( | ||
| RateLimitExceeded, # Concurrent query rate limit exceeded | ||
| QueryTooManySimultaneous, # Too many simultaneous queries | ||
| QueryMemoryLimitExceeded, # Query exceeded memory limit | ||
| QueryExecutionTimeMaximum, # Query took too long | ||
| ) as error: | ||
| error_type = type(error).__name__ | ||
| metrics.incr(f"{prefix}.retry", tags={"type": f"snuba-{error_type}"}, sample_rate=1) | ||
| logger.warning( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't trigger a Sentry event every time this happens since the retried task may succeed. |
||
| f"{prefix}.retry", extra={**extra, "error_type": error_type, "error": str(error)} | ||
| ) | ||
| raise RetryTask(f"Snuba error: {error_type}. We will retry this task.") from error | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the new feature in this PR. I believe the errors above can be retried (we have an upper bound of how many times we would try). |
||
|
|
||
| except Exception as error: | ||
| metrics.incr(f"{prefix}.error", tags={"type": type(error).__name__}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
| # Report to Sentry to investigate and abort the task | ||
| raise DeleteAborted( | ||
| f"Failed to delete events from nodestore. Aborting this task. {error}" | ||
| ) from error | ||
|
|
||
|
|
||
| def fetch_events_from_eventstore( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,5 @@ | ||
| from collections.abc import Sequence | ||
| from unittest.mock import MagicMock, patch | ||
| from uuid import uuid4 | ||
|
|
||
| import pytest | ||
|
|
@@ -6,11 +8,19 @@ | |
| delete_events_for_groups_from_nodestore_and_eventstore, | ||
| fetch_events_from_eventstore, | ||
| ) | ||
| from sentry.exceptions import DeleteAborted | ||
| from sentry.services.eventstore.models import Event | ||
| from sentry.snuba.dataset import Dataset | ||
| from sentry.snuba.referrer import Referrer | ||
| from sentry.taskworker.retry import RetryError | ||
| from sentry.testutils.cases import TestCase | ||
| from sentry.utils.snuba import UnqualifiedQueryError | ||
| from sentry.utils.snuba import ( | ||
| QueryExecutionTimeMaximum, | ||
| QueryMemoryLimitExceeded, | ||
| QueryTooManySimultaneous, | ||
| RateLimitExceeded, | ||
| UnqualifiedQueryError, | ||
| ) | ||
|
|
||
|
|
||
| class NodestoreDeletionTaskTest(TestCase): | ||
|
|
@@ -24,7 +34,9 @@ def create_n_events_with_group(self, n_events: int) -> list[Event]: | |
| events.append(event) | ||
| return events | ||
|
|
||
| def fetch_events_from_eventstore(self, group_ids: list[int], dataset: Dataset) -> list[Event]: | ||
| def fetch_events_from_eventstore( | ||
| self, group_ids: Sequence[int], dataset: Dataset | ||
| ) -> list[Event]: | ||
| return fetch_events_from_eventstore( | ||
| project_id=self.project.id, | ||
| group_ids=group_ids, | ||
|
|
@@ -62,21 +74,25 @@ def test_simple_deletion_with_events(self) -> None: | |
| events_after = self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events) | ||
| assert len(events_after) == 0 | ||
|
|
||
| def test_deletion_with_project_deleted(self) -> None: | ||
| """Test nodestore deletion when project is deleted.""" | ||
| @patch("sentry.deletions.tasks.nodestore.metrics") | ||
| def test_deletion_with_all_projects_deleted(self, mock_metrics: MagicMock) -> None: | ||
| """ | ||
| Test that when a project is deleted before the task runs, it increments | ||
| the correct metric and doesn't retry the task. | ||
|
|
||
| Root cause: When deleting groups, if the project gets deleted concurrently, | ||
| Snuba raises UnqualifiedQueryError with "All project_ids from the filter no longer exist". | ||
| This is not a transient error - retrying won't help since the project is gone. | ||
| The code tracks this scenario with a metric but allows the task to complete. | ||
| """ | ||
| events = self.create_n_events_with_group(n_events=5) | ||
| group_ids = [event.group_id for event in events if event.group_id is not None] | ||
|
|
||
| # Verify events exist in both eventstore and nodestore before deletion | ||
| events = self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events) | ||
| assert len(events) == 5 | ||
|
|
||
| # Deleting the project will cause Snuba to raise an error when fetching the event IDs. | ||
| # Delete the project to trigger the error | ||
| self.project.delete() | ||
|
|
||
| with self.tasks(): | ||
| # To delete events from the nodestore we fetch the event IDs from the eventstore (Snuba), | ||
| # however, when we delete the project, Snuba will raise an error. | ||
| # This will complete without raising DeleteAborted or RetryTask | ||
| delete_events_for_groups_from_nodestore_and_eventstore.apply_async( | ||
| kwargs={ | ||
| "organization_id": self.project.organization_id, | ||
|
|
@@ -89,5 +105,74 @@ def test_deletion_with_project_deleted(self) -> None: | |
| }, | ||
| ) | ||
|
|
||
| with pytest.raises(UnqualifiedQueryError): | ||
| self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events) | ||
| # Verify the metric was incremented with the correct tags | ||
| mock_metrics.incr.assert_any_call( | ||
| "deletions.nodestore.info", tags={"type": "all-projects-deleted"}, sample_rate=1 | ||
| ) | ||
|
|
||
| @patch("sentry.deletions.tasks.nodestore.metrics") | ||
| @patch("sentry.deletions.tasks.nodestore.fetch_events_from_eventstore") | ||
| def test_unqualified_query_error(self, mock_fetch: MagicMock, mock_metrics: MagicMock) -> None: | ||
| """Test that UnqualifiedQueryError errors are tracked with the correct metric.""" | ||
| events = self.create_n_events_with_group(n_events=2) | ||
| group_ids = [event.group_id for event in events if event.group_id is not None] | ||
| mock_fetch.side_effect = UnqualifiedQueryError( | ||
| "All project_ids from the filter no longer exist" | ||
| ) | ||
| with self.tasks(): | ||
| with pytest.raises(DeleteAborted): | ||
| delete_events_for_groups_from_nodestore_and_eventstore.apply_async( | ||
| kwargs={ | ||
| "organization_id": self.project.organization_id, | ||
| "project_id": self.project.id, | ||
| "group_ids": group_ids, | ||
| "times_seen": [1] * len(group_ids), | ||
| "transaction_id": uuid4().hex, | ||
| "dataset_str": Dataset.Events.value, | ||
| "referrer": "deletions.groups", | ||
| }, | ||
| ) | ||
|
|
||
| # Verify metric was incremented with correct tags | ||
| mock_metrics.incr.assert_any_call( | ||
| "deletions.nodestore.error", | ||
| tags={"type": "UnqualifiedQueryError"}, | ||
| sample_rate=1, | ||
| ) | ||
|
|
||
| @patch("sentry.deletions.tasks.nodestore.metrics") | ||
| @patch("sentry.deletions.tasks.nodestore.fetch_events_from_eventstore") | ||
| def test_snuba_errors_retry(self, mock_fetch: MagicMock, mock_metrics: MagicMock) -> None: | ||
| """Test that Snuba errors trigger a retry and are tracked with the correct metric.""" | ||
| events = self.create_n_events_with_group(n_events=2) | ||
| group_ids = [event.group_id for event in events if event.group_id is not None] | ||
|
|
||
| for snuba_error in [ | ||
| RateLimitExceeded("Rate limit exceeded"), | ||
| QueryTooManySimultaneous("Too many simultaneous queries"), | ||
| QueryMemoryLimitExceeded("Query exceeded memory limit"), | ||
| QueryExecutionTimeMaximum("Query took too long"), | ||
| UnqualifiedQueryError("All project_ids from the filter no longer exist"), | ||
| ]: | ||
| mock_fetch.side_effect = snuba_error | ||
| with self.tasks(): | ||
| with pytest.raises(RetryError): | ||
| delete_events_for_groups_from_nodestore_and_eventstore.apply_async( | ||
| kwargs={ | ||
| "organization_id": self.project.organization_id, | ||
| "project_id": self.project.id, | ||
| "group_ids": group_ids, | ||
| "times_seen": [1] * len(group_ids), | ||
| "transaction_id": uuid4().hex, | ||
| "dataset_str": Dataset.Events.value, | ||
| "referrer": "deletions.groups", | ||
| }, | ||
| ) | ||
|
|
||
| # Verify metric was incremented with correct tags | ||
| mock_metrics.incr.assert_any_call( | ||
| # We track that we're retrying the task if we get a Snuba error | ||
| "deletions.nodestore.retry", | ||
| tags={"type": f"snuba-{type(snuba_error).__name__}"}, | ||
| sample_rate=1, | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: Test Fails Due to Unexpected Error Handling. The |
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have to remember that deleting from the Nodestore is best-effort, since the events will eventually expire via their TTL anyway.
Alternatively, we could have the nodestore tasks delete the project themselves, rather than doing it in the spawning task.