Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 38 additions & 10 deletions src/sentry/deletions/tasks/nodestore.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@
from sentry.taskworker.namespaces import deletion_tasks
from sentry.taskworker.retry import Retry
from sentry.utils import metrics
from sentry.utils.snuba import UnqualifiedQueryError, bulk_snuba_queries
from sentry.utils.snuba import (
QueryExecutionTimeMaximum,
QueryMemoryLimitExceeded,
QueryTooManySimultaneous,
RateLimitExceeded,
UnqualifiedQueryError,
bulk_snuba_queries,
)

EVENT_CHUNK_SIZE = 10000
# https://github.com/getsentry/snuba/blob/54feb15b7575142d4b3af7f50d2c2c865329f2db/snuba/datasets/configuration/issues/storages/search_issues.yaml#L139
Expand Down Expand Up @@ -125,17 +132,38 @@ def delete_events_for_groups_from_nodestore_and_eventstore(
)
except UnqualifiedQueryError as error:
if error.args[0] == "All project_ids from the filter no longer exist":
# We currently don't have a way to handle this error, so we just track it and don't retry the task
metrics.incr(f"{prefix}.warning", tags={"type": "all-projects-deleted"}, sample_rate=1)
# When deleting groups, if the project gets deleted concurrently (e.g., by another deletion task),
# Snuba raises UnqualifiedQueryError with the message "All project_ids from the filter no longer exist".
# This happens because the task tries to fetch event IDs from Snuba for a project that no longer exists.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have to remember that we delete from the Nodestore as a best effort since eventually the events will ttl.

Alternatively, we could make the nodestore tasks delete the project rather than in the spawning task.

# This is not a transient error - retrying won't help since the project is permanently gone.
logger.info("All project_ids from the filter no longer exist")
# There may be no value to track this metric, but it's better to be safe than sorry.
metrics.incr(f"{prefix}.info", tags={"type": "all-projects-deleted"}, sample_rate=1)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reducing the metric from warning to info.

else:
metrics.incr(f"{prefix}.error", tags={"type": "unqualified-query-error"}, sample_rate=1)
# Report to Sentry to investigate
raise DeleteAborted(f"{error.args[0]}. We won't retry this task.") from error
metrics.incr(f"{prefix}.error", tags={"type": type(error).__name__}, sample_rate=1)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using type(error).__name__ instead of unqualified-query-error to distinguish the errors in DD.

# Report to Sentry to investigate and abort the task
raise DeleteAborted(f"{error.args[0]}. Aborting this task.") from error

# TODO: Add specific error handling for retryable errors and raise RetryTask when appropriate
except Exception:
metrics.incr(f"{prefix}.error", tags={"type": "unhandled-exception"}, sample_rate=1)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric let me see that something changed in the last couple of days and led me to this Sentry issue.

Image

raise DeleteAborted("Failed to delete events from nodestore. We won't retry this task.")
# These are transient Snuba errors that should be retried
except (
RateLimitExceeded, # Concurrent query rate limit exceeded
QueryTooManySimultaneous, # Too many simultaneous queries
QueryMemoryLimitExceeded, # Query exceeded memory limit
QueryExecutionTimeMaximum, # Query took too long
) as error:
error_type = type(error).__name__
metrics.incr(f"{prefix}.retry", tags={"type": f"snuba-{error_type}"}, sample_rate=1)
logger.warning(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't trigger a Sentry event every time this happens since the retried task may succeed.

f"{prefix}.retry", extra={**extra, "error_type": error_type, "error": str(error)}
)
raise RetryTask(f"Snuba error: {error_type}. We will retry this task.") from error
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the new feature in this PR. I believe the errors above can be retried (we have an upper bound of how many times we would try).


except Exception as error:
metrics.incr(f"{prefix}.error", tags={"type": type(error).__name__}, sample_rate=1)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using type(error).__name__ instead of unhandled-exception to distinguish in Datadog.

# Report to Sentry to investigate and abort the task
raise DeleteAborted(
f"Failed to delete events from nodestore. Aborting this task. {error}"
) from error


def fetch_events_from_eventstore(
Expand Down
111 changes: 98 additions & 13 deletions tests/sentry/deletions/tasks/test_nodestore.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections.abc import Sequence
from unittest.mock import MagicMock, patch
from uuid import uuid4

import pytest
Expand All @@ -6,11 +8,19 @@
delete_events_for_groups_from_nodestore_and_eventstore,
fetch_events_from_eventstore,
)
from sentry.exceptions import DeleteAborted
from sentry.services.eventstore.models import Event
from sentry.snuba.dataset import Dataset
from sentry.snuba.referrer import Referrer
from sentry.taskworker.retry import RetryError
from sentry.testutils.cases import TestCase
from sentry.utils.snuba import UnqualifiedQueryError
from sentry.utils.snuba import (
QueryExecutionTimeMaximum,
QueryMemoryLimitExceeded,
QueryTooManySimultaneous,
RateLimitExceeded,
UnqualifiedQueryError,
)


class NodestoreDeletionTaskTest(TestCase):
Expand All @@ -24,7 +34,9 @@ def create_n_events_with_group(self, n_events: int) -> list[Event]:
events.append(event)
return events

def fetch_events_from_eventstore(self, group_ids: list[int], dataset: Dataset) -> list[Event]:
def fetch_events_from_eventstore(
self, group_ids: Sequence[int], dataset: Dataset
) -> list[Event]:
return fetch_events_from_eventstore(
project_id=self.project.id,
group_ids=group_ids,
Expand Down Expand Up @@ -62,21 +74,25 @@ def test_simple_deletion_with_events(self) -> None:
events_after = self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events)
assert len(events_after) == 0

def test_deletion_with_project_deleted(self) -> None:
"""Test nodestore deletion when project is deleted."""
@patch("sentry.deletions.tasks.nodestore.metrics")
def test_deletion_with_all_projects_deleted(self, mock_metrics: MagicMock) -> None:
"""
Test that when a project is deleted before the task runs, it increments
the correct metric and doesn't retry the task.

Root cause: When deleting groups, if the project gets deleted concurrently,
Snuba raises UnqualifiedQueryError with "All project_ids from the filter no longer exist".
This is not a transient error - retrying won't help since the project is gone.
The code tracks this scenario with a metric but allows the task to complete.
"""
events = self.create_n_events_with_group(n_events=5)
group_ids = [event.group_id for event in events if event.group_id is not None]

# Verify events exist in both eventstore and nodestore before deletion
events = self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events)
assert len(events) == 5

# Deleting the project will cause Snuba to raise an error when fetching the event IDs.
# Delete the project to trigger the error
self.project.delete()

with self.tasks():
# To delete events from the nodestore we fetch the event IDs from the eventstore (Snuba),
# however, when we delete the project, Snuba will raise an error.
# This will complete without raising DeleteAborted or RetryTask
delete_events_for_groups_from_nodestore_and_eventstore.apply_async(
kwargs={
"organization_id": self.project.organization_id,
Expand All @@ -89,5 +105,74 @@ def test_deletion_with_project_deleted(self) -> None:
},
)

with pytest.raises(UnqualifiedQueryError):
self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events)
# Verify the metric was incremented with the correct tags
mock_metrics.incr.assert_any_call(
"deletions.nodestore.info", tags={"type": "all-projects-deleted"}, sample_rate=1
)

@patch("sentry.deletions.tasks.nodestore.metrics")
@patch("sentry.deletions.tasks.nodestore.fetch_events_from_eventstore")
def test_unqualified_query_error(self, mock_fetch: MagicMock, mock_metrics: MagicMock) -> None:
    """
    Test that an unexpected UnqualifiedQueryError aborts the task and is
    tracked with the error metric tagged by the exception class name.

    NOTE: the message "All project_ids from the filter no longer exist" is
    special-cased by the task (info metric, no abort), so this test must use
    a *different* message to exercise the DeleteAborted branch.
    """
    events = self.create_n_events_with_group(n_events=2)
    group_ids = [event.group_id for event in events if event.group_id is not None]
    # Any message other than the special-cased one takes the abort path.
    mock_fetch.side_effect = UnqualifiedQueryError("Invalid query")
    with self.tasks():
        with pytest.raises(DeleteAborted):
            delete_events_for_groups_from_nodestore_and_eventstore.apply_async(
                kwargs={
                    "organization_id": self.project.organization_id,
                    "project_id": self.project.id,
                    "group_ids": group_ids,
                    "times_seen": [1] * len(group_ids),
                    "transaction_id": uuid4().hex,
                    "dataset_str": Dataset.Events.value,
                    "referrer": "deletions.groups",
                },
            )

    # The task tags the error metric with type(error).__name__ so the error
    # classes can be distinguished in Datadog.
    mock_metrics.incr.assert_any_call(
        "deletions.nodestore.error",
        tags={"type": "UnqualifiedQueryError"},
        sample_rate=1,
    )

@patch("sentry.deletions.tasks.nodestore.metrics")
@patch("sentry.deletions.tasks.nodestore.fetch_events_from_eventstore")
def test_snuba_errors_retry(self, mock_fetch: MagicMock, mock_metrics: MagicMock) -> None:
    """
    Test that transient Snuba errors raise RetryError and increment the
    retry metric tagged "snuba-<ExceptionClassName>".

    UnqualifiedQueryError is deliberately NOT in this list: the task treats
    the "All project_ids from the filter no longer exist" message as a
    permanent condition (info metric, task completes) and any other message
    as fatal (DeleteAborted) -- neither path retries.
    """
    events = self.create_n_events_with_group(n_events=2)
    group_ids = [event.group_id for event in events if event.group_id is not None]

    # Each of these is a transient Snuba failure the task maps to RetryTask.
    for snuba_error in [
        RateLimitExceeded("Rate limit exceeded"),
        QueryTooManySimultaneous("Too many simultaneous queries"),
        QueryMemoryLimitExceeded("Query exceeded memory limit"),
        QueryExecutionTimeMaximum("Query took too long"),
    ]:
        mock_fetch.side_effect = snuba_error
        with self.tasks():
            with pytest.raises(RetryError):
                delete_events_for_groups_from_nodestore_and_eventstore.apply_async(
                    kwargs={
                        "organization_id": self.project.organization_id,
                        "project_id": self.project.id,
                        "group_ids": group_ids,
                        "times_seen": [1] * len(group_ids),
                        "transaction_id": uuid4().hex,
                        "dataset_str": Dataset.Events.value,
                        "referrer": "deletions.groups",
                    },
                )

        # Verify the retry metric was incremented with the per-class tag.
        mock_metrics.incr.assert_any_call(
            "deletions.nodestore.retry",
            tags={"type": f"snuba-{type(snuba_error).__name__}"},
            sample_rate=1,
        )
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Test Fails Due to Unexpected Error Handling

The test_snuba_errors_retry test expects UnqualifiedQueryError("All project_ids from the filter no longer exist") to trigger a retry. However, the code specifically handles this error by logging an info metric and completing normally, which causes the test to fail and contradicts the behavior in test_deletion_with_all_projects_deleted.

Fix in Cursor Fix in Web

Loading