-
-
Notifications
You must be signed in to change notification settings - Fork 4.5k
feat(deletions): Retry deleting from Nodestore for certain Snuba errors #102391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,14 @@ | |
| from sentry.taskworker.namespaces import deletion_tasks | ||
| from sentry.taskworker.retry import Retry | ||
| from sentry.utils import metrics | ||
| from sentry.utils.snuba import UnqualifiedQueryError, bulk_snuba_queries | ||
| from sentry.utils.snuba import ( | ||
| QueryExecutionTimeMaximum, | ||
| QueryMemoryLimitExceeded, | ||
| QueryTooManySimultaneous, | ||
| RateLimitExceeded, | ||
| UnqualifiedQueryError, | ||
| bulk_snuba_queries, | ||
| ) | ||
|
|
||
| EVENT_CHUNK_SIZE = 10000 | ||
| # https://github.com/getsentry/snuba/blob/54feb15b7575142d4b3af7f50d2c2c865329f2db/snuba/datasets/configuration/issues/storages/search_issues.yaml#L139 | ||
|
|
@@ -125,17 +132,38 @@ def delete_events_for_groups_from_nodestore_and_eventstore( | |
| ) | ||
| except UnqualifiedQueryError as error: | ||
| if error.args[0] == "All project_ids from the filter no longer exist": | ||
| # We currently don't have a way to handle this error, so we just track it and don't retry the task | ||
| metrics.incr(f"{prefix}.warning", tags={"type": "all-projects-deleted"}, sample_rate=1) | ||
| # When deleting groups, if the project gets deleted concurrently (e.g., by another deletion task), | ||
| # Snuba raises UnqualifiedQueryError with the message "All project_ids from the filter no longer exist". | ||
| # This happens because the task tries to fetch event IDs from Snuba for a project that no longer exists. | ||
| # This is not a transient error - retrying won't help since the project is permanently gone. | ||
| logger.info("All project_ids from the filter no longer exist") | ||
| # There may be no value to track this metric, but it's better to be safe than sorry. | ||
| metrics.incr(f"{prefix}.info", tags={"type": "all-projects-deleted"}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reducing the metric from |
||
| else: | ||
| metrics.incr(f"{prefix}.error", tags={"type": "unqualified-query-error"}, sample_rate=1) | ||
| # Report to Sentry to investigate | ||
| raise DeleteAborted(f"{error.args[0]}. We won't retry this task.") from error | ||
| metrics.incr(f"{prefix}.error", tags={"type": type(error).__name__}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
| # Report to Sentry to investigate and abort the task | ||
| raise DeleteAborted(f"{error.args[0]}. Aborting this task.") from error | ||
|
|
||
| # TODO: Add specific error handling for retryable errors and raise RetryTask when appropriate | ||
| except Exception: | ||
| metrics.incr(f"{prefix}.error", tags={"type": "unhandled-exception"}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This metric let me see that something changed in the last couple of days and to this Sentry issue.
|
||
| raise DeleteAborted("Failed to delete events from nodestore. We won't retry this task.") | ||
| # These are transient Snuba errors that should be retried | ||
| except ( | ||
| RateLimitExceeded, # Concurrent query rate limit exceeded | ||
| QueryTooManySimultaneous, # Too many simultaneous queries | ||
| QueryMemoryLimitExceeded, # Query exceeded memory limit | ||
| QueryExecutionTimeMaximum, # Query took too long | ||
| ) as error: | ||
| error_type = type(error).__name__ | ||
| metrics.incr(f"{prefix}.retry", tags={"type": f"snuba-{error_type}"}, sample_rate=1) | ||
| logger.warning( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't trigger a Sentry event every time this happens since the retried task may succeed. |
||
| f"{prefix}.retry", extra={**extra, "error_type": error_type, "error": str(error)} | ||
| ) | ||
| raise RetryTask(f"Snuba error: {error_type}. We will retry this task.") from error | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the new feature in this PR. I believe the errors above can be retried (we have an upper bound of how many times we would try). |
||
|
|
||
| except Exception as error: | ||
| metrics.incr(f"{prefix}.error", tags={"type": type(error).__name__}, sample_rate=1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
| # Report to Sentry to investigate and abort the task | ||
| raise DeleteAborted( | ||
| f"Failed to delete events from nodestore. Aborting this task. {error}" | ||
| ) from error | ||
|
|
||
|
|
||
| def fetch_events_from_eventstore( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,5 @@ | ||
| from collections.abc import Sequence | ||
| from unittest.mock import MagicMock, patch | ||
| from uuid import uuid4 | ||
|
|
||
| import pytest | ||
|
|
@@ -6,11 +8,19 @@ | |
| delete_events_for_groups_from_nodestore_and_eventstore, | ||
| fetch_events_from_eventstore, | ||
| ) | ||
| from sentry.exceptions import DeleteAborted | ||
| from sentry.services.eventstore.models import Event | ||
| from sentry.snuba.dataset import Dataset | ||
| from sentry.snuba.referrer import Referrer | ||
| from sentry.taskworker.retry import RetryError | ||
| from sentry.testutils.cases import TestCase | ||
| from sentry.utils.snuba import UnqualifiedQueryError | ||
| from sentry.utils.snuba import ( | ||
| QueryExecutionTimeMaximum, | ||
| QueryMemoryLimitExceeded, | ||
| QueryTooManySimultaneous, | ||
| RateLimitExceeded, | ||
| UnqualifiedQueryError, | ||
| ) | ||
|
|
||
|
|
||
| class NodestoreDeletionTaskTest(TestCase): | ||
|
|
@@ -24,7 +34,9 @@ def create_n_events_with_group(self, n_events: int) -> list[Event]: | |
| events.append(event) | ||
| return events | ||
|
|
||
| def fetch_events_from_eventstore(self, group_ids: list[int], dataset: Dataset) -> list[Event]: | ||
| def fetch_events_from_eventstore( | ||
| self, group_ids: Sequence[int], dataset: Dataset | ||
| ) -> list[Event]: | ||
| return fetch_events_from_eventstore( | ||
| project_id=self.project.id, | ||
| group_ids=group_ids, | ||
|
|
@@ -62,21 +74,25 @@ def test_simple_deletion_with_events(self) -> None: | |
| events_after = self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events) | ||
| assert len(events_after) == 0 | ||
|
|
||
| def test_deletion_with_project_deleted(self) -> None: | ||
| """Test nodestore deletion when project is deleted.""" | ||
| @patch("sentry.deletions.tasks.nodestore.metrics") | ||
| def test_deletion_with_all_projects_deleted(self, mock_metrics: MagicMock) -> None: | ||
| """ | ||
| Test that when a project is deleted before the task runs, it increments | ||
| the correct metric and doesn't retry the task. | ||
|
|
||
| Root cause: When deleting groups, if the project gets deleted concurrently, | ||
| Snuba raises UnqualifiedQueryError with "All project_ids from the filter no longer exist". | ||
| This is not a transient error - retrying won't help since the project is gone. | ||
| The code tracks this scenario with a metric but allows the task to complete. | ||
| """ | ||
| events = self.create_n_events_with_group(n_events=5) | ||
| group_ids = [event.group_id for event in events if event.group_id is not None] | ||
|
|
||
| # Verify events exist in both eventstore and nodestore before deletion | ||
| events = self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events) | ||
| assert len(events) == 5 | ||
|
|
||
| # Deleting the project will cause Snuba to raise an error when fetching the event IDs. | ||
| # Delete the project to trigger the error | ||
| self.project.delete() | ||
|
|
||
| with self.tasks(): | ||
| # To delete events from the nodestore we fetch the event IDs from the eventstore (Snuba), | ||
| # however, when we delete the project, Snuba will raise an error. | ||
| # This will complete without raising DeleteAborted or RetryTask | ||
| delete_events_for_groups_from_nodestore_and_eventstore.apply_async( | ||
| kwargs={ | ||
| "organization_id": self.project.organization_id, | ||
|
|
@@ -89,5 +105,74 @@ def test_deletion_with_project_deleted(self) -> None: | |
| }, | ||
| ) | ||
|
|
||
| with pytest.raises(UnqualifiedQueryError): | ||
| self.fetch_events_from_eventstore(group_ids, dataset=Dataset.Events) | ||
| # Verify the metric was incremented with the correct tags | ||
| mock_metrics.incr.assert_any_call( | ||
| "deletions.nodestore.info", tags={"type": "all-projects-deleted"}, sample_rate=1 | ||
| ) | ||
|
|
||
| @patch("sentry.deletions.tasks.nodestore.metrics") | ||
| @patch("sentry.deletions.tasks.nodestore.fetch_events_from_eventstore") | ||
| def test_unqualified_query_error(self, mock_fetch: MagicMock, mock_metrics: MagicMock) -> None: | ||
| """Test that UnqualifiedQueryError errors are tracked with the correct metric.""" | ||
| events = self.create_n_events_with_group(n_events=2) | ||
| group_ids = [event.group_id for event in events if event.group_id is not None] | ||
| mock_fetch.side_effect = UnqualifiedQueryError( | ||
| "All project_ids from the filter no longer exist" | ||
| ) | ||
| with self.tasks(): | ||
| with pytest.raises(DeleteAborted): | ||
| delete_events_for_groups_from_nodestore_and_eventstore.apply_async( | ||
| kwargs={ | ||
| "organization_id": self.project.organization_id, | ||
| "project_id": self.project.id, | ||
| "group_ids": group_ids, | ||
| "times_seen": [1] * len(group_ids), | ||
| "transaction_id": uuid4().hex, | ||
| "dataset_str": Dataset.Events.value, | ||
| "referrer": "deletions.groups", | ||
| }, | ||
| ) | ||
|
|
||
| # Verify metric was incremented with correct tags | ||
| mock_metrics.incr.assert_any_call( | ||
| "deletions.nodestore.error", | ||
| tags={"type": "UnqualifiedQueryError"}, | ||
| sample_rate=1, | ||
| ) | ||
|
|
||
| @patch("sentry.deletions.tasks.nodestore.metrics") | ||
| @patch("sentry.deletions.tasks.nodestore.fetch_events_from_eventstore") | ||
| def test_snuba_errors_retry(self, mock_fetch: MagicMock, mock_metrics: MagicMock) -> None: | ||
| """Test that Snuba errors trigger a retry and are tracked with the correct metric.""" | ||
| events = self.create_n_events_with_group(n_events=2) | ||
| group_ids = [event.group_id for event in events if event.group_id is not None] | ||
|
|
||
| for snuba_error in [ | ||
| RateLimitExceeded("Rate limit exceeded"), | ||
| QueryTooManySimultaneous("Too many simultaneous queries"), | ||
| QueryMemoryLimitExceeded("Query exceeded memory limit"), | ||
| QueryExecutionTimeMaximum("Query took too long"), | ||
| UnqualifiedQueryError("All project_ids from the filter no longer exist"), | ||
| ]: | ||
| mock_fetch.side_effect = snuba_error | ||
| with self.tasks(): | ||
| with pytest.raises(RetryError): | ||
| delete_events_for_groups_from_nodestore_and_eventstore.apply_async( | ||
| kwargs={ | ||
| "organization_id": self.project.organization_id, | ||
| "project_id": self.project.id, | ||
| "group_ids": group_ids, | ||
| "times_seen": [1] * len(group_ids), | ||
| "transaction_id": uuid4().hex, | ||
| "dataset_str": Dataset.Events.value, | ||
| "referrer": "deletions.groups", | ||
| }, | ||
| ) | ||
|
|
||
| # Verify metric was incremented with correct tags | ||
| mock_metrics.incr.assert_any_call( | ||
| # We track that we're retrying the task if we get a Snuba error | ||
| "deletions.nodestore.retry", | ||
| tags={"type": f"snuba-{type(snuba_error).__name__}"}, | ||
| sample_rate=1, | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: Test Fails Due to Unexpected Error Handling. The |
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have to remember that deleting from the Nodestore is best-effort, since the events will eventually expire via their TTL anyway.
Alternatively, we could have the nodestore tasks delete the project themselves, rather than doing it in the spawning task.