From bb4875f8a59c43639faaa8ec3c095d1e918f954c Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Thu, 28 Nov 2024 16:03:07 -0300 Subject: [PATCH 1/7] feat: add service account checks in plugin auth (#5305) Related to https://github.com/grafana/oncall-private/issues/2826 Related to https://github.com/grafana/irm/pull/459 Allow org sync requests from service account users. Also trigger a sync during public API requests if the org wasn't yet setup. --- engine/apps/auth_token/auth.py | 21 ++++++- .../auth_token/grafana/grafana_auth_token.py | 8 +++ engine/apps/auth_token/tests/helpers.py | 6 +- .../auth_token/tests/test_grafana_auth.py | 59 ++++++++++++++++--- .../apps/auth_token/tests/test_plugin_auth.py | 32 +++++++++- engine/apps/grafana_plugin/helpers/client.py | 3 + .../public_api/tests/test_alert_groups.py | 2 +- .../public_api/tests/test_integrations.py | 2 +- .../public_api/tests/test_resolution_notes.py | 2 +- 9 files changed, 118 insertions(+), 17 deletions(-) diff --git a/engine/apps/auth_token/auth.py b/engine/apps/auth_token/auth.py index 3a7e25d6bd..af1fc2c6b4 100644 --- a/engine/apps/auth_token/auth.py +++ b/engine/apps/auth_token/auth.py @@ -9,6 +9,7 @@ from rest_framework.authentication import BaseAuthentication, get_authorization_header from rest_framework.request import Request +from apps.auth_token.grafana.grafana_auth_token import setup_organization from apps.grafana_plugin.helpers.gcom import check_token from apps.grafana_plugin.sync_data import SyncPermission, SyncUser from apps.user_management.exceptions import OrganizationDeletedException, OrganizationMovedException @@ -133,6 +134,14 @@ def _get_user(request: Request, organization: Organization) -> User: except KeyError: user_id = context["UserID"] + if context.get("IsServiceAccount", False): + # no user involved in service account requests + logger.info(f"serviceaccount request - id={user_id}") + service_account_role = context.get("Role", "None") + if service_account_role.lower() != "admin": + raise exceptions.AuthenticationFailed("Service account requests must have Admin or Editor role.") + return None + try: return organization.users.get(user_id=user_id) except User.DoesNotExist: @@ -148,6 +157,9 @@ def _get_user(request: Request, organization: Organization) -> User: except (ValueError, TypeError): raise exceptions.AuthenticationFailed("Grafana context must be JSON dict.") + if context.get("IsServiceAccount", False): + raise exceptions.AuthenticationFailed("Service accounts requests are not allowed.") + try: user_id = context.get("UserId", context.get("UserID")) if user_id is not None: @@ -347,7 +359,7 @@ def authenticate(self, request): if not auth.startswith(ServiceAccountToken.GRAFANA_SA_PREFIX): return None - organization = self.get_organization(request) + organization = self.get_organization(request, auth) if not organization: raise exceptions.AuthenticationFailed("Invalid organization.") if organization.is_moved: @@ -357,12 +369,15 @@ def authenticate(self, request): return self.authenticate_credentials(organization, auth) - def get_organization(self, request): + def get_organization(self, request, auth): grafana_url = request.headers.get(X_GRAFANA_URL) if grafana_url: organization = Organization.objects.filter(grafana_url=grafana_url).first() if not organization: - raise exceptions.AuthenticationFailed("Invalid Grafana URL.") + success = setup_organization(grafana_url, auth) + if not success: + raise exceptions.AuthenticationFailed("Invalid Grafana URL.") + organization = 
Organization.objects.filter(grafana_url=grafana_url).first() return organization if settings.LICENSE == settings.CLOUD_LICENSE_NAME: diff --git a/engine/apps/auth_token/grafana/grafana_auth_token.py b/engine/apps/auth_token/grafana/grafana_auth_token.py index 6576e41793..6183a9da5f 100644 --- a/engine/apps/auth_token/grafana/grafana_auth_token.py +++ b/engine/apps/auth_token/grafana/grafana_auth_token.py @@ -52,3 +52,11 @@ def get_service_account_details(organization: Organization, token: str) -> typin grafana_api_client = GrafanaAPIClient(api_url=organization.grafana_url, api_token=token) user_data, _ = grafana_api_client.get_current_user() return user_data + + +def setup_organization(grafana_url: str, token: str): + grafana_api_client = GrafanaAPIClient(api_url=grafana_url, api_token=token) + _, call_status = grafana_api_client.setup_organization() + if call_status["status_code"] != 200: + return False + return True diff --git a/engine/apps/auth_token/tests/helpers.py b/engine/apps/auth_token/tests/helpers.py index bcecce6f2c..ebbff6af5b 100644 --- a/engine/apps/auth_token/tests/helpers.py +++ b/engine/apps/auth_token/tests/helpers.py @@ -3,16 +3,16 @@ import httpretty -def setup_service_account_api_mocks(organization, perms=None, user_data=None, perms_status=200, user_status=200): +def setup_service_account_api_mocks(grafana_url, perms=None, user_data=None, perms_status=200, user_status=200): # requires enabling httpretty if perms is None: perms = {} mock_response = httpretty.Response(status=perms_status, body=json.dumps(perms)) - perms_url = f"{organization.grafana_url}/api/access-control/user/permissions" + perms_url = f"{grafana_url}/api/access-control/user/permissions" httpretty.register_uri(httpretty.GET, perms_url, responses=[mock_response]) if user_data is None: user_data = {"login": "some-login", "uid": "service-account:42"} mock_response = httpretty.Response(status=user_status, body=json.dumps(user_data)) - user_url = f"{organization.grafana_url}/api/user" + user_url = f"{grafana_url}/api/user" httpretty.register_uri(httpretty.GET, user_url, responses=[mock_response]) diff --git a/engine/apps/auth_token/tests/test_grafana_auth.py b/engine/apps/auth_token/tests/test_grafana_auth.py index 3a8ec56c0d..3cb01727f5 100644 --- a/engine/apps/auth_token/tests/test_grafana_auth.py +++ b/engine/apps/auth_token/tests/test_grafana_auth.py @@ -10,7 +10,8 @@ from apps.auth_token.auth import X_GRAFANA_INSTANCE_ID, GrafanaServiceAccountAuthentication from apps.auth_token.models import ServiceAccountToken from apps.auth_token.tests.helpers import setup_service_account_api_mocks -from apps.user_management.models import ServiceAccountUser +from apps.user_management.models import Organization, ServiceAccountUser +from common.constants.plugin_ids import PluginID from settings.base import CLOUD_LICENSE_NAME, OPEN_SOURCE_LICENSE_NAME, SELF_HOSTED_SETTINGS @@ -98,13 +99,17 @@ def test_grafana_authentication_missing_org(): @pytest.mark.django_db @httpretty.activate(verbose=True, allow_net_connect=False) def test_grafana_authentication_invalid_grafana_url(): + grafana_url = "http://grafana.test" token = f"{ServiceAccountToken.GRAFANA_SA_PREFIX}xyz" headers = { "HTTP_AUTHORIZATION": token, - "HTTP_X_GRAFANA_URL": "http://grafana.test", # no org for this URL + "HTTP_X_GRAFANA_URL": grafana_url, # no org for this URL } request = APIRequestFactory().get("/", **headers) + request_sync_url = f"{grafana_url}/api/plugins/{PluginID.ONCALL}/resources/plugin/sync?wait=true&force=true" + 
httpretty.register_uri(httpretty.POST, request_sync_url, status=404) + with pytest.raises(exceptions.AuthenticationFailed) as exc: GrafanaServiceAccountAuthentication().authenticate(request) assert exc.value.detail == "Invalid Grafana URL." @@ -145,7 +150,7 @@ def test_grafana_authentication_permissions_call_fails(make_organization): # setup Grafana API responses # permissions endpoint returns a 401 - setup_service_account_api_mocks(organization, perms_status=401) + setup_service_account_api_mocks(organization.grafana_url, perms_status=401) with pytest.raises(exceptions.AuthenticationFailed) as exc: GrafanaServiceAccountAuthentication().authenticate(request) @@ -178,7 +183,7 @@ def test_grafana_authentication_existing_token( request = APIRequestFactory().get("/", **headers) # setup Grafana API responses - setup_service_account_api_mocks(organization, {"some-perm": "value"}) + setup_service_account_api_mocks(organization.grafana_url, {"some-perm": "value"}) user, auth_token = GrafanaServiceAccountAuthentication().authenticate(request) @@ -214,7 +219,7 @@ def test_grafana_authentication_token_created(make_organization): # setup Grafana API responses permissions = {"some-perm": "value"} user_data = {"login": "some-login", "uid": "service-account:42"} - setup_service_account_api_mocks(organization, permissions, user_data) + setup_service_account_api_mocks(organization.grafana_url, permissions, user_data) user, auth_token = GrafanaServiceAccountAuthentication().authenticate(request) @@ -256,7 +261,7 @@ def test_grafana_authentication_token_created_older_grafana(make_organization): # setup Grafana API responses permissions = {"some-perm": "value"} # User API fails for older Grafana versions - setup_service_account_api_mocks(organization, permissions, user_status=400) + setup_service_account_api_mocks(organization.grafana_url, permissions, user_status=400) user, auth_token = GrafanaServiceAccountAuthentication().authenticate(request) @@ -290,10 +295,50 @@ def test_grafana_authentication_token_reuse_service_account(make_organization, m "login": service_account.login, "uid": f"service-account:{service_account.grafana_id}", } - setup_service_account_api_mocks(organization, permissions, user_data) + setup_service_account_api_mocks(organization.grafana_url, permissions, user_data) user, auth_token = GrafanaServiceAccountAuthentication().authenticate(request) assert isinstance(user, ServiceAccountUser) assert user.service_account == service_account assert auth_token.service_account == service_account + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_grafana_authentication_token_setup_org_if_missing(make_organization): + grafana_url = "http://grafana.test" + token_string = "glsa_the-token" + + headers = { + "HTTP_AUTHORIZATION": token_string, + "HTTP_X_GRAFANA_URL": grafana_url, + } + request = APIRequestFactory().get("/", **headers) + + # setup Grafana API responses + permissions = {"some-perm": "value"} + setup_service_account_api_mocks(grafana_url, permissions) + + request_sync_url = f"{grafana_url}/api/plugins/{PluginID.ONCALL}/resources/plugin/sync?wait=true&force=true" + httpretty.register_uri(httpretty.POST, request_sync_url) + + assert Organization.objects.filter(grafana_url=grafana_url).count() == 0 + + def sync_org(): + make_organization(grafana_url=grafana_url, is_rbac_permissions_enabled=True) + return (True, {"status_code": 200}) + + with patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.setup_organization") as mock_setup_org: + 
mock_setup_org.side_effect = sync_org + user, auth_token = GrafanaServiceAccountAuthentication().authenticate(request) + + mock_setup_org.assert_called_once() + + assert isinstance(user, ServiceAccountUser) + service_account = user.service_account + # organization is created + organization = Organization.objects.filter(grafana_url=grafana_url).get() + assert organization.grafana_url == grafana_url + assert service_account.organization == organization + assert auth_token.service_account == user.service_account diff --git a/engine/apps/auth_token/tests/test_plugin_auth.py b/engine/apps/auth_token/tests/test_plugin_auth.py index d03c40b388..a44bd3777d 100644 --- a/engine/apps/auth_token/tests/test_plugin_auth.py +++ b/engine/apps/auth_token/tests/test_plugin_auth.py @@ -5,7 +5,7 @@ from rest_framework.exceptions import AuthenticationFailed from rest_framework.test import APIRequestFactory -from apps.auth_token.auth import PluginAuthentication +from apps.auth_token.auth import BasePluginAuthentication, PluginAuthentication INSTANCE_CONTEXT = '{"stack_id": 42, "org_id": 24, "grafana_token": "abc"}' @@ -171,3 +171,33 @@ def test_plugin_authentication_self_hosted_setup_new_user(make_organization, mak assert ret_user.user_id == 12 assert ret_token.organization == organization assert organization.users.count() == 1 + + +@pytest.mark.django_db +@pytest.mark.parametrize( + "role,expected_raises", [("Admin", False), ("Editor", True), ("Viewer", True), ("Other", True)] +) +def test_plugin_authentication_service_account(make_organization, role, expected_raises): + # Setting gcom_token_org_last_time_synced to now, so it doesn't try to sync with gcom + organization = make_organization( + stack_id=42, org_id=24, gcom_token="123", api_token="abc", gcom_token_org_last_time_synced=timezone.now() + ) + + headers = { + "HTTP_AUTHORIZATION": "gcom:123", + "HTTP_X-Instance-Context": INSTANCE_CONTEXT, + "HTTP_X-Grafana-Context": json.dumps({"UserId": 12, "Role": role, "IsServiceAccount": True}), + } + request = APIRequestFactory().get("/", **headers) + + if expected_raises: + with pytest.raises(AuthenticationFailed): + BasePluginAuthentication().authenticate(request) + else: + ret_user, ret_token = BasePluginAuthentication().authenticate(request) + assert ret_user is None + assert ret_token.organization == organization + + # PluginAuthentication should always raise an exception if the request comes from a service account + with pytest.raises(AuthenticationFailed): + PluginAuthentication().authenticate(request) diff --git a/engine/apps/grafana_plugin/helpers/client.py b/engine/apps/grafana_plugin/helpers/client.py index 17d1cabd20..0037cb7410 100644 --- a/engine/apps/grafana_plugin/helpers/client.py +++ b/engine/apps/grafana_plugin/helpers/client.py @@ -337,6 +337,9 @@ def create_service_account_token( def get_service_account_token_permissions(self) -> APIClientResponse[typing.Dict[str, typing.List[str]]]: return self.api_get("api/access-control/user/permissions") + def setup_organization(self) -> APIClientResponse: + return self.api_post(f"api/plugins/{PluginID.ONCALL}/resources/plugin/sync?wait=true&force=true") + def sync(self, organization: "Organization") -> APIClientResponse: return self.api_post(f"api/plugins/{organization.active_ui_plugin_id}/resources/plugin/sync") diff --git a/engine/apps/public_api/tests/test_alert_groups.py b/engine/apps/public_api/tests/test_alert_groups.py index acc6b823e9..2b79f478ec 100644 --- a/engine/apps/public_api/tests/test_alert_groups.py +++ 
b/engine/apps/public_api/tests/test_alert_groups.py @@ -756,7 +756,7 @@ def test_actions_disabled_for_service_accounts( perms = { permissions.RBACPermission.Permissions.ALERT_GROUPS_WRITE.value: ["*"], } - setup_service_account_api_mocks(organization, perms=perms) + setup_service_account_api_mocks(organization.grafana_url, perms=perms) client = APIClient() disabled_actions = ["acknowledge", "unacknowledge", "resolve", "unresolve", "silence", "unsilence"] diff --git a/engine/apps/public_api/tests/test_integrations.py b/engine/apps/public_api/tests/test_integrations.py index 9a4e29c64f..9ac249fd1b 100644 --- a/engine/apps/public_api/tests/test_integrations.py +++ b/engine/apps/public_api/tests/test_integrations.py @@ -124,7 +124,7 @@ def test_create_integration_via_service_account( perms = { permissions.RBACPermission.Permissions.INTEGRATIONS_WRITE.value: ["*"], } - setup_service_account_api_mocks(organization, perms) + setup_service_account_api_mocks(organization.grafana_url, perms) client = APIClient() data_for_create = { diff --git a/engine/apps/public_api/tests/test_resolution_notes.py b/engine/apps/public_api/tests/test_resolution_notes.py index a98abac53e..63eaa64501 100644 --- a/engine/apps/public_api/tests/test_resolution_notes.py +++ b/engine/apps/public_api/tests/test_resolution_notes.py @@ -168,7 +168,7 @@ def test_create_resolution_note_via_service_account( perms = { permissions.RBACPermission.Permissions.ALERT_GROUPS_WRITE.value: ["*"], } - setup_service_account_api_mocks(organization, perms) + setup_service_account_api_mocks(organization.grafana_url, perms) alert_receive_channel = make_alert_receive_channel(organization) alert_group = make_alert_group(alert_receive_channel) From 417f9787de920da4398aaf0a5dd5ea92c491c9a8 Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Thu, 28 Nov 2024 18:11:00 -0300 Subject: [PATCH 2/7] fix: mobile app template preview use dynamic value (#5311) Related to https://github.com/grafana/support-escalations/issues/13689 --- .../api/tests/test_alert_receive_channel.py | 50 +++++++++++++++++++ engine/apps/mobile_app/alert_rendering.py | 2 +- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/engine/apps/api/tests/test_alert_receive_channel.py b/engine/apps/api/tests/test_alert_receive_channel.py index ac040cde1c..d1314eb3a0 100644 --- a/engine/apps/api/tests/test_alert_receive_channel.py +++ b/engine/apps/api/tests/test_alert_receive_channel.py @@ -9,6 +9,7 @@ from apps.alerts.models import AlertReceiveChannel, EscalationPolicy from apps.api.permissions import LegacyAccessControlRole +from apps.base.messaging import load_backend from apps.labels.models import LabelKeyCache, LabelValueCache from common.exceptions import BacksyncIntegrationRequestError @@ -842,6 +843,55 @@ def test_alert_receive_channel_preview_template_dynamic_payload( assert response.data["preview"] == data["payload"]["foo"] +@pytest.mark.django_db +@pytest.mark.parametrize("template_name", ["title", "message"]) +@pytest.mark.parametrize("backend_path", ["apps.mobile_app.backend.MobileAppBackend"]) +def test_alert_receive_channel_preview_template_dynamic_payload_custom_backends( + make_organization_and_user_with_plugin_token, + make_user_auth_headers, + make_alert_receive_channel, + template_name, + backend_path, + make_alert_group, + make_alert, +): + organization, user, token = make_organization_and_user_with_plugin_token() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + 
make_alert(alert_group=alert_group, raw_request_data=alert_receive_channel.config.example_payload) + + client = APIClient() + url = reverse( + "api-internal:alert_receive_channel-preview-template", kwargs={"pk": alert_receive_channel.public_primary_key} + ) + + # load backend + backend = load_backend(backend_path, notification_channel_id=111) + notification_channel = backend.backend_id.lower() + + data = { + "template_body": "{{ payload.foo }}", + "template_name": f"{notification_channel}_{template_name}", + "payload": {"foo": "bar" if template_name != "image_url" else "http://example.com/image.jpg"}, + } + + with patch( + "apps.alerts.incident_appearance.templaters.alert_templater.get_messaging_backend_from_id" + ) as mock_get_backend: + mock_get_backend.return_value = backend + from common.api_helpers import mixins + + with patch.object(mixins, "NOTIFICATION_CHANNEL_OPTIONS", new=(notification_channel,)): + with patch.dict( + mixins.NOTIFICATION_CHANNEL_TO_TEMPLATER_MAP, {notification_channel: backend.get_templater_class()} + ): + response = client.post(url, data=data, format="json", **make_user_auth_headers(user, token)) + + assert response.status_code == status.HTTP_200_OK + assert response.data["preview"] == data["payload"]["foo"] + + @pytest.mark.django_db @pytest.mark.parametrize( "role,expected_status", diff --git a/engine/apps/mobile_app/alert_rendering.py b/engine/apps/mobile_app/alert_rendering.py index 3c67d8d342..3a05e18fed 100644 --- a/engine/apps/mobile_app/alert_rendering.py +++ b/engine/apps/mobile_app/alert_rendering.py @@ -24,7 +24,7 @@ def _validate_fcm_length_limit(value: typing.Optional[str]) -> str: class AlertMobileAppTemplater(AlertTemplater): def _render_for(self): - return "MOBILE_APP" + return "mobile_app" def _postformat(self, templated_alert: TemplatedAlert) -> TemplatedAlert: templated_alert.title = _validate_fcm_length_limit(templated_alert.title) From 5227ee37985607c43e1a8b8cf5d59748a17488cd Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Fri, 29 Nov 2024 08:21:29 -0500 Subject: [PATCH 3/7] chore: random slack code cleanup (#5307) # What this PR does Related to https://github.com/grafana/oncall/pull/5287 Few random "clean-ups", type improvements, etc. Additionally, fixes a change made in #5292; we should wait to read from `slack_message.channel.slack_id`, until we've performed the data-migration mentioned in that PR (in the mean-time we should continue to use `slack_message._channel_id`). ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes. 
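
The interim pattern described above (keep reading `slack_message._channel_id` until the #5292 data migration has backfilled `channel`) recurs throughout the diffs below. The snippet that follows is a minimal, hypothetical sketch of that call shape; the helper name is invented for illustration and is not part of this PR:

```
def post_to_alert_group_thread(slack_client, slack_message, text):
    """Illustrative only; mirrors the TODO comments added in this change."""
    return slack_client.chat_postMessage(
        # TODO: switch to `slack_message.channel.slack_id` once `_channel_id`
        # has been fully migrated to the `channel` foreign key.
        channel=slack_message._channel_id,
        thread_ts=slack_message.slack_id,
        text=text,
    )
```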
--- engine/apps/alerts/models/alert_group.py | 9 +- .../alerts/models/alert_receive_channel.py | 30 +-- .../apps/alerts/tasks/compare_escalations.py | 4 - .../apps/alerts/tasks/escalate_alert_group.py | 3 +- engine/apps/alerts/tasks/notify_user.py | 3 +- engine/apps/alerts/tasks/unsilence.py | 6 +- engine/apps/alerts/tests/test_alert_group.py | 4 +- engine/apps/alerts/tests/test_maintenance.py | 14 +- engine/apps/alerts/tests/test_notify_user.py | 94 +++++--- engine/apps/alerts/tests/test_silence.py | 28 +-- engine/apps/api/tests/conftest.py | 11 +- .../api/tests/test_alert_receive_channel.py | 15 +- .../apps/slack/alert_group_slack_service.py | 10 +- engine/apps/slack/models/slack_message.py | 18 +- .../apps/slack/models/slack_user_identity.py | 5 +- .../alert_group_representative.py | 6 +- .../slack/scenarios/alertgroup_appearance.py | 5 +- .../apps/slack/scenarios/distribute_alerts.py | 222 ++++++++++++------ .../slack/scenarios/notification_delivery.py | 6 +- .../apps/slack/scenarios/resolution_note.py | 31 +-- .../slack/scenarios/shift_swap_requests.py | 5 +- engine/apps/slack/tasks.py | 3 +- .../scenario_steps/test_distribute_alerts.py | 16 +- .../scenario_steps/test_resolution_note.py | 8 +- .../test_slack_channel_integration.py | 7 +- engine/conftest.py | 9 - engine/requirements.in | 1 - engine/requirements.txt | 3 - 28 files changed, 315 insertions(+), 261 deletions(-) delete mode 100644 engine/apps/alerts/tasks/compare_escalations.py diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 54a5b5e204..442ab61e68 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -674,7 +674,7 @@ def acknowledge_by_user_or_backsync( organization_id = user.organization_id if user else self.channel.organization_id logger.debug(f"Started acknowledge_by_user_or_backsync for alert_group {self.pk}") - # if incident was silenced or resolved, unsilence/unresolve it without starting escalation + # if alert group was silenced or resolved, unsilence/unresolve it without starting escalation if self.silenced: self.un_silence() self.log_records.create( @@ -1990,6 +1990,13 @@ def slack_channel_id(self) -> str | None: @property def slack_message(self) -> typing.Optional["SlackMessage"]: + """ + `slack_message` property returns the first `SlackMessage` for the `AlertGroup`. This corresponds to the + Slack message representing the main message in Slack (ie. not a message in a thread). + + This should not be confused with `slack_messages`, which is a `RelatedManager` that returns all `SlackMessage` + instances for the `AlertGroup`. 
+ """ try: # prefetched_slack_messages could be set in apps.api.serializers.alert_group.AlertGroupListSerializer return self.prefetched_slack_messages[0] if self.prefetched_slack_messages else None diff --git a/engine/apps/alerts/models/alert_receive_channel.py b/engine/apps/alerts/models/alert_receive_channel.py index 7a351d2aad..91a7db6a12 100644 --- a/engine/apps/alerts/models/alert_receive_channel.py +++ b/engine/apps/alerts/models/alert_receive_channel.py @@ -43,7 +43,7 @@ from apps.alerts.models import AlertGroup, ChannelFilter from apps.labels.models import AlertReceiveChannelAssociatedLabel - from apps.user_management.models import Organization, Team + from apps.user_management.models import Organization, Team, User logger = logging.getLogger(__name__) @@ -391,7 +391,7 @@ def save(self, *args, **kwargs): return super().save(*args, **kwargs) - def change_team(self, team_id, user): + def change_team(self, team_id: int, user: "User") -> None: if team_id == self.team_id: raise TeamCanNotBeChangedError("Integration is already in this team") @@ -409,26 +409,26 @@ def grafana_alerting_sync_manager(self): return GrafanaAlertingSyncManager(self) @property - def is_alerting_integration(self): + def is_alerting_integration(self) -> bool: return self.integration in { AlertReceiveChannel.INTEGRATION_GRAFANA_ALERTING, AlertReceiveChannel.INTEGRATION_LEGACY_GRAFANA_ALERTING, } @cached_property - def team_name(self): + def team_name(self) -> str: return self.team.name if self.team else "No team" @cached_property - def team_id_or_no_team(self): + def team_id_or_no_team(self) -> str: return self.team_id if self.team else "no_team" @cached_property - def emojized_verbal_name(self): + def emojized_verbal_name(self) -> str: return emoji.emojize(self.verbal_name, language="alias") @property - def new_incidents_web_link(self): + def new_incidents_web_link(self) -> str: from apps.alerts.models import AlertGroup return UIURLBuilder(self.organization).alert_groups( @@ -436,7 +436,7 @@ def new_incidents_web_link(self): ) @property - def is_rate_limited_in_slack(self): + def is_rate_limited_in_slack(self) -> bool: return ( self.rate_limited_in_slack_at is not None and self.rate_limited_in_slack_at + SLACK_RATE_LIMIT_TIMEOUT > timezone.now() @@ -450,11 +450,11 @@ def start_send_rate_limit_message_task(self, delay=SLACK_RATE_LIMIT_DELAY): post_slack_rate_limit_message.apply_async((self.pk,), countdown=delay, task_id=task_id) @property - def alert_groups_count(self): + def alert_groups_count(self) -> int: return self.alert_groups.count() @property - def alerts_count(self): + def alerts_count(self) -> int: from apps.alerts.models import Alert return Alert.objects.filter(group__channel=self).count() @@ -464,7 +464,7 @@ def is_able_to_autoresolve(self) -> bool: return self.config.is_able_to_autoresolve @property - def is_demo_alert_enabled(self): + def is_demo_alert_enabled(self) -> bool: return self.config.is_demo_alert_enabled @property @@ -513,7 +513,7 @@ def get_or_create_manual_integration(cls, defaults, **kwargs): return alert_receive_channel @property - def short_name(self): + def short_name(self) -> str: if self.verbal_name is None: return self.created_name + "" if self.deleted_at is None else "(Deleted)" elif self.verbal_name == self.created_name: @@ -548,14 +548,14 @@ def integration_url(self) -> str | None: return create_engine_url(f"integrations/v1/{slug}/{self.token}/") @property - def inbound_email(self): + def inbound_email(self) -> typing.Optional[str]: if self.integration != 
AlertReceiveChannel.INTEGRATION_INBOUND_EMAIL: return None return f"{self.token}@{live_settings.INBOUND_EMAIL_DOMAIN}" @property - def default_channel_filter(self): + def default_channel_filter(self) -> typing.Optional["ChannelFilter"]: return self.channel_filters.filter(is_default=True).first() # Templating @@ -590,7 +590,7 @@ def templates(self): } @property - def is_available_for_custom_templates(self): + def is_available_for_custom_templates(self) -> bool: return True # Maintenance diff --git a/engine/apps/alerts/tasks/compare_escalations.py b/engine/apps/alerts/tasks/compare_escalations.py deleted file mode 100644 index 6187fa6565..0000000000 --- a/engine/apps/alerts/tasks/compare_escalations.py +++ /dev/null @@ -1,4 +0,0 @@ -def compare_escalations(request_id, active_escalation_id): - if request_id == active_escalation_id: - return True - return False diff --git a/engine/apps/alerts/tasks/escalate_alert_group.py b/engine/apps/alerts/tasks/escalate_alert_group.py index 2a98cc17e2..e47e0ca81b 100644 --- a/engine/apps/alerts/tasks/escalate_alert_group.py +++ b/engine/apps/alerts/tasks/escalate_alert_group.py @@ -6,7 +6,6 @@ from common.custom_celery_tasks import shared_dedicated_queue_retry_task -from .compare_escalations import compare_escalations from .task_logger import task_logger @@ -29,7 +28,7 @@ def escalate_alert_group(alert_group_pk): except IndexError: return f"Alert group with pk {alert_group_pk} doesn't exist" - if not compare_escalations(escalate_alert_group.request.id, alert_group.active_escalation_id): + if escalate_alert_group.request.id != alert_group.active_escalation_id: return "Active escalation ID mismatch. Duplication or non-active escalation triggered. Active: {}".format( alert_group.active_escalation_id ) diff --git a/engine/apps/alerts/tasks/notify_user.py b/engine/apps/alerts/tasks/notify_user.py index 97e3787268..9e7e9e548c 100644 --- a/engine/apps/alerts/tasks/notify_user.py +++ b/engine/apps/alerts/tasks/notify_user.py @@ -17,7 +17,6 @@ from apps.phone_notifications.phone_backend import PhoneBackend from common.custom_celery_tasks import shared_dedicated_queue_retry_task -from .compare_escalations import compare_escalations from .task_logger import task_logger if typing.TYPE_CHECKING: @@ -618,7 +617,7 @@ def send_bundled_notification(user_notification_bundle_id: int): ) return - if not compare_escalations(send_bundled_notification.request.id, user_notification_bundle.notification_task_id): + if send_bundled_notification.request.id != user_notification_bundle.notification_task_id: task_logger.info( f"send_bundled_notification: notification_task_id mismatch. " f"Duplication or non-active notification triggered. " diff --git a/engine/apps/alerts/tasks/unsilence.py b/engine/apps/alerts/tasks/unsilence.py index e970f91441..145b8e5a6d 100644 --- a/engine/apps/alerts/tasks/unsilence.py +++ b/engine/apps/alerts/tasks/unsilence.py @@ -5,7 +5,6 @@ from common.custom_celery_tasks import shared_dedicated_queue_retry_task -from .compare_escalations import compare_escalations from .send_alert_group_signal import send_alert_group_signal from .task_logger import task_logger @@ -17,17 +16,20 @@ def unsilence_task(alert_group_pk): from apps.alerts.models import AlertGroup, AlertGroupLogRecord task_logger.info(f"Start unsilence_task for alert_group {alert_group_pk}") + with transaction.atomic(): try: alert_group = AlertGroup.objects.filter(pk=alert_group_pk).select_for_update()[0] # Lock alert_group: except IndexError: task_logger.info(f"unsilence_task. 
alert_group {alert_group_pk} doesn't exist") return - if not compare_escalations(unsilence_task.request.id, alert_group.unsilence_task_uuid): + + if unsilence_task.request.id != alert_group.unsilence_task_uuid: task_logger.info( f"unsilence_task. alert_group {alert_group.pk}.ID mismatch.Active: {alert_group.unsilence_task_uuid}" ) return + if alert_group.status == AlertGroup.SILENCED and alert_group.is_root_alert_group: initial_state = alert_group.state task_logger.info(f"unsilence alert_group {alert_group_pk} and start escalation if needed") diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index 5bcab73b88..e837516f86 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -152,11 +152,11 @@ def test_delete( # Check that appropriate Slack API calls are made assert mock_chat_delete.call_count == 2 assert mock_chat_delete.call_args_list[0] == call( - channel=resolution_note_1.slack_channel_id, ts=resolution_note_1.ts + channel=resolution_note_1.slack_channel.slack_id, ts=resolution_note_1.ts ) assert mock_chat_delete.call_args_list[1] == call(channel=slack_message.channel.slack_id, ts=slack_message.slack_id) mock_reactions_remove.assert_called_once_with( - channel=resolution_note_2.slack_channel_id, name="memo", timestamp=resolution_note_2.ts + channel=resolution_note_2.slack_channel.slack_id, name="memo", timestamp=resolution_note_2.ts ) diff --git a/engine/apps/alerts/tests/test_maintenance.py b/engine/apps/alerts/tests/test_maintenance.py index eed1993b06..38b0f8b15f 100644 --- a/engine/apps/alerts/tests/test_maintenance.py +++ b/engine/apps/alerts/tests/test_maintenance.py @@ -16,9 +16,7 @@ def maintenance_test_setup( @pytest.mark.django_db -def test_start_maintenance_integration( - maintenance_test_setup, make_alert_receive_channel, mock_start_disable_maintenance_task -): +def test_start_maintenance_integration(maintenance_test_setup, make_alert_receive_channel): organization, user = maintenance_test_setup alert_receive_channel = make_alert_receive_channel( @@ -37,9 +35,7 @@ def test_start_maintenance_integration( @pytest.mark.django_db -def test_start_maintenance_integration_multiple_previous_instances( - maintenance_test_setup, make_alert_receive_channel, mock_start_disable_maintenance_task -): +def test_start_maintenance_integration_multiple_previous_instances(maintenance_test_setup, make_alert_receive_channel): organization, user = maintenance_test_setup alert_receive_channel = make_alert_receive_channel( @@ -64,9 +60,7 @@ def test_start_maintenance_integration_multiple_previous_instances( @pytest.mark.django_db -def test_maintenance_integration_will_not_start_twice( - maintenance_test_setup, make_alert_receive_channel, mock_start_disable_maintenance_task -): +def test_maintenance_integration_will_not_start_twice(maintenance_test_setup, make_alert_receive_channel): organization, user = maintenance_test_setup alert_receive_channel = make_alert_receive_channel( @@ -91,7 +85,6 @@ def test_alert_attached_to_maintenance_incident_integration( maintenance_test_setup, make_alert_receive_channel, make_alert_with_custom_create_method, - mock_start_disable_maintenance_task, ): organization, user = maintenance_test_setup @@ -122,7 +115,6 @@ def test_stop_maintenance( maintenance_test_setup, make_alert_receive_channel, make_alert_with_custom_create_method, - mock_start_disable_maintenance_task, ): organization, user = maintenance_test_setup alert_receive_channel = 
make_alert_receive_channel( diff --git a/engine/apps/alerts/tests/test_notify_user.py b/engine/apps/alerts/tests/test_notify_user.py index d40ac5e812..4a4b92a309 100644 --- a/engine/apps/alerts/tests/test_notify_user.py +++ b/engine/apps/alerts/tests/test_notify_user.py @@ -530,47 +530,73 @@ def test_send_bundle_notification( alert_group_1 = make_alert_group(alert_receive_channel=alert_receive_channel) alert_group_2 = make_alert_group(alert_receive_channel=alert_receive_channel) alert_group_3 = make_alert_group(alert_receive_channel=alert_receive_channel) + + task_id = "test_task_id" notification_bundle = make_user_notification_bundle( - user, UserNotificationPolicy.NotificationChannel.SMS, notification_task_id="test_task_id", eta=timezone.now() + user, UserNotificationPolicy.NotificationChannel.SMS, notification_task_id=task_id, eta=timezone.now() ) + notification_bundle.append_notification(alert_group_1, notification_policy) notification_bundle.append_notification(alert_group_2, notification_policy) notification_bundle.append_notification(alert_group_3, notification_policy) + assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 3 + alert_group_3.resolve() - with patch("apps.alerts.tasks.notify_user.compare_escalations", return_value=True): - # send notification for 2 active alert groups - send_bundled_notification(notification_bundle.id) - assert f"alert_group {alert_group_3.id} is not active, skip notification" in caplog.text - assert "perform bundled notification for alert groups with ids:" in caplog.text - # check bundle_uuid was set, notification for resolved alert group was deleted - assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0 - assert notification_bundle.notifications.all().count() == 2 - assert not notification_bundle.notifications.filter(alert_group=alert_group_3).exists() - - # send notification for 1 active alert group - notification_bundle.notifications.update(bundle_uuid=None) - alert_group_2.resolve() - send_bundled_notification(notification_bundle.id) - assert f"alert_group {alert_group_2.id} is not active, skip notification" in caplog.text - assert ( - f"there is only one alert group in bundled notification, perform regular notification. 
" - f"alert_group {alert_group_1.id}" - ) in caplog.text - # check bundle_uuid was set - assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0 - assert notification_bundle.notifications.all().count() == 1 - # cleanup notifications - notification_bundle.notifications.all().delete() - - # send notification for 0 active alert group - notification_bundle.append_notification(alert_group_1, notification_policy) - alert_group_1.resolve() - send_bundled_notification(notification_bundle.id) - assert f"alert_group {alert_group_1.id} is not active, skip notification" in caplog.text - assert f"no alert groups to notify about or notification is not allowed for user {user.id}" in caplog.text - # check all notifications were deleted - assert notification_bundle.notifications.all().count() == 0 + + # send notification for 2 active alert groups + send_bundled_notification.apply((notification_bundle.id,), task_id=task_id) + + assert f"alert_group {alert_group_3.id} is not active, skip notification" in caplog.text + assert "perform bundled notification for alert groups with ids:" in caplog.text + + # check bundle_uuid was set, notification for resolved alert group was deleted + assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0 + assert notification_bundle.notifications.all().count() == 2 + assert not notification_bundle.notifications.filter(alert_group=alert_group_3).exists() + + # send notification for 1 active alert group + notification_bundle.notifications.update(bundle_uuid=None) + + # since we're calling send_bundled_notification several times within this test, we need to reset task_id + # because it gets set to None after the first call + notification_bundle.notification_task_id = task_id + notification_bundle.save() + + alert_group_2.resolve() + + send_bundled_notification.apply((notification_bundle.id,), task_id=task_id) + + assert f"alert_group {alert_group_2.id} is not active, skip notification" in caplog.text + assert ( + f"there is only one alert group in bundled notification, perform regular notification. 
" + f"alert_group {alert_group_1.id}" + ) in caplog.text + + # check bundle_uuid was set + assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0 + assert notification_bundle.notifications.all().count() == 1 + + # cleanup notifications + notification_bundle.notifications.all().delete() + + # send notification for 0 active alert group + notification_bundle.append_notification(alert_group_1, notification_policy) + + # since we're calling send_bundled_notification several times within this test, we need to reset task_id + # because it gets set to None after the first call + notification_bundle.notification_task_id = task_id + notification_bundle.save() + + alert_group_1.resolve() + + send_bundled_notification.apply((notification_bundle.id,), task_id=task_id) + + assert f"alert_group {alert_group_1.id} is not active, skip notification" in caplog.text + assert f"no alert groups to notify about or notification is not allowed for user {user.id}" in caplog.text + + # check all notifications were deleted + assert notification_bundle.notifications.all().count() == 0 @pytest.mark.django_db diff --git a/engine/apps/alerts/tests/test_silence.py b/engine/apps/alerts/tests/test_silence.py index d72d860212..07dcd6b238 100644 --- a/engine/apps/alerts/tests/test_silence.py +++ b/engine/apps/alerts/tests/test_silence.py @@ -5,14 +5,8 @@ @pytest.mark.django_db -def test_silence_alert_group( - make_organization_and_user, - make_alert_receive_channel, - make_alert_group, - make_alert, - mock_start_disable_maintenance_task, -): - organization, user = make_organization_and_user() +def test_silence_alert_group(make_organization_and_user, make_alert_receive_channel, make_alert_group): + organization, _ = make_organization_and_user() alert_receive_channel = make_alert_receive_channel( organization, integration=AlertReceiveChannel.INTEGRATION_GRAFANA ) @@ -24,14 +18,8 @@ def test_silence_alert_group( @pytest.mark.django_db -def test_silence_by_user_alert_group( - make_organization_and_user, - make_alert_receive_channel, - make_alert_group, - make_alert, - mock_start_disable_maintenance_task, -): - organization, user = make_organization_and_user() +def test_silence_by_user_alert_group(make_organization_and_user, make_alert_receive_channel, make_alert_group): + organization, _ = make_organization_and_user() alert_receive_channel = make_alert_receive_channel( organization, integration=AlertReceiveChannel.INTEGRATION_GRAFANA ) @@ -43,13 +31,7 @@ def test_silence_by_user_alert_group( @pytest.mark.django_db -def test_unsilence_alert_group( - make_organization_and_user, - make_alert_receive_channel, - make_alert_group, - make_alert, - mock_start_disable_maintenance_task, -): +def test_unsilence_alert_group(make_organization_and_user, make_alert_receive_channel, make_alert_group): organization, user = make_organization_and_user() alert_receive_channel = make_alert_receive_channel( organization, integration=AlertReceiveChannel.INTEGRATION_GRAFANA diff --git a/engine/apps/api/tests/conftest.py b/engine/apps/api/tests/conftest.py index cfd3ddafd0..29797876bd 100644 --- a/engine/apps/api/tests/conftest.py +++ b/engine/apps/api/tests/conftest.py @@ -2,7 +2,6 @@ from django.utils import timezone from apps.slack.client import SlackClient -from apps.slack.scenarios.distribute_alerts import AlertShootingStep @pytest.fixture() @@ -22,7 +21,7 @@ def _mock_api_call(*args, **kwargs): @pytest.fixture() -def make_resolved_ack_new_silenced_alert_groups(make_alert_group, make_alert_receive_channel, 
make_alert): +def make_resolved_ack_new_silenced_alert_groups(make_alert_group, make_alert): def _make_alert_groups_all_statuses(alert_receive_channel, channel_filter, alert_raw_request_data, **kwargs): resolved_alert_group = make_alert_group( alert_receive_channel, @@ -56,11 +55,3 @@ def _make_alert_groups_all_statuses(alert_receive_channel, channel_filter, alert return resolved_alert_group, ack_alert_group, new_alert_group, silenced_alert_group return _make_alert_groups_all_statuses - - -@pytest.fixture() -def mock_alert_shooting_step_post_alert_group_to_slack(monkeypatch): - def mock_post_alert_group_to_slack(*args, **kwargs): - return None - - monkeypatch.setattr(AlertShootingStep, "_post_alert_group_to_slack", mock_post_alert_group_to_slack) diff --git a/engine/apps/api/tests/test_alert_receive_channel.py b/engine/apps/api/tests/test_alert_receive_channel.py index d1314eb3a0..6378de0c47 100644 --- a/engine/apps/api/tests/test_alert_receive_channel.py +++ b/engine/apps/api/tests/test_alert_receive_channel.py @@ -414,12 +414,7 @@ def test_update_alert_receive_channel(alert_receive_channel_internal_api_setup, @pytest.mark.django_db -def test_integration_filter_by_maintenance( - alert_receive_channel_internal_api_setup, - make_user_auth_headers, - mock_start_disable_maintenance_task, - mock_alert_shooting_step_post_alert_group_to_slack, -): +def test_integration_filter_by_maintenance(alert_receive_channel_internal_api_setup, make_user_auth_headers): user, token, alert_receive_channel = alert_receive_channel_internal_api_setup client = APIClient() mode = AlertReceiveChannel.MAINTENANCE @@ -437,12 +432,7 @@ def test_integration_filter_by_maintenance( @pytest.mark.django_db -def test_integration_filter_by_debug( - alert_receive_channel_internal_api_setup, - make_user_auth_headers, - mock_start_disable_maintenance_task, - mock_alert_shooting_step_post_alert_group_to_slack, -): +def test_integration_filter_by_debug(alert_receive_channel_internal_api_setup, make_user_auth_headers): user, token, alert_receive_channel = alert_receive_channel_internal_api_setup client = APIClient() mode = AlertReceiveChannel.DEBUG_MAINTENANCE @@ -1141,7 +1131,6 @@ def test_start_maintenance_integration( @pytest.mark.django_db def test_stop_maintenance_integration( - mock_start_disable_maintenance_task, make_user_auth_headers, make_organization_and_user_with_plugin_token, make_escalation_chain, diff --git a/engine/apps/slack/alert_group_slack_service.py b/engine/apps/slack/alert_group_slack_service.py index 12c1a1de60..6ff514040f 100644 --- a/engine/apps/slack/alert_group_slack_service.py +++ b/engine/apps/slack/alert_group_slack_service.py @@ -39,7 +39,10 @@ def update_alert_group_slack_message(self, alert_group: "AlertGroup") -> None: try: self._slack_client.chat_update( - channel=alert_group.slack_message.channel.slack_id, + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=alert_group.slack_message.channel.slack_id, + channel=alert_group.slack_message._channel_id, ts=alert_group.slack_message.slack_id, attachments=alert_group.render_slack_attachments(), blocks=alert_group.render_slack_blocks(), @@ -78,7 +81,10 @@ def publish_message_to_alert_group_thread( try: result = self._slack_client.chat_postMessage( - channel=slack_message.channel.slack_id, + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + 
# channel=slack_message.channel.slack_id, + channel=slack_message._channel_id, text=text, attachments=attachments, thread_ts=slack_message.slack_id, diff --git a/engine/apps/slack/models/slack_message.py b/engine/apps/slack/models/slack_message.py index 83eaafca7e..37fa6f01e8 100644 --- a/engine/apps/slack/models/slack_message.py +++ b/engine/apps/slack/models/slack_message.py @@ -70,10 +70,9 @@ class SlackMessage(models.Model): ack_reminder_message_ts = models.CharField(max_length=100, null=True, default=None) - created_at = models.DateTimeField(auto_now_add=True) - cached_permalink = models.URLField(max_length=250, null=True, default=None) + created_at = models.DateTimeField(auto_now_add=True) last_updated = models.DateTimeField(null=True, default=None) alert_group = models.ForeignKey( @@ -84,8 +83,10 @@ class SlackMessage(models.Model): related_name="slack_messages", ) - # ID of a latest celery task to update the message active_update_task_id = models.CharField(max_length=100, null=True, default=None) + """ + ID of the latest celery task to update the message + """ class Meta: # slack_id is unique within the context of a channel or conversation @@ -105,7 +106,11 @@ def permalink(self) -> typing.Optional[str]: try: result = SlackClient(self.slack_team_identity).chat_getPermalink( - channel=self.channel.slack_id, message_ts=self.slack_id + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=self.channel.slack_id, + channel=self._channel_id, + message_ts=self.slack_id, ) except SlackAPIError: return None @@ -117,7 +122,9 @@ def permalink(self) -> typing.Optional[str]: @property def deep_link(self) -> str: - return f"https://slack.com/app_redirect?channel={self.channel.slack_id}&team={self.slack_team_identity.slack_id}&message={self.slack_id}" + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + return f"https://slack.com/app_redirect?channel={self._channel_id}&team={self.slack_team_identity.slack_id}&message={self.slack_id}" @classmethod def send_slack_notification( @@ -131,7 +138,6 @@ def send_slack_notification( Still some more investigation needed to confirm this, but for now, we'll pass in the `alert_group` as an argument """ - from apps.base.models import UserNotificationPolicyLogRecord slack_message = alert_group.slack_message diff --git a/engine/apps/slack/models/slack_user_identity.py b/engine/apps/slack/models/slack_user_identity.py index 5788253634..7d04090b53 100644 --- a/engine/apps/slack/models/slack_user_identity.py +++ b/engine/apps/slack/models/slack_user_identity.py @@ -132,9 +132,12 @@ def send_link_to_slack_message(self, slack_message: "SlackMessage"): "elements": [ { "type": "mrkdwn", + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # f"<#{slack_message.channel.slack_id}>.\n" "text": ( f"You received this message because you're not a member of " - f"<#{slack_message.channel.slack_id}>.\n" + f"<#{slack_message._channel_id}>.\n" "Please join the channel to get notified right in the alert group thread." 
), } diff --git a/engine/apps/slack/representatives/alert_group_representative.py b/engine/apps/slack/representatives/alert_group_representative.py index 77d995e6bc..1d9be1d568 100644 --- a/engine/apps/slack/representatives/alert_group_representative.py +++ b/engine/apps/slack/representatives/alert_group_representative.py @@ -43,8 +43,8 @@ def on_create_alert_slack_representative_async(alert_pk): logger.debug( f"Process on_create_alert_slack_representative for alert {alert_pk} from alert_group {alert.group_id}" ) - AlertShootingStep = ScenarioStep.get_step("distribute_alerts", "AlertShootingStep") - step = AlertShootingStep(organization.slack_team_identity, organization) + IncomingAlertStep = ScenarioStep.get_step("distribute_alerts", "IncomingAlertStep") + step = IncomingAlertStep(organization.slack_team_identity, organization) step.process_signal(alert) else: logger.debug( @@ -91,7 +91,7 @@ def on_alert_group_action_triggered_async(log_record_id): class AlertGroupSlackRepresentative(AlertGroupAbstractRepresentative): - def __init__(self, log_record): + def __init__(self, log_record: "AlertGroupLogRecord"): self.log_record = log_record def is_applicable(self): diff --git a/engine/apps/slack/scenarios/alertgroup_appearance.py b/engine/apps/slack/scenarios/alertgroup_appearance.py index 2f7a0113fe..1a201ecd39 100644 --- a/engine/apps/slack/scenarios/alertgroup_appearance.py +++ b/engine/apps/slack/scenarios/alertgroup_appearance.py @@ -87,7 +87,10 @@ def process_scenario( slack_message = alert_group.slack_message self._slack_client.chat_update( - channel=slack_message.channel.slack_id, + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=slack_message.channel.slack_id, + channel=slack_message._channel_id, ts=slack_message.slack_id, attachments=alert_group.render_slack_attachments(), blocks=alert_group.render_slack_blocks(), diff --git a/engine/apps/slack/scenarios/distribute_alerts.py b/engine/apps/slack/scenarios/distribute_alerts.py index 8e11fafc67..ad327bdcd8 100644 --- a/engine/apps/slack/scenarios/distribute_alerts.py +++ b/engine/apps/slack/scenarios/distribute_alerts.py @@ -50,69 +50,147 @@ logger.setLevel(logging.DEBUG) -class AlertShootingStep(scenario_step.ScenarioStep): +class IncomingAlertStep(scenario_step.ScenarioStep): def process_signal(self, alert: Alert) -> None: + """ + 🐉 Here lay dragons 🐉 + + Below is some added context as to why we have the explicit `AlertGroup.objects.filter(...).update(...)` calls. + + For example: + ``` + num_updated_rows = AlertGroup.objects.filter( + pk=alert.group.pk, slack_message_sent=False, + ).update(slack_message_sent=True) + + if num_updated_rows == 1: + # post new message + else: + # update existing message + ``` + + This piece of code guarantees that when 2 alerts are created concurrently we don't end up posting a + message twice. We rely on the fact that the `UPDATE` SQL statements are atomic. This is NOT the same as: + + ``` + if not alert_group.slack_message_sent: + # post new message + alert_group.slack_message_sent = True + else: + # update existing message + ``` + + This would end up posting multiple Slack messages for a single alert group (classic race condition). And then + all kinds of unexpected behaviours would arise, because some parts of the codebase assume there can only be 1 + `SlackMessage` per `AlertGroup`. 
+ + ``` + AlertGroup.objects.filter(pk=alert.group.pk, slack_message_sent=False).update(slack_message_sent=True) + ``` + + The power of this 👆, is that it doesn't actually do `SELECT ...;` and then `UPDATE ...;` it actually does a + single `UPDATE alerts.alert_group WHERE id= AND slack_message_sent IS FALSE`; + which makes it atomic and concurrency-safe. + + See this conversation for more context https://raintank-corp.slack.com/archives/C06K1MQ07GS/p1732800180834819?thread_ts=1732748893.183939&cid=C06K1MQ07GS + """ + + alert_group = alert.group + + if not alert_group: + # this case should hypothetically never happen, it's mostly to appease mypy with the + # fact that alert.group can "technically" be None + logger.warning( + f"Skip IncomingAlertStep.process_signal because alert {alert.pk} doesn't actually " + "have an alert group associated with it" + ) + return + + alert_group_pk = alert_group.pk + alert_receive_channel = alert_group.channel + should_skip_escalation_in_slack = alert_group.skip_escalation_in_slack + slack_team_identity = self.slack_team_identity + slack_team_identity_pk = slack_team_identity.pk + # do not try to post alert group message to slack if its channel is rate limited - if alert.group.channel.is_rate_limited_in_slack: + if alert_receive_channel.is_rate_limited_in_slack: logger.info("Skip posting or updating alert_group in Slack due to rate limit") AlertGroup.objects.filter( - pk=alert.group.pk, + pk=alert_group_pk, slack_message_sent=False, ).update(slack_message_sent=True, reason_to_skip_escalation=AlertGroup.RATE_LIMITED) return - num_updated_rows = AlertGroup.objects.filter(pk=alert.group.pk, slack_message_sent=False).update( + num_updated_rows = AlertGroup.objects.filter(pk=alert_group_pk, slack_message_sent=False).update( slack_message_sent=True ) if num_updated_rows == 1: + # this will be the case in the event that we haven't yet created a Slack message for this alert group + + slack_channel = ( + alert_group.channel_filter.slack_channel + if alert_group.channel_filter + # if channel filter is deleted mid escalation, use default Slack channel + else alert_receive_channel.organization.default_slack_channel + ) + slack_channel_id = slack_channel.slack_id + try: - slack_channel = ( - alert.group.channel_filter.slack_channel - if alert.group.channel_filter - # if channel filter is deleted mid escalation, use default Slack channel - else alert.group.channel.organization.default_slack_channel + self._post_alert_group_to_slack( + slack_team_identity=slack_team_identity, + alert_group=alert_group, + alert=alert, + attachments=alert_group.render_slack_attachments(), + slack_channel=slack_channel, + blocks=alert_group.render_slack_blocks(), ) - self._send_first_alert(alert, slack_channel) except (SlackAPIError, TimeoutError): - AlertGroup.objects.filter(pk=alert.group.pk).update(slack_message_sent=False) + AlertGroup.objects.filter(pk=alert_group_pk).update(slack_message_sent=False) raise - if alert.group.channel.maintenance_mode == AlertReceiveChannel.DEBUG_MAINTENANCE: - self._send_debug_mode_notice(alert.group, slack_channel) + if alert_receive_channel.maintenance_mode == AlertReceiveChannel.DEBUG_MAINTENANCE: + # send debug mode notice + text = "Escalations are silenced due to Debug mode" + self._slack_client.chat_postMessage( + channel=slack_channel_id, + text=text, + attachments=[], + thread_ts=alert_group.slack_message.slack_id, + mrkdwn=True, + blocks=[ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": text, + }, + }, + ], + ) + + if not 
alert_group.is_maintenance_incident and not should_skip_escalation_in_slack: + send_message_to_thread_if_bot_not_in_channel.apply_async( + (alert_group_pk, slack_team_identity_pk, slack_channel_id), + countdown=1, # delay for message so that the log report is published first + ) - if alert.group.is_maintenance_incident: - # not sending log report message for maintenance incident - pass - else: - # check if alert group was posted to slack before posting message to thread - if not alert.group.skip_escalation_in_slack: - self._send_message_to_thread_if_bot_not_in_channel(alert.group, slack_channel) else: - # check if alert group was posted to slack before updating its message - if not alert.group.skip_escalation_in_slack: + # if a new alert comes in, and is grouped to an alert group that has already been posted to Slack, + # then we will update that existing Slack message + if not should_skip_escalation_in_slack: update_task_id = update_incident_slack_message.apply_async( - (self.slack_team_identity.pk, alert.group.pk), + (self.slack_team_identity.pk, alert_group_pk), countdown=10, ) cache.set( - get_cache_key_update_incident_slack_message(alert.group.pk), + get_cache_key_update_incident_slack_message(alert_group_pk), update_task_id, timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME, ) else: logger.info("Skip updating alert_group in Slack due to rate limit") - def _send_first_alert(self, alert: Alert, slack_channel: typing.Optional[SlackChannel]) -> None: - self._post_alert_group_to_slack( - slack_team_identity=self.slack_team_identity, - alert_group=alert.group, - alert=alert, - attachments=alert.group.render_slack_attachments(), - slack_channel=slack_channel, - blocks=alert.group.render_slack_blocks(), - ) - def _post_alert_group_to_slack( self, slack_team_identity: SlackTeamIdentity, @@ -182,31 +260,6 @@ def _post_alert_group_to_slack( finally: alert.save() - def _send_debug_mode_notice(self, alert_group: AlertGroup, slack_channel: SlackChannel) -> None: - blocks: Block.AnyBlocks = [] - - text = "Escalations are silenced due to Debug mode" - blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": text}}) - - self._slack_client.chat_postMessage( - channel=slack_channel.slack_id, - text=text, - attachments=[], - thread_ts=alert_group.slack_message.slack_id, - mrkdwn=True, - blocks=blocks, - ) - - def _send_message_to_thread_if_bot_not_in_channel( - self, - alert_group: AlertGroup, - slack_channel: SlackChannel, - ) -> None: - send_message_to_thread_if_bot_not_in_channel.apply_async( - (alert_group.pk, self.slack_team_identity.pk, slack_channel.slack_id), - countdown=1, # delay for message so that the log report is published first - ) - def process_scenario( self, slack_user_identity: SlackUserIdentity, @@ -472,8 +525,11 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: if slack_user_identity: self._slack_client.chat_postEphemeral( + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=alert_group.slack_message.channel.slack_id, + channel=alert_group.slack_message._channel_id, user=slack_user_identity.slack_id, - channel=alert_group.slack_message.channel.slack_id, text="{}{}".format(ephemeral_text[:1].upper(), ephemeral_text[1:]), unfurl_links=True, ) @@ -714,7 +770,10 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: if slack_message.ack_reminder_message_ts: try: self._slack_client.chat_update( - 
channel=slack_message.channel.slack_id, + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=slack_message.channel.slack_id, + channel=slack_message._channel_id, ts=slack_message.ack_reminder_message_ts, text=text, attachments=message_attachments, @@ -788,12 +847,14 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: from apps.user_management.models import Organization alert_group = log_record.alert_group - slack_channel = alert_group.slack_message.channel + organization = alert_group.channel.organization + slack_message = alert_group.slack_message + slack_channel = slack_message.channel user_verbal = log_record.author.get_username_with_slack_verbal(mention=True) text = f"{user_verbal}, please confirm that you're still working on this Alert Group." - if alert_group.channel.organization.unacknowledge_timeout != Organization.UNACKNOWLEDGE_TIMEOUT_NEVER: + if organization.unacknowledge_timeout != Organization.UNACKNOWLEDGE_TIMEOUT_NEVER: try: response = self._slack_client.chat_postMessage( channel=slack_channel.slack_id, @@ -815,14 +876,12 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: "text": "Confirm", "type": "button", "style": "primary", - "value": make_value( - {"alert_group_pk": alert_group.pk}, alert_group.channel.organization - ), + "value": make_value({"alert_group_pk": alert_group.pk}, organization), }, ], } ], - thread_ts=alert_group.slack_message.slack_id, + thread_ts=slack_message.slack_id, ) except (SlackAPITokenError, SlackAPIChannelArchivedError, SlackAPIChannelNotFoundError): pass @@ -831,13 +890,13 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p1732555465144099 alert_group.slack_messages.create( slack_id=response["ts"], - organization=alert_group.channel.organization, + organization=organization, _channel_id=slack_channel.slack_id, channel=slack_channel, ) - alert_group.slack_message.ack_reminder_message_ts = response["ts"] - alert_group.slack_message.save(update_fields=["ack_reminder_message_ts"]) + slack_message.ack_reminder_message_ts = response["ts"] + slack_message.save(update_fields=["ack_reminder_message_ts"]) else: text = f"This is a reminder that the Alert Group is still acknowledged by {user_verbal}" self.alert_group_slack_service.publish_message_to_alert_group_thread(alert_group, text=text) @@ -846,9 +905,12 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: class WipeGroupStep(scenario_step.ScenarioStep): def process_signal(self, log_record: AlertGroupLogRecord) -> None: alert_group = log_record.alert_group - user_verbal = log_record.author.get_username_with_slack_verbal() - text = f"Wiped by {user_verbal}" - self.alert_group_slack_service.publish_message_to_alert_group_thread(alert_group, text=text) + + self.alert_group_slack_service.publish_message_to_alert_group_thread( + alert_group, + text=f"Wiped by {log_record.author.get_username_with_slack_verbal()}", + ) + self.alert_group_slack_service.update_alert_group_slack_message(alert_group) @@ -859,7 +921,9 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: # Remove "memo" emoji from resolution note messages for message in alert_group.resolution_note_slack_messages.filter(added_to_resolution_note=True): try: - self._slack_client.reactions_remove(channel=message.slack_channel_id, name="memo", timestamp=message.ts) + 
self._slack_client.reactions_remove( + channel=message.slack_channel_slack_id, name="memo", timestamp=message.ts + ) except SlackAPIRatelimitError: # retries on ratelimit are handled in apps.alerts.tasks.delete_alert_group.delete_alert_group raise @@ -870,7 +934,7 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: # Remove resolution note messages posted by OnCall bot for message in alert_group.resolution_note_slack_messages.filter(posted_by_bot=True): try: - self._slack_client.chat_delete(channel=message.slack_channel_id, ts=message.ts) + self._slack_client.chat_delete(channel=message.slack_channel_slack_id, ts=message.ts) except SlackAPIRatelimitError: # retries on ratelimit are handled in apps.alerts.tasks.delete_alert_group.delete_alert_group raise @@ -881,7 +945,13 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: # Remove alert group Slack messages for message in alert_group.slack_messages.all(): try: - self._slack_client.chat_delete(channel=message.channel.slack_id, ts=message.slack_id) + self._slack_client.chat_delete( + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=message.channel.slack_id, + channel=message._channel_id, + ts=message.slack_id, + ) except SlackAPIRatelimitError: # retries on ratelimit are handled in apps.alerts.tasks.delete_alert_group.delete_alert_group raise diff --git a/engine/apps/slack/scenarios/notification_delivery.py b/engine/apps/slack/scenarios/notification_delivery.py index d77286ad62..fc3992311d 100644 --- a/engine/apps/slack/scenarios/notification_delivery.py +++ b/engine/apps/slack/scenarios/notification_delivery.py @@ -18,7 +18,11 @@ def process_signal(self, log_record: "AlertGroupLogRecord") -> None: user = log_record.author alert_group = log_record.alert_group - slack_channel_id = alert_group.slack_message.channel.slack_id + + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # slack_channel_id = alert_group_slack_message.channel.slack_id + slack_channel_id = alert_group.slack_message._channel_id user_verbal_with_mention = user.get_username_with_slack_verbal(mention=True) diff --git a/engine/apps/slack/scenarios/resolution_note.py b/engine/apps/slack/scenarios/resolution_note.py index 046967b175..130a102f85 100644 --- a/engine/apps/slack/scenarios/resolution_note.py +++ b/engine/apps/slack/scenarios/resolution_note.py @@ -236,9 +236,7 @@ def process_signal(self, alert_group: "AlertGroup", resolution_note: "Resolution else: self.post_or_update_resolution_note_in_thread(resolution_note) - self.update_alert_group_resolution_note_button( - alert_group=alert_group, - ) + self.update_alert_group_resolution_note_button(alert_group) def remove_resolution_note_slack_message(self, resolution_note: "ResolutionNote") -> None: if (resolution_note_slack_message := resolution_note.resolution_note_slack_message) is not None: @@ -263,9 +261,13 @@ def post_or_update_resolution_note_in_thread(self, resolution_note: "ResolutionN resolution_note_slack_message = resolution_note.resolution_note_slack_message alert_group = resolution_note.alert_group alert_group_slack_message = alert_group.slack_message - slack_channel_id = alert_group_slack_message.channel.slack_id blocks = self.get_resolution_note_blocks(resolution_note) + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see 
https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # slack_channel_id = alert_group_slack_message.channel.slack_id + slack_channel_id = alert_group_slack_message._channel_id + slack_channel = SlackChannel.objects.get( slack_id=slack_channel_id, slack_team_identity=self.slack_team_identity ) @@ -655,11 +657,6 @@ def get_invite_bot_tip_blocks(self, channel: str) -> Block.AnyBlocks: ] -class ReadEditPostmortemStep(ResolutionNoteModalStep): - # Left for backward compatibility with slack messages created before postmortems -> resolution note change - pass - - class AddRemoveThreadMessageStep(UpdateResolutionNoteStep, scenario_step.ScenarioStep): def process_scenario( self, @@ -687,6 +684,7 @@ def process_scenario( if add_to_resolution_note and slack_thread_message is not None: slack_thread_message.added_to_resolution_note = True slack_thread_message.save(update_fields=["added_to_resolution_note"]) + if resolution_note is None: ResolutionNote( alert_group=alert_group, @@ -696,6 +694,7 @@ def process_scenario( ).save() else: resolution_note.recreate() + self.add_resolution_note_reaction(slack_thread_message) elif not add_to_resolution_note: # Check if resolution_note can be removed @@ -716,14 +715,16 @@ def process_scenario( else: if resolution_note_pk is not None and resolution_note is None: # old version of step resolution_note = ResolutionNote.objects.get(pk=resolution_note_pk) + resolution_note.delete() + if slack_thread_message: slack_thread_message.added_to_resolution_note = False slack_thread_message.save(update_fields=["added_to_resolution_note"]) self.remove_resolution_note_reaction(slack_thread_message) - self.update_alert_group_resolution_note_button( - alert_group, - ) + + self.update_alert_group_resolution_note_button(alert_group) + resolution_note_data = json.loads(payload["actions"][0]["value"]) resolution_note_data["resolution_note_window_action"] = "edit_update" ResolutionNoteModalStep(slack_team_identity, self.organization, self.user).process_scenario( @@ -735,12 +736,6 @@ def process_scenario( STEPS_ROUTING: ScenarioRoute.RoutingSteps = [ - { - "payload_type": PayloadType.BLOCK_ACTIONS, - "block_action_type": BlockActionType.BUTTON, - "block_action_id": ReadEditPostmortemStep.routing_uid(), - "step": ReadEditPostmortemStep, - }, { "payload_type": PayloadType.BLOCK_ACTIONS, "block_action_type": BlockActionType.BUTTON, diff --git a/engine/apps/slack/scenarios/shift_swap_requests.py b/engine/apps/slack/scenarios/shift_swap_requests.py index 33c29ea920..48637131b7 100644 --- a/engine/apps/slack/scenarios/shift_swap_requests.py +++ b/engine/apps/slack/scenarios/shift_swap_requests.py @@ -186,8 +186,11 @@ def post_message_to_thread( if not shift_swap_request.slack_message: return + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 self._slack_client.chat_postMessage( - channel=shift_swap_request.slack_message.channel.slack_id, + # channel=shift_swap_request.slack_message.channel.slack_id, + channel=shift_swap_request.slack_message._channel_id, thread_ts=shift_swap_request.slack_message.slack_id, reply_broadcast=reply_broadcast, blocks=blocks, diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index 84e5ca7653..13b26bbeda 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -9,7 +9,6 @@ from django.core.cache import cache from django.utils import timezone -from apps.alerts.tasks.compare_escalations import compare_escalations from 
apps.slack.alert_group_slack_service import AlertGroupSlackService from apps.slack.client import SlackClient from apps.slack.constants import CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME, SLACK_BOT_ID @@ -296,7 +295,7 @@ def post_slack_rate_limit_message(integration_id): logger.warning(f"AlertReceiveChannel {integration_id} doesn't exist") return - if not compare_escalations(post_slack_rate_limit_message.request.id, integration.rate_limit_message_task_id): + if post_slack_rate_limit_message.request.id != integration.rate_limit_message_task_id: logger.info( f"post_slack_rate_limit_message. integration {integration_id}. ID mismatch. " f"Active: {integration.rate_limit_message_task_id}" diff --git a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py index 4844121027..12943f9682 100644 --- a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py +++ b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py @@ -5,7 +5,7 @@ from apps.alerts.models import AlertGroup from apps.slack.errors import get_error_class from apps.slack.models import SlackMessage -from apps.slack.scenarios.distribute_alerts import AlertShootingStep +from apps.slack.scenarios.distribute_alerts import IncomingAlertStep from apps.slack.scenarios.scenario_step import ScenarioStep from apps.slack.tests.conftest import build_slack_response @@ -28,7 +28,7 @@ def test_skip_escalations_error( reason, slack_error, ): - SlackAlertShootingStep = ScenarioStep.get_step("distribute_alerts", "AlertShootingStep") + SlackIncomingAlertStep = ScenarioStep.get_step("distribute_alerts", "IncomingAlertStep") organization, _, slack_team_identity, _ = make_organization_and_user_with_slack_identities() alert_receive_channel = make_alert_receive_channel(organization) alert_group = make_alert_group(alert_receive_channel) @@ -36,7 +36,7 @@ def test_skip_escalations_error( slack_channel = make_slack_channel(slack_team_identity) - step = SlackAlertShootingStep(slack_team_identity) + step = SlackIncomingAlertStep(slack_team_identity) with patch.object(step._slack_client, "api_call") as mock_slack_api_call: error_response = build_slack_response({"error": slack_error}) @@ -66,7 +66,7 @@ def test_timeout_error( make_alert_group, make_alert, ): - SlackAlertShootingStep = ScenarioStep.get_step("distribute_alerts", "AlertShootingStep") + SlackIncomingAlertStep = ScenarioStep.get_step("distribute_alerts", "IncomingAlertStep") slack_team_identity = make_slack_team_identity() slack_channel = make_slack_channel(slack_team_identity) organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) @@ -74,7 +74,7 @@ def test_timeout_error( alert_group = make_alert_group(alert_receive_channel) alert = make_alert(alert_group, raw_request_data="{}") - step = SlackAlertShootingStep(slack_team_identity) + step = SlackIncomingAlertStep(slack_team_identity) with pytest.raises(TimeoutError): with patch.object(step._slack_client, "api_call") as mock_slack_api_call: @@ -89,9 +89,9 @@ def test_timeout_error( assert not alert.delivered -@patch.object(AlertShootingStep, "_post_alert_group_to_slack") +@patch.object(IncomingAlertStep, "_post_alert_group_to_slack") @pytest.mark.django_db -def test_alert_shooting_no_channel_filter( +def test_incoming_alert_no_channel_filter( mock_post_alert_group_to_slack, make_slack_team_identity, make_slack_channel, @@ -109,7 +109,7 @@ def test_alert_shooting_no_channel_filter( alert_group = 
make_alert_group(alert_receive_channel, channel_filter=None) alert = make_alert(alert_group, raw_request_data={}) - step = AlertShootingStep(slack_team_identity, organization) + step = IncomingAlertStep(slack_team_identity, organization) step.process_signal(alert) assert mock_post_alert_group_to_slack.call_args[1]["slack_channel"] == slack_channel diff --git a/engine/apps/slack/tests/scenario_steps/test_resolution_note.py b/engine/apps/slack/tests/scenario_steps/test_resolution_note.py index 6fc62816ae..0f1598041c 100644 --- a/engine/apps/slack/tests/scenario_steps/test_resolution_note.py +++ b/engine/apps/slack/tests/scenario_steps/test_resolution_note.py @@ -347,17 +347,18 @@ def test_resolution_notes_modal_closed_before_update( assert call_args[0] == "views.update" +@patch.object(SlackClient, "reactions_add") @patch.object(SlackClient, "chat_getPermalink", return_value={"permalink": "https://example.com"}) @pytest.mark.django_db def test_add_to_resolution_note( - _, + _mock_chat_getPermalink, + mock_reactions_add, make_organization_and_user_with_slack_identities, make_alert_receive_channel, make_alert_group, make_alert, make_slack_message, make_slack_channel, - settings, ): organization, user, slack_team_identity, slack_user_identity = make_organization_and_user_with_slack_identities() alert_receive_channel = make_alert_receive_channel(organization) @@ -382,8 +383,7 @@ def test_add_to_resolution_note( AddToResolutionNoteStep = ScenarioStep.get_step("resolution_note", "AddToResolutionNoteStep") step = AddToResolutionNoteStep(organization=organization, user=user, slack_team_identity=slack_team_identity) - with patch.object(SlackClient, "reactions_add") as mock_reactions_add: - step.process_scenario(slack_user_identity, slack_team_identity, payload) + step.process_scenario(slack_user_identity, slack_team_identity, payload) mock_reactions_add.assert_called_once() assert alert_group.resolution_notes.get().text == "Test resolution note" diff --git a/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py b/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py index 45384f5316..993692d6bc 100644 --- a/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py +++ b/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py @@ -386,12 +386,7 @@ def test_save_thread_message_for_resolution_note( def test_delete_thread_message_from_resolution_note_no_slack_user_identity( self, MockResolutionNoteSlackMessage, make_organization_and_user_with_slack_identities ) -> None: - ( - organization, - user, - slack_team_identity, - slack_user_identity, - ) = make_organization_and_user_with_slack_identities() + organization, user, slack_team_identity, _ = make_organization_and_user_with_slack_identities() step = SlackChannelMessageEventStep(slack_team_identity, organization, user) step.delete_thread_message_from_resolution_note(None, {}) diff --git a/engine/conftest.py b/engine/conftest.py index 7ef2555835..bcd7c47cc7 100644 --- a/engine/conftest.py +++ b/engine/conftest.py @@ -19,7 +19,6 @@ Alert, AlertGroupLogRecord, AlertReceiveChannel, - MaintainableObject, ResolutionNote, listen_for_alertgrouplogrecord, listen_for_alertreceivechannel_model_save, @@ -825,14 +824,6 @@ def _make_slack_channel(slack_team_identity, **kwargs): return _make_slack_channel -@pytest.fixture() -def mock_start_disable_maintenance_task(monkeypatch): - def mocked_start_disable_maintenance_task(*args, **kwargs): - return uuid.uuid4() - - 
monkeypatch.setattr(MaintainableObject, "start_disable_maintenance_task", mocked_start_disable_maintenance_task) - - @pytest.fixture() def make_organization_and_user_with_plugin_token(make_organization_and_user, make_token_for_organization): def _make_organization_and_user_with_plugin_token(role: typing.Optional[LegacyAccessControlRole] = None): diff --git a/engine/requirements.in b/engine/requirements.in index 4ce7179e79..1ecf5d5195 100644 --- a/engine/requirements.in +++ b/engine/requirements.in @@ -9,7 +9,6 @@ django-cors-headers==3.7.0 # pyroscope-io==0.8.1 django-dbconn-retry==0.1.7 django-debug-toolbar==4.1 -django-deprecate-fields==0.1.1 django-filter==2.4.0 django-ipware==4.0.2 django-log-request-id==1.6.0 diff --git a/engine/requirements.txt b/engine/requirements.txt index 5c60801e2e..7ac3a6ea28 100644 --- a/engine/requirements.txt +++ b/engine/requirements.txt @@ -82,7 +82,6 @@ django==4.2.16 # django-anymail # django-cors-headers # django-debug-toolbar - # django-deprecate-fields # django-filter # django-log-request-id # django-migration-linter @@ -106,8 +105,6 @@ django-dbconn-retry==0.1.7 # via -r requirements.in django-debug-toolbar==4.1.0 # via -r requirements.in -django-deprecate-fields==0.1.1 - # via -r requirements.in django-filter==2.4.0 # via -r requirements.in django-ipware==4.0.2 From 1829da934faed477c174812dde709fe125510ccf Mon Sep 17 00:00:00 2001 From: Matias Bordese Date: Mon, 2 Dec 2024 08:48:46 -0300 Subject: [PATCH 4/7] fix: service accounts creating webhooks through API (#5312) There is no user to be set when creating a webhook via the public API if authenticated using service account tokens (insight logs will still keep track of the service account information behind the request). --- .../apps/public_api/serializers/webhooks.py | 6 +++ engine/apps/public_api/tests/test_webhooks.py | 44 +++++++++++++++++++ .../user_management/models/service_account.py | 4 ++ 3 files changed, 54 insertions(+) diff --git a/engine/apps/public_api/serializers/webhooks.py b/engine/apps/public_api/serializers/webhooks.py index 40513c5cb2..879af9fefc 100644 --- a/engine/apps/public_api/serializers/webhooks.py +++ b/engine/apps/public_api/serializers/webhooks.py @@ -3,6 +3,7 @@ from rest_framework import fields, serializers from rest_framework.validators import UniqueTogetherValidator +from apps.user_management.models import ServiceAccountUser from apps.webhooks.models import Webhook, WebhookResponse from apps.webhooks.models.webhook import PUBLIC_WEBHOOK_HTTP_METHODS, WEBHOOK_FIELD_PLACEHOLDER from apps.webhooks.presets.preset_options import WebhookPresetOptions @@ -161,6 +162,11 @@ def validate_http_method(self, http_method): def validate_preset(self, preset): raise serializers.ValidationError(PRESET_VALIDATION_MESSAGE) + def validate_user(self, user): + if isinstance(user, ServiceAccountUser): + return None + return user + def validate(self, data): if ( self.instance diff --git a/engine/apps/public_api/tests/test_webhooks.py b/engine/apps/public_api/tests/test_webhooks.py index 2dcdae9788..ec6c85d8c1 100644 --- a/engine/apps/public_api/tests/test_webhooks.py +++ b/engine/apps/public_api/tests/test_webhooks.py @@ -1,10 +1,13 @@ import json +import httpretty import pytest from django.urls import reverse from rest_framework import status from rest_framework.test import APIClient +from apps.api import permissions +from apps.auth_token.tests.helpers import setup_service_account_api_mocks from apps.public_api.serializers.webhooks import PRESET_VALIDATION_MESSAGE from apps.webhooks.models 
import Webhook from apps.webhooks.tests.test_webhook_presets import ADVANCED_WEBHOOK_PRESET_ID, TEST_WEBHOOK_PRESET_ID @@ -235,6 +238,47 @@ def test_create_webhook_nested_data(make_organization_and_user_with_token): assert response.json() == expected_result +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_create_webhook_via_service_account( + make_organization, + make_service_account_for_organization, + make_token_for_service_account, +): + organization = make_organization(grafana_url="http://grafana.test") + service_account = make_service_account_for_organization(organization) + token_string = "glsa_token" + make_token_for_service_account(service_account, token_string) + + perms = { + permissions.RBACPermission.Permissions.OUTGOING_WEBHOOKS_WRITE.value: ["*"], + } + setup_service_account_api_mocks(organization.grafana_url, perms) + + client = APIClient() + url = reverse("api-public:webhooks-list") + data = { + "name": "Test outgoing webhook", + "url": "https://example.com", + "http_method": "POST", + "trigger_type": "acknowledge", + } + response = client.post( + url, + data=data, + format="json", + HTTP_AUTHORIZATION=f"{token_string}", + HTTP_X_GRAFANA_URL=organization.grafana_url, + ) + if not organization.is_rbac_permissions_enabled: + assert response.status_code == status.HTTP_403_FORBIDDEN + else: + assert response.status_code == status.HTTP_201_CREATED + webhook = Webhook.objects.get(public_primary_key=response.data["id"]) + expected_result = _get_expected_result(webhook) + assert response.data == expected_result + + @pytest.mark.django_db def test_update_webhook( make_organization_and_user_with_token, diff --git a/engine/apps/user_management/models/service_account.py b/engine/apps/user_management/models/service_account.py index 5082f7b965..50e4e578a5 100644 --- a/engine/apps/user_management/models/service_account.py +++ b/engine/apps/user_management/models/service_account.py @@ -25,6 +25,10 @@ def id(self): def pk(self): return self.service_account.id + @property + def current_team(self): + return None + @property def organization_id(self): return self.organization.id From 26ff937e97fb87efd090bf5591f9ec92f4f60c1f Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Mon, 2 Dec 2024 10:51:13 -0500 Subject: [PATCH 5/7] fix: patch slack ID reference issue (#5314) ## Which issue(s) this PR closes Fixes https://github.com/grafana/irm/issues/469 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes. 
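
To make the reviewed behavior easier to follow, below is a minimal, self-contained sketch of the Slack-channel resolution order this patch converges on. The dataclasses and the `resolve_slack_channel_id` helper are simplified, hypothetical stand-ins (not the real `AlertGroup`/`ChannelFilter` models); only the fallback order mirrors the updated `AlertGroup.slack_channel_id` and `ChannelFilter.slack_channel_or_org_default` properties in this diff.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SlackChannel:
    # illustrative stand-in for apps.slack.models.SlackChannel
    slack_id: str


@dataclass
class Organization:
    # illustrative stand-in; only the default channel matters here
    default_slack_channel: Optional[SlackChannel] = None


@dataclass
class Route:
    # plays the role of ChannelFilter in this sketch
    organization: Organization
    slack_channel: Optional[SlackChannel] = None

    @property
    def slack_channel_or_org_default(self) -> Optional[SlackChannel]:
        # route-level channel wins, otherwise fall back to the org default
        return self.slack_channel or self.organization.default_slack_channel


def resolve_slack_channel_id(
    posted_message_channel_id: Optional[str],
    route: Optional[Route],
) -> Optional[str]:
    """Fallback order used after this patch:
    1. the channel the alert group message was already posted to,
    2. the route's channel (or the org default via slack_channel_or_org_default),
    3. None -> escalation is skipped (CHANNEL_NOT_SPECIFIED).
    """
    if posted_message_channel_id:
        return posted_message_channel_id
    if route and route.slack_channel_or_org_default:
        return route.slack_channel_or_org_default.slack_id
    return None


if __name__ == "__main__":
    org = Organization(default_slack_channel=SlackChannel("C_DEFAULT"))
    route = Route(organization=org)  # no route-level channel configured
    assert resolve_slack_channel_id(None, route) == "C_DEFAULT"
    assert resolve_slack_channel_id("C_POSTED", route) == "C_POSTED"
    assert resolve_slack_channel_id(None, None) is None
```

The sketch also reflects the design choice in the diff: when nothing resolves, the alert group is marked with `CHANNEL_NOT_SPECIFIED` and posting to Slack is skipped rather than raising.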
--- engine/apps/alerts/models/alert_group.py | 16 +- engine/apps/alerts/models/channel_filter.py | 10 +- engine/apps/alerts/tests/test_alert_group.py | 63 +- .../apps/alerts/tests/test_channel_filter.py | 64 ++ engine/apps/alerts/tests/test_notify_all.py | 10 +- .../apps/slack/scenarios/distribute_alerts.py | 187 ++--- engine/apps/slack/tasks.py | 4 +- .../scenario_steps/test_distribute_alerts.py | 649 +++++++++++++++--- engine/apps/slack/tests/test_installation.py | 17 +- 9 files changed, 796 insertions(+), 224 deletions(-) diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 442ab61e68..3db1d9edfb 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -1980,12 +1980,16 @@ def is_presented_in_slack(self): @property def slack_channel_id(self) -> str | None: - if not self.channel.organization.slack_team_identity: - return None - elif self.slack_message: - return self.slack_message.channel.slack_id - elif self.channel_filter: - return self.channel_filter.slack_channel_id_or_org_default_id + channel_filter = self.channel_filter + + if self.slack_message: + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # + # return self.slack_message.channel.slack_id + return self.slack_message._channel_id + elif channel_filter and channel_filter.slack_channel_or_org_default: + return channel_filter.slack_channel_or_org_default.slack_id return None @property diff --git a/engine/apps/alerts/models/channel_filter.py b/engine/apps/alerts/models/channel_filter.py index 3ea2ea8bcb..0f67f575c8 100644 --- a/engine/apps/alerts/models/channel_filter.py +++ b/engine/apps/alerts/models/channel_filter.py @@ -172,14 +172,8 @@ def slack_channel_slack_id(self) -> typing.Optional[str]: return self.slack_channel.slack_id if self.slack_channel else None @property - def slack_channel_id_or_org_default_id(self): - organization = self.alert_receive_channel.organization - - if organization.slack_team_identity is None: - return None - elif self.slack_channel_slack_id is None: - return organization.default_slack_channel_slack_id - return self.slack_channel_slack_id + def slack_channel_or_org_default(self) -> typing.Optional["SlackChannel"]: + return self.slack_channel or self.alert_receive_channel.organization.default_slack_channel @property def str_for_clients(self): diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index e837516f86..d4fdbf7710 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -707,7 +707,7 @@ def test_delete_by_user( @pytest.mark.django_db -def test_integration_config_on_alert_group_created(make_organization, make_alert_receive_channel, make_channel_filter): +def test_integration_config_on_alert_group_created(make_organization, make_alert_receive_channel): organization = make_organization() alert_receive_channel = make_alert_receive_channel(organization, grouping_id_template="group_to_one_group") @@ -806,3 +806,64 @@ def test_alert_group_created_if_resolve_condition_but_auto_resolving_disabled( # the alert will create a new alert group assert alert.group != resolved_alert_group + + +class TestAlertGroupSlackChannelID: + @pytest.mark.django_db + def test_slack_channel_id_with_slack_message( + self, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + 
make_slack_message, + make_alert_group, + ): + """ + Test that slack_channel_id returns the _channel_id from slack_message when slack_message exists. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(slack_channel, alert_group=alert_group) + + # Assert that slack_channel_id returns the _channel_id from slack_message + assert alert_group.slack_channel_id == slack_message._channel_id + + @pytest.mark.django_db + def test_slack_channel_id_with_channel_filter( + self, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_channel_filter, + make_slack_channel, + make_alert_group, + ): + """ + Test that slack_channel_id returns the slack_id from channel_filter.slack_channel_or_org_default. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + slack_channel = make_slack_channel(slack_team_identity) + channel_filter = make_channel_filter(alert_receive_channel, slack_channel=slack_channel) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + + # Assert that slack_channel_id returns the slack_id from the channel filter's Slack channel + assert alert_group.slack_channel_id == slack_channel.slack_id + + @pytest.mark.django_db + def test_slack_channel_id_no_slack_message_no_channel_filter( + self, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + ): + """ + Test that slack_channel_id returns None when there is no slack_message and no channel_filter. + """ + organization, _ = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel, channel_filter=None) + + # Assert that slack_channel_id is None + assert alert_group.slack_channel_id is None diff --git a/engine/apps/alerts/tests/test_channel_filter.py b/engine/apps/alerts/tests/test_channel_filter.py index ba19e68894..b6079449c9 100644 --- a/engine/apps/alerts/tests/test_channel_filter.py +++ b/engine/apps/alerts/tests/test_channel_filter.py @@ -152,3 +152,67 @@ def test_channel_filter_using_filter_labels( assert ChannelFilter.select_filter(alert_receive_channel, {"title": "Test Title", "value": 5}, labels) == ( custom_channel_filter if should_match else default_channel_filter ) + + +class TestChannelFilterSlackChannelOrOrgDefault: + @pytest.mark.django_db + def test_slack_channel_or_org_default_with_slack_channel( + self, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_channel_filter, + make_slack_channel, + ): + """ + Test that slack_channel_or_org_default returns self.slack_channel when it is set. 
+ """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + slack_channel = make_slack_channel(slack_team_identity) + channel_filter = make_channel_filter(alert_receive_channel=alert_receive_channel, slack_channel=slack_channel) + + # Assert that slack_channel_or_org_default returns slack_channel + assert channel_filter.slack_channel_or_org_default == slack_channel + + @pytest.mark.django_db + def test_slack_channel_or_org_default_with_org_default( + self, + make_slack_team_identity, + make_organization, + make_alert_receive_channel, + make_channel_filter, + make_slack_channel, + ): + """ + Test that slack_channel_or_org_default returns organization's default_slack_channel when slack_channel is None. + """ + slack_team_identity = make_slack_team_identity() + default_slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization( + slack_team_identity=slack_team_identity, + default_slack_channel=default_slack_channel, + ) + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel, slack_channel=None) + + # Assert that slack_channel_or_org_default returns organization's default_slack_channel + assert channel_filter.slack_channel_or_org_default == default_slack_channel + + @pytest.mark.django_db + def test_slack_channel_or_org_default_none( + self, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_channel_filter, + ): + """ + Test that slack_channel_or_org_default returns None when both slack_channel and organization's default_slack_channel are None. + """ + organization, _ = make_organization_with_slack_team_identity() + assert organization.default_slack_channel is None + + alert_receive_channel = make_alert_receive_channel(organization) + channel_filter = make_channel_filter(alert_receive_channel=alert_receive_channel, slack_channel=None) + + # Assert that slack_channel_or_org_default returns None + assert channel_filter.slack_channel_or_org_default is None diff --git a/engine/apps/alerts/tests/test_notify_all.py b/engine/apps/alerts/tests/test_notify_all.py index 0649314cd4..b7a277860d 100644 --- a/engine/apps/alerts/tests/test_notify_all.py +++ b/engine/apps/alerts/tests/test_notify_all.py @@ -9,10 +9,8 @@ @pytest.mark.django_db def test_notify_all( - make_organization, - make_slack_team_identity, + make_organization_and_user_with_slack_identities, make_slack_channel, - make_user, make_user_notification_policy, make_escalation_chain, make_escalation_policy, @@ -20,13 +18,9 @@ def test_notify_all( make_alert_receive_channel, make_alert_group, ): - organization = make_organization() - slack_team_identity = make_slack_team_identity() + organization, user, slack_team_identity, _ = make_organization_and_user_with_slack_identities() slack_channel = make_slack_channel(slack_team_identity) - organization.slack_team_identity = slack_team_identity - organization.save() - user = make_user(organization=organization) make_user_notification_policy( user=user, step=UserNotificationPolicy.Step.NOTIFY, diff --git a/engine/apps/slack/scenarios/distribute_alerts.py b/engine/apps/slack/scenarios/distribute_alerts.py index ad327bdcd8..cd088bf7b5 100644 --- a/engine/apps/slack/scenarios/distribute_alerts.py +++ b/engine/apps/slack/scenarios/distribute_alerts.py @@ -22,7 +22,7 @@ SlackAPIRestrictedActionError, SlackAPITokenError, ) -from apps.slack.models import SlackChannel, 
SlackTeamIdentity, SlackUserIdentity +from apps.slack.models import SlackTeamIdentity, SlackUserIdentity from apps.slack.scenarios import scenario_step from apps.slack.slack_formatter import SlackFormatter from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel, update_incident_slack_message @@ -94,7 +94,6 @@ def process_signal(self, alert: Alert) -> None: See this conversation for more context https://raintank-corp.slack.com/archives/C06K1MQ07GS/p1732800180834819?thread_ts=1732748893.183939&cid=C06K1MQ07GS """ - alert_group = alert.group if not alert_group: @@ -108,6 +107,8 @@ def process_signal(self, alert: Alert) -> None: alert_group_pk = alert_group.pk alert_receive_channel = alert_group.channel + organization = alert_receive_channel.organization + channel_filter = alert_group.channel_filter should_skip_escalation_in_slack = alert_group.skip_escalation_in_slack slack_team_identity = self.slack_team_identity slack_team_identity_pk = slack_team_identity.pk @@ -128,26 +129,115 @@ def process_signal(self, alert: Alert) -> None: if num_updated_rows == 1: # this will be the case in the event that we haven't yet created a Slack message for this alert group + # if channel filter is deleted mid escalation, use the organization's default Slack channel slack_channel = ( - alert_group.channel_filter.slack_channel - if alert_group.channel_filter - # if channel filter is deleted mid escalation, use default Slack channel - else alert_receive_channel.organization.default_slack_channel + channel_filter.slack_channel_or_org_default if channel_filter else organization.default_slack_channel ) + + # slack_channel can be None if the channel filter is deleted mid escalation, OR the channel filter does + # not have a slack channel + # + # In these cases, we try falling back to the organization's default slack channel. If that is also None, + # slack_channel will be None, and we will skip posting to Slack + # (because we don't know where to post the message to). + if slack_channel is None: + logger.info( + f"Skipping posting message to Slack for alert_group {alert_group_pk} because we don't know which " + f"Slack channel to post to. 
channel_filter={channel_filter} " + f"organization.default_slack_channel={organization.default_slack_channel}" + ) + + alert_group.slack_message_sent = False + alert_group.reason_to_skip_escalation = AlertGroup.CHANNEL_NOT_SPECIFIED + alert_group.save(update_fields=["slack_message_sent", "reason_to_skip_escalation"]) + return + slack_channel_id = slack_channel.slack_id try: - self._post_alert_group_to_slack( - slack_team_identity=slack_team_identity, - alert_group=alert_group, - alert=alert, + result = self._slack_client.chat_postMessage( + channel=slack_channel.slack_id, attachments=alert_group.render_slack_attachments(), - slack_channel=slack_channel, blocks=alert_group.render_slack_blocks(), ) - except (SlackAPIError, TimeoutError): - AlertGroup.objects.filter(pk=alert_group_pk).update(slack_message_sent=False) - raise + + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p1732555465144099 + alert_group.slack_messages.create( + slack_id=result["ts"], + organization=alert_group.channel.organization, + _channel_id=slack_channel.slack_id, + channel=slack_channel, + ) + + alert.delivered = True + alert.save() + except (SlackAPIError, TimeoutError) as e: + reason_to_skip_escalation: typing.Optional[int] = None + extra_log_msg: typing.Optional[str] = None + reraise_exception = False + + if isinstance(e, TimeoutError): + reraise_exception = True + elif isinstance(e, SlackAPIRatelimitError): + extra_log_msg = "not delivering alert due to slack rate limit" + if not alert_receive_channel.is_maintenace_integration: + # we do not want to rate limit maintenace alerts.. + reason_to_skip_escalation = AlertGroup.RATE_LIMITED + extra_log_msg += ( + f" integration is a maintenance integration alert_receive_channel={alert_receive_channel}" + ) + else: + reraise_exception = True + elif isinstance(e, SlackAPITokenError): + reason_to_skip_escalation = AlertGroup.ACCOUNT_INACTIVE + extra_log_msg = "not delivering alert due to account_inactive" + elif isinstance(e, SlackAPIInvalidAuthError): + reason_to_skip_escalation = AlertGroup.INVALID_AUTH + extra_log_msg = "not delivering alert due to invalid_auth" + elif isinstance(e, (SlackAPIChannelArchivedError, SlackAPIChannelNotFoundError)): + reason_to_skip_escalation = AlertGroup.CHANNEL_ARCHIVED + extra_log_msg = "not delivering alert due to channel is archived" + elif isinstance(e, SlackAPIRestrictedActionError): + reason_to_skip_escalation = AlertGroup.RESTRICTED_ACTION + extra_log_msg = "not delivering alert due to workspace restricted action" + else: + # this is some other SlackAPIError.. + reraise_exception = True + + log_msg = f"{e.__class__.__name__} while posting alert {alert.pk} for {alert_group_pk} to Slack" + if extra_log_msg: + log_msg += f" ({extra_log_msg})" + + logger.warning(log_msg) + + update_fields = [] + if reason_to_skip_escalation is not None: + alert_group.reason_to_skip_escalation = reason_to_skip_escalation + update_fields.append("reason_to_skip_escalation") + + # Only set slack_message_sent to False under certain circumstances - the idea here is to prevent + # attempts to post a message to Slack, ONLY in cases when we are sure it's not possible + # (e.g. 
slack token error; because we call this step with every new alert in the alert group) + # + # In these cases, with every next alert in the alert group, num_updated_rows (above) should return 0, + # and we should skip "post the first message" part for the new alerts + if reraise_exception is True: + alert_group.slack_message_sent = False + update_fields.append("slack_message_sent") + + alert_group.save(update_fields=update_fields) + + if reraise_exception: + raise e + + # don't reraise an Exception, but we must return early here because the AlertGroup does not have a + # SlackMessage associated with it (eg. the failed called to chat_postMessage above, means + # that we never called alert_group.slack_messages.create), + # + # Given that, it would be impossible to try posting a debug mode notice or + # send_message_to_thread_if_bot_not_in_channel without the SlackMessage's thread_ts + return if alert_receive_channel.maintenance_mode == AlertReceiveChannel.DEBUG_MAINTENANCE: # send debug mode notice @@ -191,75 +281,6 @@ def process_signal(self, alert: Alert) -> None: else: logger.info("Skip updating alert_group in Slack due to rate limit") - def _post_alert_group_to_slack( - self, - slack_team_identity: SlackTeamIdentity, - alert_group: AlertGroup, - alert: Alert, - attachments, - slack_channel: typing.Optional[SlackChannel], - blocks: Block.AnyBlocks, - ) -> None: - # slack_channel can be None if org default slack channel for slack_team_identity is not set - if slack_channel is None: - logger.info( - f"Failed to post message to Slack for alert_group {alert_group.pk} because slack_channel is None" - ) - - alert_group.reason_to_skip_escalation = AlertGroup.CHANNEL_NOT_SPECIFIED - alert_group.save(update_fields=["reason_to_skip_escalation"]) - - return - - try: - result = self._slack_client.chat_postMessage( - channel=slack_channel.slack_id, - attachments=attachments, - blocks=blocks, - ) - - # TODO: once _channel_id has been fully migrated to channel, remove _channel_id - # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p1732555465144099 - alert_group.slack_messages.create( - slack_id=result["ts"], - organization=alert_group.channel.organization, - _channel_id=slack_channel.slack_id, - channel=slack_channel, - ) - - alert.delivered = True - except SlackAPITokenError: - alert_group.reason_to_skip_escalation = AlertGroup.ACCOUNT_INACTIVE - alert_group.save(update_fields=["reason_to_skip_escalation"]) - logger.info("Not delivering alert due to account_inactive.") - except SlackAPIInvalidAuthError: - alert_group.reason_to_skip_escalation = AlertGroup.INVALID_AUTH - alert_group.save(update_fields=["reason_to_skip_escalation"]) - logger.info("Not delivering alert due to invalid_auth.") - except SlackAPIChannelArchivedError: - alert_group.reason_to_skip_escalation = AlertGroup.CHANNEL_ARCHIVED - alert_group.save(update_fields=["reason_to_skip_escalation"]) - logger.info("Not delivering alert due to channel is archived.") - except SlackAPIRatelimitError as e: - # don't rate limit maintenance alert - if not alert_group.channel.is_maintenace_integration: - alert_group.reason_to_skip_escalation = AlertGroup.RATE_LIMITED - alert_group.save(update_fields=["reason_to_skip_escalation"]) - alert_group.channel.start_send_rate_limit_message_task(e.retry_after) - logger.info("Not delivering alert due to slack rate limit.") - else: - raise e - except SlackAPIChannelNotFoundError: - alert_group.reason_to_skip_escalation = AlertGroup.CHANNEL_ARCHIVED - 
alert_group.save(update_fields=["reason_to_skip_escalation"]) - logger.info("Not delivering alert due to channel is archived.") - except SlackAPIRestrictedActionError: - alert_group.reason_to_skip_escalation = AlertGroup.RESTRICTED_ACTION - alert_group.save(update_fields=["reason_to_skip_escalation"]) - logger.info("Not delivering alert due to workspace restricted action.") - finally: - alert.save() - def process_scenario( self, slack_user_identity: SlackUserIdentity, diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index 13b26bbeda..bfd6f22fd5 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -303,14 +303,14 @@ def post_slack_rate_limit_message(integration_id): return default_route = integration.channel_filters.get(is_default=True) - if (slack_channel_id := default_route.slack_channel_id_or_org_default_id) is not None: + if (slack_channel := default_route.slack_channel_or_org_default) is not None: text = ( f"Delivering and updating alert groups of integration {integration.verbal_name} in Slack is " f"temporarily stopped due to rate limit. You could find new alert groups at " f"<{integration.new_incidents_web_link}|web page " '"Alert Groups">' ) - post_message_to_channel(integration.organization, slack_channel_id, text) + post_message_to_channel(integration.organization, slack_channel.slack_id, text) @shared_dedicated_queue_retry_task( diff --git a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py index 12943f9682..e815d78036 100644 --- a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py +++ b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py @@ -1,115 +1,556 @@ +from datetime import timedelta from unittest.mock import patch import pytest +from django.core.cache import cache +from django.utils import timezone -from apps.alerts.models import AlertGroup -from apps.slack.errors import get_error_class +from apps.alerts.models import AlertGroup, AlertReceiveChannel +from apps.slack.errors import SlackAPIFetchMembersFailedError, SlackAPIRatelimitError, get_error_class from apps.slack.models import SlackMessage from apps.slack.scenarios.distribute_alerts import IncomingAlertStep -from apps.slack.scenarios.scenario_step import ScenarioStep from apps.slack.tests.conftest import build_slack_response +from apps.slack.utils import get_cache_key_update_incident_slack_message +SLACK_MESSAGE_TS = "1234567890.123456" +SLACK_POST_MESSAGE_SUCCESS_RESPONSE = {"ts": SLACK_MESSAGE_TS} -@pytest.mark.django_db -@pytest.mark.parametrize( - "reason,slack_error", - [ - (reason, slack_error) - for reason, slack_error in AlertGroup.REASONS_TO_SKIP_ESCALATIONS - if reason != AlertGroup.NO_REASON - ], -) -def test_skip_escalations_error( - make_organization_and_user_with_slack_identities, - make_alert_receive_channel, - make_alert_group, - make_alert, - make_slack_channel, - reason, - slack_error, -): - SlackIncomingAlertStep = ScenarioStep.get_step("distribute_alerts", "IncomingAlertStep") - organization, _, slack_team_identity, _ = make_organization_and_user_with_slack_identities() - alert_receive_channel = make_alert_receive_channel(organization) - alert_group = make_alert_group(alert_receive_channel) - alert = make_alert(alert_group, raw_request_data="{}") - - slack_channel = make_slack_channel(slack_team_identity) - - step = SlackIncomingAlertStep(slack_team_identity) - - with patch.object(step._slack_client, "api_call") as mock_slack_api_call: - 
error_response = build_slack_response({"error": slack_error}) - error_class = get_error_class(error_response) - mock_slack_api_call.side_effect = error_class(error_response) - - channel = slack_channel - if reason == AlertGroup.CHANNEL_NOT_SPECIFIED: - channel = None - - step._post_alert_group_to_slack(slack_team_identity, alert_group, alert, None, channel, []) - - alert_group.refresh_from_db() - alert.refresh_from_db() - assert alert_group.reason_to_skip_escalation == reason - assert alert_group.slack_message is None - assert SlackMessage.objects.count() == 0 - assert not alert.delivered - - -@pytest.mark.django_db -def test_timeout_error( - make_slack_team_identity, - make_slack_channel, - make_organization, - make_alert_receive_channel, - make_alert_group, - make_alert, -): - SlackIncomingAlertStep = ScenarioStep.get_step("distribute_alerts", "IncomingAlertStep") - slack_team_identity = make_slack_team_identity() - slack_channel = make_slack_channel(slack_team_identity) - organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) - alert_receive_channel = make_alert_receive_channel(organization) - alert_group = make_alert_group(alert_receive_channel) - alert = make_alert(alert_group, raw_request_data="{}") - - step = SlackIncomingAlertStep(slack_team_identity) - - with pytest.raises(TimeoutError): - with patch.object(step._slack_client, "api_call") as mock_slack_api_call: - mock_slack_api_call.side_effect = TimeoutError + +class TestIncomingAlertStep: + @patch("apps.slack.client.SlackClient.chat_postMessage", return_value=SLACK_POST_MESSAGE_SUCCESS_RESPONSE) + @pytest.mark.django_db + def test_process_signal_success_first_message( + self, + mock_chat_postMessage, + make_organization_with_slack_team_identity, + make_slack_channel, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + """ + Test the success case where process_signal posts the first Slack message for the alert group. 
+ """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + + slack_channel = make_slack_channel(slack_team_identity) + organization.default_slack_channel = slack_channel + organization.save() + + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel, slack_message_sent=False) + alert = make_alert(alert_group, raw_request_data={}) + + # Ensure slack_message_sent is False initially + assert not alert_group.slack_message_sent + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + alert_group.refresh_from_db() + alert.refresh_from_db() + + assert alert_group.slack_message_sent is True + + assert alert_group.slack_message is not None + assert SlackMessage.objects.count() == 1 + assert alert_group.slack_message.slack_id == SLACK_MESSAGE_TS + assert alert_group.slack_message.channel == slack_channel + + assert alert.delivered is True + + @patch("apps.slack.client.SlackClient.chat_postMessage", return_value=SLACK_POST_MESSAGE_SUCCESS_RESPONSE) + @pytest.mark.django_db + def test_incoming_alert_no_channel_filter( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel(organization) + + # Simulate an alert group with channel filter deleted in the middle of the escalation + # it should use the org default Slack channel to post the message to + alert_group = make_alert_group(alert_receive_channel, channel_filter=None) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity, organization) + step.process_signal(alert) + + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + @patch("apps.slack.client.SlackClient.chat_postMessage") + @pytest.mark.django_db + def test_process_signal_no_alert_group( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_alert, + ): + slack_team_identity = make_slack_team_identity() + alert = make_alert(alert_group=None, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + mock_chat_postMessage.assert_not_called() + + @patch("apps.slack.client.SlackClient.chat_postMessage") + @pytest.mark.django_db + def test_process_signal_channel_rate_limited( + self, + mock_chat_postMessage, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + organization, slack_team_identity = make_organization_with_slack_team_identity() + + # Set rate_limited_in_slack_at to a recent time to simulate rate limiting + alert_receive_channel = make_alert_receive_channel( + organization, + rate_limited_in_slack_at=timezone.now() - timedelta(seconds=10), + ) + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + step = 
IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + mock_chat_postMessage.assert_not_called() + + alert_group.refresh_from_db() + assert alert_group.slack_message_sent is True + assert alert_group.reason_to_skip_escalation == AlertGroup.RATE_LIMITED + + @patch("apps.slack.client.SlackClient.chat_postMessage") + @pytest.mark.django_db + def test_process_signal_no_slack_channel( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + slack_team_identity = make_slack_team_identity() + organization = make_organization(default_slack_channel=None) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel, channel_filter=None) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + alert_group.refresh_from_db() + assert alert_group.slack_message_sent is False + assert alert_group.reason_to_skip_escalation == AlertGroup.CHANNEL_NOT_SPECIFIED + + mock_chat_postMessage.assert_not_called() + + @patch("apps.slack.client.SlackClient.chat_postMessage") + @pytest.mark.django_db + def test_process_signal_debug_maintenance_mode( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_organization, + make_slack_channel, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + """ + Test the scenario where the alert receive channel is in DEBUG_MAINTENANCE mode. + It should post the initial message and then send a debug mode notice in the same thread. + """ + # Mock chat_postMessage to handle both calls + # Set side_effect to return different values for each call + mock_chat_postMessage.side_effect = [ + SLACK_POST_MESSAGE_SUCCESS_RESPONSE, # create alert group slack message call return value + {"ok": True}, # debug mode notice call return value + ] + + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + + alert_receive_channel = make_alert_receive_channel( + organization, + maintenance_mode=AlertReceiveChannel.DEBUG_MAINTENANCE, + ) + + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + # Ensure slack_message_sent is False initially + assert not alert_group.slack_message_sent + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + assert mock_chat_postMessage.call_count == 2 + + _, create_alert_group_slack_message_call_kwargs = mock_chat_postMessage.call_args_list[0] + _, debug_mode_notice_call_kwargs = mock_chat_postMessage.call_args_list[1] + + assert create_alert_group_slack_message_call_kwargs["channel"] == slack_channel.slack_id + + text = "Escalations are silenced due to Debug mode" + assert debug_mode_notice_call_kwargs == { + "channel": slack_channel.slack_id, + "text": text, + "attachments": [], + "thread_ts": SLACK_MESSAGE_TS, # ts from first call + "mrkdwn": True, + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": text, + }, + }, + ], + } + + alert_group.refresh_from_db() + alert.refresh_from_db() + + assert alert_group.slack_message_sent is True + + assert alert_group.slack_message is not None + assert SlackMessage.objects.count() == 1 + assert alert_group.slack_message.slack_id == SLACK_MESSAGE_TS + assert 
alert_group.slack_message.channel == slack_channel + + assert alert.delivered is True + + @patch("apps.slack.client.SlackClient.chat_postMessage", return_value=SLACK_POST_MESSAGE_SUCCESS_RESPONSE) + @patch("apps.slack.scenarios.distribute_alerts.send_message_to_thread_if_bot_not_in_channel") + @pytest.mark.django_db + def test_process_signal_send_message_to_thread_if_bot_not_in_channel( + self, + mock_send_message_to_thread_if_bot_not_in_channel, + mock_chat_postMessage, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + assert alert_group.is_maintenance_incident is False + assert alert_group.skip_escalation_in_slack is False + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + mock_send_message_to_thread_if_bot_not_in_channel.apply_async.assert_called_once_with( + (alert_group.pk, slack_team_identity.pk, slack_channel.slack_id), countdown=1 + ) + + @patch("apps.slack.client.SlackClient.chat_postMessage") + @patch("apps.slack.scenarios.distribute_alerts.update_incident_slack_message") + @pytest.mark.django_db + def test_process_signal_update_existing_message( + self, + mock_update_incident_slack_message, + mock_chat_postMessage, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + mocked_update_incident_task_id = "1234" + mock_update_incident_slack_message.apply_async.return_value = mocked_update_incident_task_id + + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel(organization) + + # Simulate that slack_message_sent is already True and skip_escalation_in_slack is False + alert_group = make_alert_group( + alert_receive_channel, + slack_message_sent=True, + reason_to_skip_escalation=AlertGroup.NO_REASON, + ) + assert alert_group.skip_escalation_in_slack is False + + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + # assert that the background task is scheduled + mock_update_incident_slack_message.apply_async.assert_called_once_with( + (slack_team_identity.pk, alert_group.pk), countdown=10 + ) + mock_chat_postMessage.assert_not_called() + + # Verify that the cache is set correctly + assert cache.get(get_cache_key_update_incident_slack_message(alert_group.pk)) == mocked_update_incident_task_id + + @patch("apps.slack.client.SlackClient.chat_postMessage") + @patch("apps.slack.scenarios.distribute_alerts.update_incident_slack_message") + @pytest.mark.django_db + def test_process_signal_do_not_update_due_to_skip_escalation( + self, + mock_update_incident_slack_message, + mock_chat_postMessage, + 
make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + """ + Test that when skip_escalation_in_slack is True, the update task is not scheduled. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + + # Simulate that slack_message_sent is already True and skip escalation due to RATE_LIMITED + alert_group = make_alert_group( + alert_receive_channel, + slack_message_sent=True, + reason_to_skip_escalation=AlertGroup.RATE_LIMITED, # Ensures skip_escalation_in_slack is True + ) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + step.process_signal(alert) + + # assert that the background task is not scheduled + mock_update_incident_slack_message.apply_async.assert_not_called() + mock_chat_postMessage.assert_not_called() + + @patch("apps.slack.client.SlackClient.chat_postMessage", side_effect=TimeoutError) + @pytest.mark.django_db + def test_process_signal_timeout_error( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + with pytest.raises(TimeoutError): + step.process_signal(alert) + + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + alert_group.refresh_from_db() + alert.refresh_from_db() + + # Ensure that slack_message_sent is set back to False, this will allow us to retry.. 
a TimeoutError may have + # been a transient error that is "recoverable" + assert alert_group.slack_message_sent is False + + assert alert_group.slack_message is None + assert SlackMessage.objects.count() == 0 + assert not alert.delivered + + @pytest.mark.parametrize( + "reason,slack_error", + [ + (reason, slack_error) + for reason, slack_error in AlertGroup.REASONS_TO_SKIP_ESCALATIONS + # we can skip NO_REASON because, well, this means there's no reason to skip the escalation + # we can skip CHANNEL_NOT_SPECIFIED because this is handled "higher up" in process_signal + if reason not in [AlertGroup.NO_REASON, AlertGroup.CHANNEL_NOT_SPECIFIED] + ], + ) + @pytest.mark.django_db + def test_process_signal_slack_errors( + self, + make_slack_team_identity, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + make_slack_channel, + reason, + slack_error, + ): + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + + with patch.object(step._slack_client, "chat_postMessage") as mock_chat_postMessage: + error_response = build_slack_response({"error": slack_error}) + error_class = get_error_class(error_response) + mock_chat_postMessage.side_effect = error_class(error_response) + + step.process_signal(alert) + + alert_group.refresh_from_db() + alert.refresh_from_db() + + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + # For these Slack errors, retrying won't really help, so we should not set slack_message_sent back to False + assert alert_group.slack_message_sent is True + + assert alert_group.reason_to_skip_escalation == reason + assert alert_group.slack_message is None + assert SlackMessage.objects.count() == 0 + assert not alert.delivered + + @patch( + "apps.slack.client.SlackClient.chat_postMessage", + side_effect=SlackAPIRatelimitError(build_slack_response({"error": "ratelimited"})), + ) + @pytest.mark.django_db + def test_process_signal_slack_api_ratelimit_for_maintenance_integration( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + """ + Test that when a SlackAPIRatelimitError occurs for a maintenance integration, + the exception is re-raised and slack_message_sent is set back to False. 
+ """ + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel( + organization, integration=AlertReceiveChannel.INTEGRATION_MAINTENANCE + ) + + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + + with pytest.raises(SlackAPIRatelimitError): + step.process_signal(alert) + + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + alert_group.refresh_from_db() + + # Ensure that slack_message_sent is set back to False, this will allow us to retry.. a SlackAPIRatelimitError, + # may have been a transient error that is "recoverable" + # + # NOTE: we only want to retry for maintenance integrations, for other integrations we should not retry (this + # case is tested above under test_process_signal_slack_errors) + assert alert_group.slack_message_sent is False + + assert alert_group.reason_to_skip_escalation == AlertGroup.NO_REASON # Should remain unchanged + assert SlackMessage.objects.count() == 0 + assert not alert.delivered + + @patch( + "apps.slack.client.SlackClient.chat_postMessage", + side_effect=SlackAPIFetchMembersFailedError(build_slack_response({"error": "fetch_members_failed"})), + ) + @pytest.mark.django_db + def test_process_signal_unhandled_slack_error( + self, + mock_chat_postMessage, + make_slack_team_identity, + make_slack_channel, + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert, + ): + """ + Test that when an unhandled SlackAPIError occurs, the exception is re-raised + and slack_message_sent is set back to False. 
+ """ + slack_team_identity = make_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) + alert_receive_channel = make_alert_receive_channel(organization) + + alert_group = make_alert_group(alert_receive_channel) + alert = make_alert(alert_group, raw_request_data={}) + + step = IncomingAlertStep(slack_team_identity) + + with pytest.raises(SlackAPIFetchMembersFailedError): step.process_signal(alert) - alert_group.refresh_from_db() - alert.refresh_from_db() - assert alert_group.slack_message is None - assert alert_group.slack_message_sent is False - assert SlackMessage.objects.count() == 0 - assert not alert.delivered - - -@patch.object(IncomingAlertStep, "_post_alert_group_to_slack") -@pytest.mark.django_db -def test_incoming_alert_no_channel_filter( - mock_post_alert_group_to_slack, - make_slack_team_identity, - make_slack_channel, - make_organization, - make_alert_receive_channel, - make_alert_group, - make_alert, -): - slack_team_identity = make_slack_team_identity() - slack_channel = make_slack_channel(slack_team_identity, slack_id="DEFAULT_CHANNEL_ID") - organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) - alert_receive_channel = make_alert_receive_channel(organization) - - # simulate an alert group with channel filter deleted in the middle of the escalation - alert_group = make_alert_group(alert_receive_channel, channel_filter=None) - alert = make_alert(alert_group, raw_request_data={}) - - step = IncomingAlertStep(slack_team_identity, organization) - step.process_signal(alert) - - assert mock_post_alert_group_to_slack.call_args[1]["slack_channel"] == slack_channel + mock_chat_postMessage.assert_called_once_with( + channel=slack_channel.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + alert_group.refresh_from_db() + + # This is not a Slack error we explicitly handle, so the exception is re-raised and + # slack_message_sent is set back to False, allowing the message post to be retried + assert alert_group.slack_message_sent is False + + assert alert_group.reason_to_skip_escalation == AlertGroup.NO_REASON # Should remain unchanged + assert SlackMessage.objects.count() == 0 + assert not alert.delivered diff --git a/engine/apps/slack/tests/test_installation.py b/engine/apps/slack/tests/test_installation.py index 23f313e1f1..3248abc847 100644 --- a/engine/apps/slack/tests/test_installation.py +++ b/engine/apps/slack/tests/test_installation.py @@ -119,25 +119,18 @@ def test_install_slack_integration_legacy(settings, make_organization_and_user, @pytest.mark.django_db def test_uninstall_slack_integration( mock_clean_slack_integration_leftovers, - make_organization_and_user, - make_slack_team_identity, - make_slack_user_identity, + make_organization_and_user_with_slack_identities, ): - slack_team_identity = make_slack_team_identity() - organization, user = make_organization_and_user() - organization.slack_team_identity = slack_team_identity - organization.save() - organization.refresh_from_db() + organization, user, _, _ = make_organization_and_user_with_slack_identities() - slack_user_identity = make_slack_user_identity(slack_team_identity=slack_team_identity) - user.slack_user_identity = slack_user_identity - user.save() - user.refresh_from_db() + assert organization.slack_team_identity is not None + assert user.slack_user_identity is not None 
uninstall_slack_integration(organization, user) organization.refresh_from_db() user.refresh_from_db() + assert organization.slack_team_identity is None assert user.slack_user_identity is None From fa071bcd6e4f8f7acf01b7b560c9bcce50bf663f Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Mon, 2 Dec 2024 10:53:18 -0500 Subject: [PATCH 6/7] chore: add `pytest-socket` library + disable network calls in tests (#5315) # What this PR does Inspired by [this discussion](https://github.com/grafana/oncall/pull/5307#discussion_r1862449480). _tldr;_ ensures that if any of our tests try making an external network call, they will fail. Setup an example test: ```python def test_external_network_call(): import requests response = requests.get('https://www.example.com') assert response.status_code == 200 ``` and it worked (failed; [example CI test run](https://github.com/grafana/oncall/actions/runs/12106416991/job/33752144727?pr=5315#step:6:389)) as expected: ```bash __________________________ test_external_network_call __________________________ def test_external_network_call(): import requests > response = requests.get('https://www.example.com') requests = apps/test_joey.py:4: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/requests/api.py:73: in get return request("get", url, params=params, **kwargs) kwargs = {} params = None url = 'https://www.example.com' /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/requests/api.py:59: in request return session.request(method=method, url=url, **kwargs) kwargs = {'params': None} method = 'get' session = url = 'https://www.example.com' /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/requests/sessions.py:589: in request resp = self.send(prep, **send_kwargs) allow_redirects = True auth = None cert = None cookies = None data = None files = None headers = None hooks = None json = None method = 'get' params = None prep = proxies = {} req = self = send_kwargs = {'allow_redirects': True, 'cert': None, 'proxies': OrderedDict(), 'stream': False, ...} settings = {'cert': None, 'proxies': OrderedDict(), 'stream': False, 'verify': True} stream = None timeout = None url = 'https://www.example.com' verify = None /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/requests/sessions.py:703: in send r = adapter.send(request, **kwargs) adapter = allow_redirects = True hooks = {'response': []} kwargs = {'cert': None, 'proxies': OrderedDict(), 'stream': False, 'timeout': None, ...} request = self = start = 1733064371.649901 stream = False /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/requests/adapters.py:667: in send resp = conn.urlopen( cert = None chunked = False conn = proxies = OrderedDict() request = self = stream = False timeout = Timeout(connect=None, read=None, total=None) url = '/' verify = True /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/urllib3/connectionpool.py:715: in urlopen httplib_response = self._make_request( assert_same_host = False body = None body_pos = None chunked = False clean_exit = False conn = None destination_scheme = None err = None headers = {'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'} http_tunnel_required = False is_new_proxy_conn = False method = 'GET' parsed_url = Url(scheme=None, auth=None, host=None, port=None, path='/', query=None, fragment=None) pool_timeout = None redirect = 
False release_conn = False release_this_conn = True response_kw = {'decode_content': False, 'preload_content': False} retries = Retry(total=0, connect=None, read=False, redirect=None, status=None) self = timeout = Timeout(connect=None, read=None, total=None) timeout_obj = Timeout(connect=None, read=None, total=None) url = '/' /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/urllib3/connectionpool.py:404: in _make_request self._validate_conn(conn) chunked = False conn = httplib_request_kw = {'body': None, 'headers': {'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}} method = 'GET' self = timeout = Timeout(connect=None, read=None, total=None) timeout_obj = Timeout(connect=None, read=None, total=None) url = '/' /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/urllib3/connectionpool.py:1060: in _validate_conn conn.connect() __class__ = conn = self = /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/urllib3/connection.py:363: in connect self.sock = conn = self._new_conn() self = /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/urllib3/connection.py:174: in _new_conn conn = connection.create_connection( extra_kw = {'socket_options': [(6, 1, 1)]} self = /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/urllib3/util/connection.py:85: in create_connection sock.connect(sa) address = ('www.example.com', 443) af = canonname = '' err = None family = host = 'www.example.com' port = 443 proto = 6 res = (, , 6, '', ('93.184.215.14', 443)) sa = ('93.184.215.14', 443) sock = socket_options = [(6, 1, 1)] socktype = source_address = None timeout = None _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ inst = args = (('93.184.215.14', 443),), host = '93.184.215.14' def guarded_connect(inst, *args): host = host_from_connect_args(args) if host in allowed_ip_hosts_and_hostnames or ( _is_unix_socket(inst.family) and allow_unix_socket ): return _true_connect(inst, *args) > raise SocketConnectBlockedError(allowed_list, host) E pytest_socket.SocketConnectBlockedError: A test tried to use socket.socket.connect() with host "93.184.215.14" (allowed: "calendar.google.com (142.251.167.100,142.251.167.101,142.251.167.102,142.251.167.113,142.251.167.138,142.251.167.139,2607:f8b0:4004:c09::65,2607:f8b0:4004:c09::66,2607:f8b0:4004:c09::71,2607:f8b0:4004:c09::8b),localhost (127.0.0.1,::1),oncall-dev-mariadb ()"). allow_unix_socket = False allowed_ip_hosts_and_hostnames = {'127.0.0.1', '142.251.167.100', '142.251.167.101', '142.251.167.102', '142.251.167.113', '142.251.167.138', ...} allowed_list = ['calendar.google.com (142.251.167.100,142.251.167.101,142.251.167.102,142.251.167.113,142.251.167.138,142.251.167.139...8b0:4004:c09::66,2607:f8b0:4004:c09::71,2607:f8b0:4004:c09::8b)', 'localhost (127.0.0.1,::1)', 'oncall-dev-mariadb ()'] args = (('93.184.215.14', 443),) host = '93.184.215.14' inst = /opt/hostedtoolcache/Python/3.12.3/x64/lib/python3.12/site-packages/pytest_socket.py:252: SocketConnectBlockedError ``` ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes. 
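
One usage note: for the rare test that genuinely needs a real socket (for example, one talking to a service running locally in Docker), pytest-socket documents per-test escape hatches, namely an `allow_hosts` marker and a `socket_enabled` fixture. A minimal sketch of how that could look (the local endpoint below is purely hypothetical, not something in our test suite):

```python
import pytest
import requests


@pytest.mark.allow_hosts(["127.0.0.1"])
def test_talks_to_local_service():
    # Connections to 127.0.0.1 are allowed for this test only; any other host
    # would still raise pytest_socket.SocketConnectBlockedError.
    response = requests.get("http://127.0.0.1:8080/health")  # hypothetical local endpoint
    assert response.status_code == 200


def test_with_sockets_enabled(socket_enabled):
    # The socket_enabled fixture lifts the socket restriction entirely for this test.
    assert requests.get("http://127.0.0.1:8080/health").ok  # hypothetical local endpoint
```

In practice we should rarely need these; mocking the HTTP call (as done elsewhere in this PR) remains the preferred approach.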
--- engine/apps/api/tests/test_schedule_export.py | 10 ++++------ engine/apps/api/tests/test_schedules.py | 17 +++++++++++++++-- .../apps/api/tests/test_user_schedule_export.py | 2 -- .../apps/auth_token/tests/test_plugin_auth.py | 9 +++++++-- .../apps/grafana_plugin/tests/test_sync_v2.py | 4 ++++ engine/requirements-dev.in | 1 + engine/requirements-dev.txt | 7 ++++--- engine/tox.ini | 9 ++++++++- 8 files changed, 43 insertions(+), 16 deletions(-) diff --git a/engine/apps/api/tests/test_schedule_export.py b/engine/apps/api/tests/test_schedule_export.py index f747067e60..1990024546 100644 --- a/engine/apps/api/tests/test_schedule_export.py +++ b/engine/apps/api/tests/test_schedule_export.py @@ -7,8 +7,6 @@ from apps.auth_token.models import ScheduleExportAuthToken from apps.schedules.models import OnCallScheduleICal -ICAL_URL = "https://calendar.google.com/calendar/ical/amixr.io_37gttuakhrtr75ano72p69rt78%40group.calendar.google.com/private-1d00a680ba5be7426c3eb3ef1616e26d/basic.ics" # noqa - @pytest.mark.django_db @pytest.mark.parametrize( @@ -32,7 +30,7 @@ def test_get_schedule_export_token( organization, schedule_class=OnCallScheduleICal, name="test_ical_schedule", - ical_url_primary=ICAL_URL, + ical_url_primary="https://example.com", ) ScheduleExportAuthToken.create_auth_token(user=user, organization=organization, schedule=schedule) @@ -68,7 +66,7 @@ def test_schedule_export_token_not_found( organization, schedule_class=OnCallScheduleICal, name="test_ical_schedule", - ical_url_primary=ICAL_URL, + ical_url_primary="https://example.com", ) url = reverse("api-internal:schedule-export-token", kwargs={"pk": schedule.public_primary_key}) @@ -102,7 +100,7 @@ def test_schedule_create_export_token( organization, schedule_class=OnCallScheduleICal, name="test_ical_schedule", - ical_url_primary=ICAL_URL, + ical_url_primary="https://example.com", ) url = reverse("api-internal:schedule-export-token", kwargs={"pk": schedule.public_primary_key}) @@ -136,7 +134,7 @@ def test_schedule_delete_export_token( organization, schedule_class=OnCallScheduleICal, name="test_ical_schedule", - ical_url_primary=ICAL_URL, + ical_url_primary="https://example.com", ) instance, _ = ScheduleExportAuthToken.create_auth_token(user=user, organization=organization, schedule=schedule) diff --git a/engine/apps/api/tests/test_schedules.py b/engine/apps/api/tests/test_schedules.py index 8efcb6b236..5c4e9bbf01 100644 --- a/engine/apps/api/tests/test_schedules.py +++ b/engine/apps/api/tests/test_schedules.py @@ -23,7 +23,20 @@ from apps.slack.models import SlackUserGroup from common.api_helpers.utils import create_engine_url, serialize_datetime_as_utc_timestamp -ICAL_URL = "https://calendar.google.com/calendar/ical/amixr.io_37gttuakhrtr75ano72p69rt78%40group.calendar.google.com/private-1d00a680ba5be7426c3eb3ef1616e26d/basic.ics" +ICAL_URL = "https://example.com" + + +@pytest.fixture(autouse=True) +def patch_fetch_ical_file(): + """ + NOTE: we patch this method for all tests in this file to avoid making actual HTTP requests.. 
we don't really need + to test the actual fetching of the ical file in these tests, so just simply mock out the response as an empty string + + Alternatively, if we really needed to, we could save .ical files locally here, and read/return those as the + return_value + """ + with patch("apps.schedules.ical_utils.fetch_ical_file", return_value=""): + yield @pytest.fixture() @@ -64,7 +77,7 @@ def schedule_internal_api_setup( def test_get_list_schedules( schedule_internal_api_setup, make_escalation_chain, make_escalation_policy, make_user_auth_headers ): - user, token, calendar_schedule, ical_schedule, web_schedule, slack_channel = schedule_internal_api_setup + user, token, calendar_schedule, ical_schedule, web_schedule, _ = schedule_internal_api_setup client = APIClient() url = reverse("api-internal:schedule-list") diff --git a/engine/apps/api/tests/test_user_schedule_export.py b/engine/apps/api/tests/test_user_schedule_export.py index eec82fa0da..7ebcd884d7 100644 --- a/engine/apps/api/tests/test_user_schedule_export.py +++ b/engine/apps/api/tests/test_user_schedule_export.py @@ -6,8 +6,6 @@ from apps.api.permissions import LegacyAccessControlRole from apps.auth_token.models import UserScheduleExportAuthToken -ICAL_URL = "https://calendar.google.com/calendar/ical/amixr.io_37gttuakhrtr75ano72p69rt78%40group.calendar.google.com/private-1d00a680ba5be7426c3eb3ef1616e26d/basic.ics" # noqa - @pytest.mark.django_db @pytest.mark.parametrize( diff --git a/engine/apps/auth_token/tests/test_plugin_auth.py b/engine/apps/auth_token/tests/test_plugin_auth.py index a44bd3777d..44440e73f7 100644 --- a/engine/apps/auth_token/tests/test_plugin_auth.py +++ b/engine/apps/auth_token/tests/test_plugin_auth.py @@ -1,4 +1,5 @@ import json +from unittest.mock import patch import pytest from django.utils import timezone @@ -79,8 +80,12 @@ def test_plugin_authentication_fail(authorization, instance_context): request = APIRequestFactory().get("/", **headers) - with pytest.raises(AuthenticationFailed): - PluginAuthentication().authenticate(request) + class MockCheckTokenResponse: + organization = None + + with patch("apps.auth_token.auth.check_token", return_value=MockCheckTokenResponse): + with pytest.raises(AuthenticationFailed): + PluginAuthentication().authenticate(request) @pytest.mark.django_db diff --git a/engine/apps/grafana_plugin/tests/test_sync_v2.py b/engine/apps/grafana_plugin/tests/test_sync_v2.py index 8f1a9271aa..59291a7ba9 100644 --- a/engine/apps/grafana_plugin/tests/test_sync_v2.py +++ b/engine/apps/grafana_plugin/tests/test_sync_v2.py @@ -142,6 +142,7 @@ def test_sync_v2_content_encoding( mock_sync.assert_called() +@patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.check_token", return_value=(None, {"connected": True})) @pytest.mark.parametrize( "irm_enabled,expected", [ @@ -151,6 +152,9 @@ def test_sync_v2_content_encoding( ) @pytest.mark.django_db def test_sync_v2_irm_enabled( + # mock this out so that we're not making a real network call, the sync v2 endpoint ends up calling + # user_management.sync._sync_organization which calls GrafanaApiClient.check_token + _mock_grafana_api_client_check_token, make_organization_and_user_with_plugin_token, make_user_auth_headers, settings, diff --git a/engine/requirements-dev.in b/engine/requirements-dev.in index 7d599ee8d2..ce5f71426a 100644 --- a/engine/requirements-dev.in +++ b/engine/requirements-dev.in @@ -8,6 +8,7 @@ mypy==1.4.1 pre-commit==2.15.0 pytest==8.2.2 pytest-django==4.8.0 +pytest-socket==0.7.0 pytest-xdist[psutil]==3.6.1 
pytest_factoryboy==2.7.0 types-beautifulsoup4==4.12.0.5 diff --git a/engine/requirements-dev.txt b/engine/requirements-dev.txt index 16256b89d2..03cf4606a6 100644 --- a/engine/requirements-dev.txt +++ b/engine/requirements-dev.txt @@ -91,11 +91,14 @@ pytest==8.2.2 # -r requirements-dev.in # pytest-django # pytest-factoryboy + # pytest-socket # pytest-xdist pytest-django==4.8.0 # via -r requirements-dev.in pytest-factoryboy==2.7.0 # via -r requirements-dev.in +pytest-socket==0.7.0 + # via -r requirements-dev.in pytest-xdist==3.6.1 # via -r requirements-dev.in python-dateutil==2.8.2 @@ -111,9 +114,7 @@ requests==2.32.3 # -c requirements.txt # djangorestframework-stubs setuptools==75.3.0 - # via - # -c requirements.txt - # nodeenv + # via nodeenv six==1.16.0 # via # -c requirements.txt diff --git a/engine/tox.ini b/engine/tox.ini index 908559ba5e..be2959bbe7 100644 --- a/engine/tox.ini +++ b/engine/tox.ini @@ -14,7 +14,14 @@ banned-modules = # # --dist no = temporarily disable xdist as it's leading to flaky tests :( # https://github.com/grafana/oncall-private/issues/2733 -addopts = --dist no --no-migrations --color=yes --showlocals + +# From pytest-socket docs (https://github.com/miketheman/pytest-socket): # A plugin to use with Pytest to disable or restrict socket calls during tests to ensure network calls are prevented # --disable-socket = tests should fail on any access to socket or libraries using socket with a SocketBlockedError # --allow-hosts = allow connections to the given hostnames/IPs. # - localhost = our tests on CI use localhost as the host to connect to databases running locally in a docker container # - oncall-dev-mariadb = if you're running things locally, with a MariaDB instance running, there's a good chance the hostname will be this +addopts = --dist no --no-migrations --color=yes --showlocals --disable-socket --allow-hosts=localhost,oncall-dev-mariadb # https://pytest-django.readthedocs.io/en/latest/faq.html#my-tests-are-not-being-found-why python_files = tests.py test_*.py *_tests.py From 26946f0d43ac61beb078c69d31f96a6d93164965 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Mon, 2 Dec 2024 14:40:30 -0500 Subject: [PATCH 7/7] fix: improve Slack rate limiting logic when updating alert groups (#5287) # What this PR does https://www.loom.com/share/1ac33822301444748133ffe72638ddc4 The two asks in the [original GH issue](https://github.com/grafana/oncall-private/issues/2947) were: > 1. Make the error message clearer. We can identify if it's delivering or updating and being rate-limited. This is possible because Slack sets limits per API method. Also, this limit is a per-slack channel while we are posting messages & applying ratelimit per on-call integration, which confuses customers. > 2. Debounce update alert group message in Slack Both of these have been addressed in this PR. ## Which issue(s) this PR closes Closes https://github.com/grafana/oncall-private/issues/2947 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes. 
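
The debouncing approach, in short: remember in the cache the id of the Celery task currently scheduled to update an alert group's Slack message, skip scheduling another one while that key exists, and let the task clear the key when it completes. A simplified, self-contained sketch of that pattern (not the actual model method added below; the cache key format and `DEBOUNCE_SECONDS` value here are illustrative):

```python
from celery import uuid as celery_uuid
from django.core.cache import cache

DEBOUNCE_SECONDS = 45  # illustrative value


def schedule_debounced_update(alert_group_pk: int, update_task) -> None:
    """Schedule update_task at most once per DEBOUNCE_SECONDS for a given alert group."""
    cache_key = f"update_slack_message_{alert_group_pk}"

    if cache.get(cache_key) is not None:
        # An update is already scheduled; scheduling another one would be redundant.
        return

    task_id = celery_uuid()
    # Persist the task id *before* scheduling so the task can verify it is still the
    # latest one; the cache timeout outlives the countdown, so a stale key eventually
    # expires even if the task never runs.
    cache.set(cache_key, task_id, timeout=DEBOUNCE_SECONDS * 2)
    update_task.apply_async((alert_group_pk,), countdown=DEBOUNCE_SECONDS, task_id=task_id)
```

The real implementation (`SlackMessage.update_alert_groups_message` in the diff below) additionally shortens the countdown based on how recently the message was last updated, and exposes a non-debounced path for low-traffic, user-initiated updates.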
--- .../alerts/models/alert_receive_channel.py | 8 +- .../apps/slack/alert_group_slack_service.py | 39 +- engine/apps/slack/constants.py | 1 - .../0007_migrate_slackmessage_channel_id.py | 0 engine/apps/slack/models/slack_message.py | 92 ++++- .../apps/slack/scenarios/distribute_alerts.py | 79 ++-- .../apps/slack/scenarios/resolution_note.py | 7 +- .../scenarios/slack_channel_integration.py | 5 +- engine/apps/slack/tasks.py | 161 ++++++-- .../scenario_steps/test_distribute_alerts.py | 49 ++- .../scenario_steps/test_resolution_note.py | 4 + .../test_slack_channel_integration.py | 16 +- .../test_update_alert_group_slack_message.py | 377 ++++++++++++++++++ engine/apps/slack/tests/test_slack_message.py | 317 +++++++++++++-- engine/apps/slack/utils.py | 5 - engine/settings/celery_task_routes.py | 2 + 16 files changed, 975 insertions(+), 187 deletions(-) rename engine/apps/slack/{ => migrations}/0007_migrate_slackmessage_channel_id.py (100%) create mode 100644 engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py diff --git a/engine/apps/alerts/models/alert_receive_channel.py b/engine/apps/alerts/models/alert_receive_channel.py index 91a7db6a12..f4661a3a10 100644 --- a/engine/apps/alerts/models/alert_receive_channel.py +++ b/engine/apps/alerts/models/alert_receive_channel.py @@ -29,7 +29,7 @@ metrics_remove_deleted_integration_from_cache, metrics_update_integration_cache, ) -from apps.slack.constants import SLACK_RATE_LIMIT_DELAY, SLACK_RATE_LIMIT_TIMEOUT +from apps.slack.constants import SLACK_RATE_LIMIT_TIMEOUT from apps.slack.tasks import post_slack_rate_limit_message from apps.slack.utils import post_message_to_channel from common.api_helpers.utils import create_engine_url @@ -442,12 +442,14 @@ def is_rate_limited_in_slack(self) -> bool: and self.rate_limited_in_slack_at + SLACK_RATE_LIMIT_TIMEOUT > timezone.now() ) - def start_send_rate_limit_message_task(self, delay=SLACK_RATE_LIMIT_DELAY): + def start_send_rate_limit_message_task(self, error_message_verb: str, delay: int) -> None: task_id = celery_uuid() + self.rate_limit_message_task_id = task_id self.rate_limited_in_slack_at = timezone.now() self.save(update_fields=["rate_limit_message_task_id", "rate_limited_in_slack_at"]) - post_slack_rate_limit_message.apply_async((self.pk,), countdown=delay, task_id=task_id) + + post_slack_rate_limit_message.apply_async((self.pk, error_message_verb), countdown=delay, task_id=task_id) @property def alert_groups_count(self) -> int: diff --git a/engine/apps/slack/alert_group_slack_service.py b/engine/apps/slack/alert_group_slack_service.py index 6ff514040f..019a6e217c 100644 --- a/engine/apps/slack/alert_group_slack_service.py +++ b/engine/apps/slack/alert_group_slack_service.py @@ -3,13 +3,9 @@ from apps.slack.client import SlackClient from apps.slack.errors import ( - SlackAPICantUpdateMessageError, SlackAPIChannelArchivedError, - SlackAPIChannelInactiveError, SlackAPIChannelNotFoundError, SlackAPIInvalidAuthError, - SlackAPIMessageNotFoundError, - SlackAPIRatelimitError, SlackAPITokenError, ) @@ -34,41 +30,12 @@ def __init__( else: self._slack_client = SlackClient(slack_team_identity) - def update_alert_group_slack_message(self, alert_group: "AlertGroup") -> None: - logger.info(f"Update message for alert_group {alert_group.pk}") - - try: - self._slack_client.chat_update( - # TODO: once _channel_id has been fully migrated to channel, remove _channel_id - # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 - # channel=alert_group.slack_message.channel.slack_id, - 
channel=alert_group.slack_message._channel_id, - ts=alert_group.slack_message.slack_id, - attachments=alert_group.render_slack_attachments(), - blocks=alert_group.render_slack_blocks(), - ) - logger.info(f"Message has been updated for alert_group {alert_group.pk}") - except SlackAPIRatelimitError as e: - if not alert_group.channel.is_maintenace_integration: - if not alert_group.channel.is_rate_limited_in_slack: - alert_group.channel.start_send_rate_limit_message_task(e.retry_after) - logger.info( - f"Message has not been updated for alert_group {alert_group.pk} due to slack rate limit." - ) - else: - raise - except ( - SlackAPIMessageNotFoundError, - SlackAPICantUpdateMessageError, - SlackAPIChannelInactiveError, - SlackAPITokenError, - SlackAPIChannelNotFoundError, - ): - pass - def publish_message_to_alert_group_thread( self, alert_group: "AlertGroup", attachments=None, mrkdwn=True, unfurl_links=True, text=None ) -> None: + """ + TODO: refactor this method and move it to the `SlackMessage` model, such that we can remove this class.. + """ # TODO: refactor checking the possibility of sending message to slack # do not try to post message to slack if integration is rate limited if alert_group.channel.is_rate_limited_in_slack: diff --git a/engine/apps/slack/constants.py b/engine/apps/slack/constants.py index f72529fdb8..f21b7299c2 100644 --- a/engine/apps/slack/constants.py +++ b/engine/apps/slack/constants.py @@ -10,7 +10,6 @@ SLACK_RATE_LIMIT_TIMEOUT = datetime.timedelta(minutes=5) SLACK_RATE_LIMIT_DELAY = 10 -CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME = 60 * 10 BLOCK_SECTION_TEXT_MAX_SIZE = 2800 PRIVATE_METADATA_MAX_LENGTH = 3000 diff --git a/engine/apps/slack/0007_migrate_slackmessage_channel_id.py b/engine/apps/slack/migrations/0007_migrate_slackmessage_channel_id.py similarity index 100% rename from engine/apps/slack/0007_migrate_slackmessage_channel_id.py rename to engine/apps/slack/migrations/0007_migrate_slackmessage_channel_id.py diff --git a/engine/apps/slack/models/slack_message.py b/engine/apps/slack/models/slack_message.py index 37fa6f01e8..3355b701c1 100644 --- a/engine/apps/slack/models/slack_message.py +++ b/engine/apps/slack/models/slack_message.py @@ -3,7 +3,10 @@ import typing import uuid +from celery import uuid as celery_uuid +from django.core.cache import cache from django.db import models +from django.utils import timezone from apps.slack.client import SlackClient from apps.slack.constants import BLOCK_SECTION_TEXT_MAX_SIZE @@ -15,6 +18,7 @@ SlackAPIRatelimitError, SlackAPITokenError, ) +from apps.slack.tasks import update_alert_group_slack_message if typing.TYPE_CHECKING: from apps.alerts.models import AlertGroup @@ -30,6 +34,8 @@ class SlackMessage(models.Model): alert_group: typing.Optional["AlertGroup"] channel: "SlackChannel" + ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS = 45 + id = models.CharField(primary_key=True, default=uuid.uuid4, editable=False, max_length=36) slack_id = models.CharField(max_length=100) @@ -85,7 +91,7 @@ class SlackMessage(models.Model): active_update_task_id = models.CharField(max_length=100, null=True, default=None) """ - ID of the latest celery task to update the message + DEPRECATED/TODO: drop this field in a separate PR/release """ class Meta: @@ -259,3 +265,87 @@ def send_slack_notification( slack_user_identity.send_link_to_slack_message(slack_message) except (SlackAPITokenError, SlackAPIMethodNotSupportedForChannelTypeError): pass + + def _get_update_message_cache_key(self) -> str: + return 
f"update_alert_group_slack_message_{self.alert_group.pk}" + + def get_active_update_task_id(self) -> typing.Optional[str]: + return cache.get(self._get_update_message_cache_key(), default=None) + + def set_active_update_task_id(self, task_id: str) -> None: + """ + NOTE: we store the task ID in the cache for twice the debounce interval to ensure that the task ID is + EVENTUALLY removed. The background task which updates the message will remove the task ID from the cache, but + this is a safety measure in case the task fails to run or complete. The task ID would be removed from the cache + which would then allow the message to be updated again in a subsequent call to this method. + """ + cache.set( + self._get_update_message_cache_key(), + task_id, + timeout=self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS * 2, + ) + + def mark_active_update_task_as_complete(self) -> None: + self.last_updated = timezone.now() + self.save(update_fields=["last_updated"]) + + cache.delete(self._get_update_message_cache_key()) + + def update_alert_groups_message(self, debounce: bool) -> None: + """ + Schedule an update task for the associated alert group's Slack message, respecting the debounce interval. + + This method ensures that updates to the Slack message related to an alert group are not performed + too frequently, adhering to the `ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS` debounce interval. + It schedules a background task to update the message after the appropriate countdown. + + The method performs the following steps: + - Checks if there's already an active update task ID set in the cache. If so, exits to prevent + duplicate scheduling. + - Calculates the time since the last update (`last_updated` field) and determines the remaining time needed + to respect the debounce interval. + - Schedules the `update_alert_group_slack_message` task with the calculated countdown. + - Stores the task ID in the cache to prevent multiple tasks from being scheduled. + + debounce: bool - this is intended to be used when we want to debounce updates to the message. Examples: + - when set to True, we will skip scheduling an update task if there's an active update task (eg. 
debounce it) + - when set to False, we will immediately schedule an update task + """ + if not self.alert_group: + logger.warning( + f"skipping update_alert_groups_message as SlackMessage {self.pk} has no alert_group associated with it" + ) + return + + active_update_task_id = self.get_active_update_task_id() + if debounce and active_update_task_id is not None: + logger.info( + f"skipping update_alert_groups_message as SlackMessage {self.pk} has an active update task " + f"{active_update_task_id} and debounce is set to True" + ) + return + + now = timezone.now() + + # we previously weren't updating the last_updated field for messages, so there will be cases + # where the last_updated field is None + last_updated = self.last_updated or now + + time_since_last_update = (now - last_updated).total_seconds() + remaining_time = self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS - int(time_since_last_update) + countdown = max(remaining_time, 10) if debounce else 0 + + logger.info( + f"updating message for alert_group {self.alert_group.pk} in {countdown} seconds " + f"(debounce interval: {self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS})" + ) + + task_id = celery_uuid() + + # NOTE: we need to persist the task ID in the cache before scheduling the task to prevent + # a race condition where the task starts before the task ID is stored in the cache as the task + # does a check to verify that the celery task id matches the one stored in the cache + # + # (see update_alert_group_slack_message task for more details) + self.set_active_update_task_id(task_id) + update_alert_group_slack_message.apply_async((self.pk,), countdown=countdown, task_id=task_id) diff --git a/engine/apps/slack/scenarios/distribute_alerts.py b/engine/apps/slack/scenarios/distribute_alerts.py index cd088bf7b5..8be2095f05 100644 --- a/engine/apps/slack/scenarios/distribute_alerts.py +++ b/engine/apps/slack/scenarios/distribute_alerts.py @@ -3,15 +3,12 @@ import typing from datetime import datetime -from django.core.cache import cache - from apps.alerts.constants import ActionSource from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertSlackRenderer from apps.alerts.models import Alert, AlertGroup, AlertGroupLogRecord, AlertReceiveChannel, Invitation from apps.api.permissions import RBACPermission from apps.slack.chatops_proxy_routing import make_private_metadata, make_value -from apps.slack.constants import CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME from apps.slack.errors import ( SlackAPIChannelArchivedError, SlackAPIChannelNotFoundError, @@ -25,7 +22,7 @@ from apps.slack.models import SlackTeamIdentity, SlackUserIdentity from apps.slack.scenarios import scenario_step from apps.slack.slack_formatter import SlackFormatter -from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel, update_incident_slack_message +from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel from apps.slack.types import ( Block, BlockActionType, @@ -36,7 +33,6 @@ PayloadType, ScenarioRoute, ) -from apps.slack.utils import get_cache_key_update_incident_slack_message from common.utils import clean_markup, is_string_with_visible_characters from .step_mixins import AlertGroupActionsMixin @@ -116,6 +112,7 @@ def process_signal(self, alert: Alert) -> None: # do not try to post alert group message to slack if its channel is rate limited if alert_receive_channel.is_rate_limited_in_slack: logger.info("Skip posting or updating 
alert_group in Slack due to rate limit") + AlertGroup.objects.filter( pk=alert_group_pk, slack_message_sent=False, @@ -184,9 +181,9 @@ def process_signal(self, alert: Alert) -> None: if not alert_receive_channel.is_maintenace_integration: # we do not want to rate limit maintenace alerts.. reason_to_skip_escalation = AlertGroup.RATE_LIMITED - extra_log_msg += ( - f" integration is a maintenance integration alert_receive_channel={alert_receive_channel}" - ) + extra_log_msg += f" integration is a maintenance integration alert_receive_channel={alert_receive_channel.pk}" + + alert_receive_channel.start_send_rate_limit_message_task("Delivering", e.retry_after) else: reraise_exception = True elif isinstance(e, SlackAPITokenError): @@ -268,18 +265,24 @@ def process_signal(self, alert: Alert) -> None: else: # if a new alert comes in, and is grouped to an alert group that has already been posted to Slack, # then we will update that existing Slack message - if not should_skip_escalation_in_slack: - update_task_id = update_incident_slack_message.apply_async( - (self.slack_team_identity.pk, alert_group_pk), - countdown=10, + + alert_group_slack_message = alert_group.slack_message + if not alert_group_slack_message: + logger.info( + f"Skip updating alert group in Slack because alert_group {alert_group_pk} doesn't " + "have a slack message associated with it" ) - cache.set( - get_cache_key_update_incident_slack_message(alert_group_pk), - update_task_id, - timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME, + return + elif should_skip_escalation_in_slack: + logger.info( + f"Skip updating alert group in Slack because alert_group {alert_group_pk} is set to skip escalation" ) - else: - logger.info("Skip updating alert_group in Slack due to rate limit") + return + + # NOTE: very important. We need to debounce the update_alert_groups_message call here. This is because + # we may possibly receive a flood of incoming alerts. We do not want to trigger a Slack message update + # for each of these, and hence we should instead debounce them + alert_group_slack_message.update_alert_groups_message(debounce=True) def process_scenario( self, @@ -324,13 +327,16 @@ def process_scenario( # for old version with user slack_id selection warning_text = "Oops! 
Something goes wrong, please try again" self.open_warning_window(payload, warning_text) + if selected_user is not None: Invitation.invite_user(selected_user, alert_group, self.user) else: - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class SilenceGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -360,7 +366,8 @@ def process_scenario( ) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class UnSilenceGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -381,7 +388,8 @@ def process_scenario( alert_group.un_silence_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class SelectAttachGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -555,7 +563,8 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: unfurl_links=True, ) - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) def process_scenario( self, @@ -625,7 +634,8 @@ def process_scenario( alert_group.un_attach_by_user(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class StopInvitationProcess(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -658,7 +668,8 @@ def process_scenario( Invitation.stop_invitation(invitation_id, self.user) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.invitation.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class ResolveGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -696,11 +707,11 @@ def process_scenario( alert_group.resolve_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - alert_group = log_record.alert_group # Do not rerender 
alert_groups which happened while maintenance. # They have no slack messages, since they just attached to the maintenance incident. - if not alert_group.happened_while_maintenance: - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + if not log_record.alert_group.happened_while_maintenance: + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class UnResolveGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -721,7 +732,8 @@ def process_scenario( alert_group.un_resolve_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class AcknowledgeGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -742,7 +754,8 @@ def process_scenario( alert_group.acknowledge_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class UnAcknowledgeGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -811,7 +824,8 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: alert_group, attachments=message_attachments, text=text ) - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + slack_message.update_alert_groups_message(debounce=False) logger.debug(f"Finished process_signal in UnAcknowledgeGroupStep for alert_group {alert_group.pk}") @@ -932,7 +946,8 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: text=f"Wiped by {log_record.author.get_username_with_slack_verbal()}", ) - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) class DeleteGroupStep(scenario_step.ScenarioStep): diff --git a/engine/apps/slack/scenarios/resolution_note.py b/engine/apps/slack/scenarios/resolution_note.py index 130a102f85..9fe9536871 100644 --- a/engine/apps/slack/scenarios/resolution_note.py +++ b/engine/apps/slack/scenarios/resolution_note.py @@ -222,7 +222,8 @@ def process_scenario( except SlackAPIError: pass - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + slack_message.update_alert_groups_message(debounce=False) else: warning_text = "Unable to add this message to resolution note." 
self.open_warning_window(payload, warning_text) @@ -261,6 +262,7 @@ def post_or_update_resolution_note_in_thread(self, resolution_note: "ResolutionN resolution_note_slack_message = resolution_note.resolution_note_slack_message alert_group = resolution_note.alert_group alert_group_slack_message = alert_group.slack_message + blocks = self.get_resolution_note_blocks(resolution_note) # TODO: once _channel_id has been fully migrated to channel, remove _channel_id @@ -321,7 +323,8 @@ def post_or_update_resolution_note_in_thread(self, resolution_note: "ResolutionN def update_alert_group_resolution_note_button(self, alert_group: "AlertGroup") -> None: if alert_group.slack_message is not None: - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) def add_resolution_note_reaction(self, slack_thread_message: "ResolutionNoteSlackMessage"): try: diff --git a/engine/apps/slack/scenarios/slack_channel_integration.py b/engine/apps/slack/scenarios/slack_channel_integration.py index 4d9b6be314..140b2af7b3 100644 --- a/engine/apps/slack/scenarios/slack_channel_integration.py +++ b/engine/apps/slack/scenarios/slack_channel_integration.py @@ -145,9 +145,10 @@ def delete_thread_message_from_resolution_note( except ResolutionNoteSlackMessage.DoesNotExist: pass else: - alert_group = slack_thread_message.alert_group slack_thread_message.delete() - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + slack_thread_message.alert_group.slack_message.update_alert_groups_message(debounce=False) STEPS_ROUTING: ScenarioRoute.RoutingSteps = [ diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index bfd6f22fd5..deff2cba28 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -11,19 +11,19 @@ from apps.slack.alert_group_slack_service import AlertGroupSlackService from apps.slack.client import SlackClient -from apps.slack.constants import CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME, SLACK_BOT_ID +from apps.slack.constants import SLACK_BOT_ID from apps.slack.errors import ( + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPIChannelNotFoundError, SlackAPIInvalidAuthError, + SlackAPIMessageNotFoundError, SlackAPIPlanUpgradeRequiredError, SlackAPIRatelimitError, SlackAPITokenError, SlackAPIUsergroupNotFoundError, ) -from apps.slack.utils import ( - get_cache_key_update_incident_slack_message, - get_populate_slack_channel_task_id_key, - post_message_to_channel, -) +from apps.slack.utils import get_populate_slack_channel_task_id_key, post_message_to_channel from common.custom_celery_tasks import shared_dedicated_queue_retry_task from common.utils import batch_queryset @@ -32,39 +32,120 @@ @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True) -def update_incident_slack_message(slack_team_identity_pk, alert_group_pk): - cache_key = get_cache_key_update_incident_slack_message(alert_group_pk) - cached_task_id = cache.get(cache_key) - current_task_id = update_incident_slack_message.request.id - - if cached_task_id is None: - update_task_id = update_incident_slack_message.apply_async( - (slack_team_identity_pk, alert_group_pk), - countdown=10, +def update_alert_group_slack_message(slack_message_pk: int) -> None: + """ + 
Background task to update the Slack message for an alert group. + + This function is intended to be executed as a Celery task. It performs the following: + - Compares the current task ID with the task ID stored in the cache. + - If they do not match, it means a newer task has been scheduled, so the current task exits to prevent duplicated updates. + - Does the actual update of the Slack message. + - Upon successful completion, clears the task ID from the cache to allow future updates (also note that + the task ID is set in the cache with a timeout, so it will be automatically cleared after a certain period, even + if this task fails to clear it. See `SlackMessage.update_alert_groups_message` for more details). + + Args: + slack_message_pk (int): The primary key of the `SlackMessage` instance to update. + """ + from apps.slack.models import SlackMessage + + current_task_id = update_alert_group_slack_message.request.id + + logger.info( + f"update_alert_group_slack_message for slack message {slack_message_pk} started with task_id {current_task_id}" + ) + + try: + slack_message = SlackMessage.objects.get(pk=slack_message_pk) + except SlackMessage.DoesNotExist: + logger.warning(f"SlackMessage {slack_message_pk} doesn't exist") + return + + active_update_task_id = slack_message.get_active_update_task_id() + if current_task_id != active_update_task_id: + logger.warning( + f"update_alert_group_slack_message skipped, because current_task_id ({current_task_id}) " + f"does not equal to active_update_task_id ({active_update_task_id}) " ) - cache.set(cache_key, update_task_id, timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME) + return - return ( - f"update_incident_slack_message rescheduled because of current task_id ({current_task_id})" - f" for alert_group {alert_group_pk} doesn't exist in cache" + alert_group = slack_message.alert_group + if not alert_group: + logger.warning( + f"skipping update_alert_group_slack_message as SlackMessage {slack_message_pk} " + "doesn't have an alert group associated with it" ) - if not current_task_id == cached_task_id: - return ( - f"update_incident_slack_message skipped, because of current task_id ({current_task_id})" - f" doesn't equal to cached task_id ({cached_task_id}) for alert_group {alert_group_pk}" + return + + alert_group_pk = alert_group.pk + alert_receive_channel = alert_group.channel + alert_receive_channel_is_rate_limited = alert_receive_channel.is_rate_limited_in_slack + + if alert_group.skip_escalation_in_slack: + logger.warning( + f"skipping update_alert_group_slack_message as AlertGroup {alert_group_pk} " + "has skip_escalation_in_slack set to True" ) + return + elif alert_receive_channel_is_rate_limited: + logger.warning( + f"skipping update_alert_group_slack_message as AlertGroup {alert_group.pk}'s " + f"integration ({alert_receive_channel.pk}) is rate-limited" + ) + return + + slack_client = SlackClient(slack_message.slack_team_identity) + try: + slack_client.chat_update( + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=slack_message.channel.slack_id, + channel=slack_message._channel_id, + ts=slack_message.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + logger.info(f"Message has been updated for alert_group {alert_group_pk}") + except SlackAPIRatelimitError as e: + if not alert_receive_channel.is_maintenace_integration: + if not 
alert_receive_channel_is_rate_limited: + alert_receive_channel.start_send_rate_limit_message_task("Updating", e.retry_after) + logger.info(f"Message has not been updated for alert_group {alert_group_pk} due to slack rate limit.") + else: + raise + except ( + SlackAPIMessageNotFoundError, + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPITokenError, + SlackAPIChannelNotFoundError, + ): + pass + + slack_message.mark_active_update_task_as_complete() + + +@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True) +def update_incident_slack_message(slack_team_identity_pk: int, alert_group_pk: int) -> None: + """ + TODO: this method has been deprecated, and all references to it removed, remove it once task queues no + longer reference it. + """ from apps.alerts.models import AlertGroup - from apps.slack.models import SlackTeamIdentity - slack_team_identity = SlackTeamIdentity.objects.get(pk=slack_team_identity_pk) alert_group = AlertGroup.objects.get(pk=alert_group_pk) - if alert_group.skip_escalation_in_slack or alert_group.channel.is_rate_limited_in_slack: - return "Skip message update in Slack due to rate limit" - if alert_group.slack_message is None: - return "Skip message update in Slack due to absence of slack message" - AlertGroupSlackService(slack_team_identity).update_alert_group_slack_message(alert_group) + # NOTE: alert_group can't be None here, AlertGroup.objects.get(pk=alert_group_pk) would + # raise AlertGroup.DoesNotExist in this case + if not alert_group.slack_message: + logger.info( + f"skipping update_incident_slack_message as AlertGroup {alert_group_pk} doesn't have a slack message" + ) + return + + alert_group.slack_message.update_alert_groups_message(debounce=False) @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True) @@ -153,7 +234,6 @@ def send_message_to_thread_if_bot_not_in_channel( """ Send message to alert group's thread if bot is not in current channel """ - from apps.alerts.models import AlertGroup from apps.slack.models import SlackTeamIdentity @@ -286,7 +366,13 @@ def populate_slack_user_identities(organization_pk): @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def post_slack_rate_limit_message(integration_id): +def post_slack_rate_limit_message(integration_id: int, error_message_verb: typing.Optional[str] = None) -> None: + """ + NOTE: error_message_verb was added to the function signature to allow for more descriptive error messages. + + We set it to None by default to maintain backwards compatibility with existing tasks. The default of None + can likely be removed in the near future (once existing tasks on the queue have been processed). + """ from apps.alerts.models import AlertReceiveChannel try: @@ -304,11 +390,14 @@ def post_slack_rate_limit_message(integration_id): default_route = integration.channel_filters.get(is_default=True) if (slack_channel := default_route.slack_channel_or_org_default) is not None: + # NOTE: see function docstring above 👆 + if error_message_verb is None: + error_message_verb = "Sending messages for" + text = ( - f"Delivering and updating alert groups of integration {integration.verbal_name} in Slack is " - f"temporarily stopped due to rate limit. 
You could find new alert groups at " - f"<{integration.new_incidents_web_link}|web page " - '"Alert Groups">' + f"{error_message_verb} Alert Groups in Slack, for integration {integration.verbal_name}, is " + f"temporarily rate-limited (due to a Slack rate-limit). Meanwhile, you can still find new Alert Groups " + f'in the <{integration.new_incidents_web_link}|"Alert Groups" web page>' ) post_message_to_channel(integration.organization, slack_channel.slack_id, text) diff --git a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py index e815d78036..06017ef996 100644 --- a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py +++ b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py @@ -2,7 +2,6 @@ from unittest.mock import patch import pytest -from django.core.cache import cache from django.utils import timezone from apps.alerts.models import AlertGroup, AlertReceiveChannel @@ -10,7 +9,6 @@ from apps.slack.models import SlackMessage from apps.slack.scenarios.distribute_alerts import IncomingAlertStep from apps.slack.tests.conftest import build_slack_response -from apps.slack.utils import get_cache_key_update_incident_slack_message SLACK_MESSAGE_TS = "1234567890.123456" SLACK_POST_MESSAGE_SUCCESS_RESPONSE = {"ts": SLACK_MESSAGE_TS} @@ -283,22 +281,20 @@ def test_process_signal_send_message_to_thread_if_bot_not_in_channel( ) @patch("apps.slack.client.SlackClient.chat_postMessage") - @patch("apps.slack.scenarios.distribute_alerts.update_incident_slack_message") + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") @pytest.mark.django_db def test_process_signal_update_existing_message( self, - mock_update_incident_slack_message, + mock_update_alert_groups_message, mock_chat_postMessage, make_slack_team_identity, make_slack_channel, + make_slack_message, make_organization, make_alert_receive_channel, make_alert_group, make_alert, ): - mocked_update_incident_task_id = "1234" - mock_update_incident_slack_message.apply_async.return_value = mocked_update_incident_task_id - slack_team_identity = make_slack_team_identity() slack_channel = make_slack_channel(slack_team_identity) organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) @@ -310,6 +306,8 @@ def test_process_signal_update_existing_message( slack_message_sent=True, reason_to_skip_escalation=AlertGroup.NO_REASON, ) + make_slack_message(slack_channel, alert_group=alert_group) + assert alert_group.skip_escalation_in_slack is False alert = make_alert(alert_group, raw_request_data={}) @@ -317,23 +315,20 @@ def test_process_signal_update_existing_message( step = IncomingAlertStep(slack_team_identity) step.process_signal(alert) - # assert that the background task is scheduled - mock_update_incident_slack_message.apply_async.assert_called_once_with( - (slack_team_identity.pk, alert_group.pk), countdown=10 - ) + # assert that the SlackMessage is updated, and that it is debounced + mock_update_alert_groups_message.assert_called_once_with(debounce=True) mock_chat_postMessage.assert_not_called() - # Verify that the cache is set correctly - assert cache.get(get_cache_key_update_incident_slack_message(alert_group.pk)) == mocked_update_incident_task_id - @patch("apps.slack.client.SlackClient.chat_postMessage") - @patch("apps.slack.scenarios.distribute_alerts.update_incident_slack_message") + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") @pytest.mark.django_db 
def test_process_signal_do_not_update_due_to_skip_escalation( self, - mock_update_incident_slack_message, + mock_update_alert_groups_message, mock_chat_postMessage, make_organization_with_slack_team_identity, + make_slack_channel, + make_slack_message, make_alert_receive_channel, make_alert_group, make_alert, @@ -343,6 +338,7 @@ def test_process_signal_do_not_update_due_to_skip_escalation( """ organization, slack_team_identity = make_organization_with_slack_team_identity() alert_receive_channel = make_alert_receive_channel(organization) + slack_channel = make_slack_channel(slack_team_identity) # Simulate that slack_message_sent is already True and skip escalation due to RATE_LIMITED alert_group = make_alert_group( @@ -351,12 +347,13 @@ def test_process_signal_do_not_update_due_to_skip_escalation( reason_to_skip_escalation=AlertGroup.RATE_LIMITED, # Ensures skip_escalation_in_slack is True ) alert = make_alert(alert_group, raw_request_data={}) + make_slack_message(slack_channel, alert_group=alert_group) step = IncomingAlertStep(slack_team_identity) step.process_signal(alert) - # assert that the background task is not scheduled - mock_update_incident_slack_message.apply_async.assert_not_called() + # assert that we don't update the SlackMessage + mock_update_alert_groups_message.assert_not_called() mock_chat_postMessage.assert_not_called() @patch("apps.slack.client.SlackClient.chat_postMessage", side_effect=TimeoutError) @@ -399,6 +396,7 @@ def test_process_signal_timeout_error( assert SlackMessage.objects.count() == 0 assert not alert.delivered + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") @pytest.mark.parametrize( "reason,slack_error", [ @@ -412,6 +410,7 @@ def test_process_signal_timeout_error( @pytest.mark.django_db def test_process_signal_slack_errors( self, + mock_start_send_rate_limit_message_task, make_slack_team_identity, make_organization, make_alert_receive_channel, @@ -433,7 +432,8 @@ def test_process_signal_slack_errors( with patch.object(step._slack_client, "chat_postMessage") as mock_chat_postMessage: error_response = build_slack_response({"error": slack_error}) error_class = get_error_class(error_response) - mock_chat_postMessage.side_effect = error_class(error_response) + slack_error_raised = error_class(error_response) + mock_chat_postMessage.side_effect = slack_error_raised step.process_signal(alert) @@ -446,6 +446,13 @@ def test_process_signal_slack_errors( blocks=alert_group.render_slack_blocks(), ) + if error_class == SlackAPIRatelimitError: + mock_start_send_rate_limit_message_task.assert_called_once_with( + "Delivering", slack_error_raised.retry_after + ) + else: + mock_start_send_rate_limit_message_task.assert_not_called() + # For these Slack errors, retrying won't really help, so we should not set slack_message_sent back to False assert alert_group.slack_message_sent is True @@ -454,6 +461,7 @@ def test_process_signal_slack_errors( assert SlackMessage.objects.count() == 0 assert not alert.delivered + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") @patch( "apps.slack.client.SlackClient.chat_postMessage", side_effect=SlackAPIRatelimitError(build_slack_response({"error": "ratelimited"})), @@ -462,6 +470,7 @@ def test_process_signal_slack_errors( def test_process_signal_slack_api_ratelimit_for_maintenance_integration( self, mock_chat_postMessage, + mock_start_send_rate_limit_message_task, make_slack_team_identity, make_slack_channel, make_organization, @@ -496,6 +505,8 @@ def 
test_process_signal_slack_api_ratelimit_for_maintenance_integration( alert_group.refresh_from_db() + mock_start_send_rate_limit_message_task.assert_not_called() + # Ensure that slack_message_sent is set back to False, this will allow us to retry.. a SlackAPIRatelimitError, # may have been a transient error that is "recoverable" # diff --git a/engine/apps/slack/tests/scenario_steps/test_resolution_note.py b/engine/apps/slack/tests/scenario_steps/test_resolution_note.py index 0f1598041c..28124202b5 100644 --- a/engine/apps/slack/tests/scenario_steps/test_resolution_note.py +++ b/engine/apps/slack/tests/scenario_steps/test_resolution_note.py @@ -347,12 +347,14 @@ def test_resolution_notes_modal_closed_before_update( assert call_args[0] == "views.update" +@patch("apps.slack.models.SlackMessage.update_alert_groups_message") @patch.object(SlackClient, "reactions_add") @patch.object(SlackClient, "chat_getPermalink", return_value={"permalink": "https://example.com"}) @pytest.mark.django_db def test_add_to_resolution_note( _mock_chat_getPermalink, mock_reactions_add, + mock_update_alert_groups_message, make_organization_and_user_with_slack_identities, make_alert_receive_channel, make_alert_group, @@ -386,6 +388,8 @@ def test_add_to_resolution_note( step.process_scenario(slack_user_identity, slack_team_identity, payload) mock_reactions_add.assert_called_once() + mock_update_alert_groups_message.assert_called_once_with(debounce=False) + assert alert_group.resolution_notes.get().text == "Test resolution note" diff --git a/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py b/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py index 993692d6bc..a081cb0e06 100644 --- a/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py +++ b/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py @@ -393,8 +393,9 @@ def test_delete_thread_message_from_resolution_note_no_slack_user_identity( MockResolutionNoteSlackMessage.objects.get.assert_not_called() + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") def test_delete_thread_message_from_resolution_note_no_message_found( - self, make_organization_and_user_with_slack_identities + self, mock_update_alert_groups_message, make_organization_and_user_with_slack_identities ) -> None: ( organization, @@ -418,19 +419,20 @@ def test_delete_thread_message_from_resolution_note_no_message_found( } step = SlackChannelMessageEventStep(slack_team_identity, organization, user) - step.alert_group_slack_service = Mock() - step.delete_thread_message_from_resolution_note(slack_user_identity, payload) - step.alert_group_slack_service.assert_not_called() + mock_update_alert_groups_message.assert_not_called() + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") def test_delete_thread_message_from_resolution_note( self, + mock_update_alert_groups_message, make_organization_and_user_with_slack_identities, make_alert_receive_channel, make_alert_group, make_resolution_note_slack_message, make_slack_channel, + make_slack_message, ) -> None: channel_id = "potato" ts = 88945.4849 @@ -445,6 +447,7 @@ def test_delete_thread_message_from_resolution_note( slack_channel = make_slack_channel(slack_team_identity, slack_id=channel_id) integration = make_alert_receive_channel(organization) alert_group = make_alert_group(integration) + make_slack_message(alert_group=alert_group, slack_id=thread_ts, channel=slack_channel) payload = { "event": { @@ -461,11 +464,8 @@ def 
test_delete_thread_message_from_resolution_note( ) step = SlackChannelMessageEventStep(slack_team_identity, organization, user) - step.alert_group_slack_service = Mock() - step.delete_thread_message_from_resolution_note(slack_user_identity, payload) - step.alert_group_slack_service.update_alert_group_slack_message.assert_called_once_with(alert_group) assert ( ResolutionNoteSlackMessage.objects.filter( ts=ts, @@ -475,6 +475,8 @@ def test_delete_thread_message_from_resolution_note( == 0 ) + mock_update_alert_groups_message.assert_called_once_with(debounce=False) + def test_slack_message_has_no_alert_group( self, make_organization_and_user_with_slack_identities, diff --git a/engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py b/engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py new file mode 100644 index 0000000000..ee03833229 --- /dev/null +++ b/engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py @@ -0,0 +1,377 @@ +from unittest.mock import patch + +import pytest +from django.utils import timezone + +from apps.alerts.models import AlertGroup +from apps.slack.errors import ( + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPIChannelNotFoundError, + SlackAPIMessageNotFoundError, + SlackAPIRatelimitError, + SlackAPITokenError, +) +from apps.slack.tasks import update_alert_group_slack_message +from apps.slack.tests.conftest import build_slack_response + + +@pytest.fixture +def mocked_rate_limited_slack_response(): + return build_slack_response({}, status_code=429, headers={"Retry-After": 123}) + + +class TestUpdateAlertGroupSlackMessageTask: + @pytest.mark.django_db + def test_update_alert_group_slack_message_slack_message_not_found(self): + """ + Test that the task exits early if SlackMessage does not exist. + """ + # No need to patch anything, just run the task with a non-existing pk + update_alert_group_slack_message.apply((99999,), task_id="task-id") + + # Since there is no exception raised, the test passes + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_task_id_mismatch( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task exits early if current_task_id doesn't match the task ID that exists in the cache + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("original-task-id") + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="different-task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_no_alert_group( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_slack_channel, + make_slack_message, + ): + """ + Test that the task exits early if SlackMessage has no alert_group. 
+ """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(alert_group=None, channel=slack_channel, organization=organization) + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_skip_escalation_in_slack( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task exits early if alert_group.skip_escalation_in_slack is True. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group( + alert_receive_channel, + reason_to_skip_escalation=AlertGroup.CHANNEL_ARCHIVED, + ) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # Ensure skip_escalation_in_slack is True + assert alert_group.skip_escalation_in_slack is True + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + # Verify that the active update task ID is not cleared and last_updated is not set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_alert_receive_channel_rate_limited( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task exits early if alert_receive_channel.is_rate_limited_in_slack is True. 
+ """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel( + organization, + rate_limited_in_slack_at=timezone.now(), + ) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # Ensure is_rate_limited_in_slack is True + assert alert_receive_channel.is_rate_limited_in_slack is True + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + # Verify that the active update task ID is not cleared and last_updated is not set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_successful( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task successfully updates the alert group's Slack message. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Assert that SlackClient.chat_update was called with correct parameters + mock_chat_update.assert_called_once_with( + channel=slack_message._channel_id, + ts=slack_message.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + # Verify that cache ID is cleared from the cache and last_updated is set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() is None + assert slack_message.last_updated is not None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_ratelimit_error_not_maintenance( + self, + mock_start_send_rate_limit_message_task, + mock_chat_update, + mocked_rate_limited_slack_response, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test handling of SlackAPIRatelimitError when not a maintenance integration. 
+ """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + + # Ensure channel is not a maintenance integration and not already rate-limited + assert alert_receive_channel.is_maintenace_integration is False + assert alert_receive_channel.is_rate_limited_in_slack is False + + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises SlackAPIRatelimitError + slack_api_ratelimit_error = SlackAPIRatelimitError(mocked_rate_limited_slack_response) + mock_chat_update.side_effect = slack_api_ratelimit_error + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Assert that start_send_rate_limit_message_task was called + mock_start_send_rate_limit_message_task.assert_called_with("Updating", slack_api_ratelimit_error.retry_after) + + @patch("apps.slack.tasks.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_ratelimit_error_is_maintenance( + self, + mock_start_send_rate_limit_message_task, + mock_chat_update, + mocked_rate_limited_slack_response, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that SlackAPIRatelimitError is re-raised when it is a maintenance integration. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization, integration="maintenance") + + # Ensure channel is a maintenance integration and not already rate-limited + assert alert_receive_channel.is_maintenace_integration is True + assert alert_receive_channel.is_rate_limited_in_slack is False + + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises SlackAPIRatelimitError + slack_api_ratelimit_error = SlackAPIRatelimitError(mocked_rate_limited_slack_response) + mock_chat_update.side_effect = slack_api_ratelimit_error + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + slack_message.refresh_from_db() + + # Assert that start_send_rate_limit_message_task was not called, task id is not cleared, and we don't + # update last_updated + mock_start_send_rate_limit_message_task.assert_not_called() + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.parametrize( + "ExceptionClass", + [ + SlackAPIMessageNotFoundError, + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPITokenError, + SlackAPIChannelNotFoundError, + ], + ) + @pytest.mark.django_db + def test_update_alert_group_slack_message_other_exceptions( + self, + 
mock_start_send_rate_limit_message_task, + mock_chat_update, + ExceptionClass, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that other Slack API exceptions are handled silently. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises the exception class + mock_chat_update.side_effect = ExceptionClass("foo bar") + + # Call the task + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that exception was caught and passed + # SlackClient.chat_update was called + mock_chat_update.assert_called_once() + + # Assert that start_send_rate_limit_message_task was not called + mock_start_send_rate_limit_message_task.assert_not_called() + + # Verify that cache ID is cleared from the cache and last_updated is set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() is None + assert slack_message.last_updated is not None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_unexpected_exception( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that an unexpected exception propagates as expected. 
+ """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises a generic exception + mock_chat_update.side_effect = ValueError("Unexpected error") + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Assert that task id is not cleared, and we don't update last_updated + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None diff --git a/engine/apps/slack/tests/test_slack_message.py b/engine/apps/slack/tests/test_slack_message.py index 61419a1b27..5a766654f2 100644 --- a/engine/apps/slack/tests/test_slack_message.py +++ b/engine/apps/slack/tests/test_slack_message.py @@ -1,3 +1,4 @@ +from datetime import timedelta from unittest.mock import patch import pytest @@ -6,6 +7,7 @@ from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord from apps.slack.client import SlackClient from apps.slack.errors import SlackAPIError +from apps.slack.models import SlackMessage from apps.slack.tests.conftest import build_slack_response @@ -28,6 +30,59 @@ def _slack_message_setup(cached_permalink): return _slack_message_setup +@patch.object( + SlackClient, + "chat_getPermalink", + return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), +) +@pytest.mark.django_db +def test_slack_message_permalink(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink=None) + assert slack_message.permalink == "test_permalink" + mock_slack_api_call.assert_called_once() + + +@patch.object( + SlackClient, + "chat_getPermalink", + side_effect=SlackAPIError(response=build_slack_response({"ok": False, "error": "message_not_found"})), +) +@pytest.mark.django_db +def test_slack_message_permalink_error(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink=None) + assert slack_message.permalink is None + mock_slack_api_call.assert_called_once() + + +@patch.object( + SlackClient, + "chat_getPermalink", + return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), +) +@pytest.mark.django_db +def test_slack_message_permalink_cache(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink="cached_permalink") + assert slack_message.permalink == "cached_permalink" + mock_slack_api_call.assert_not_called() + + +@patch.object( + SlackClient, + "chat_getPermalink", + return_value=build_slack_response({"ok": False, "error": "account_inactive"}), +) +@pytest.mark.django_db +def test_slack_message_permalink_token_revoked(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink=None) + slack_message.slack_team_identity.detected_token_revoked = timezone.now() + slack_message.slack_team_identity.save() + + assert slack_message.slack_team_identity is not None + assert slack_message.permalink is None + + mock_slack_api_call.assert_not_called() + + @pytest.mark.django_db def test_send_slack_notification( make_organization_and_user_with_slack_identities, @@ -86,54 +141,230 @@ def 
test_slack_message_deep_link( assert slack_message.deep_link == expected -@patch.object( - SlackClient, - "chat_getPermalink", - return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), -) -@pytest.mark.django_db -def test_slack_message_permalink(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink=None) - assert slack_message.permalink == "test_permalink" - mock_slack_api_call.assert_called_once() +class TestSlackMessageUpdateAlertGroupsMessage: + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_no_alert_group( + self, + mock_update_alert_group_slack_message, + make_organization_with_slack_team_identity, + make_slack_channel, + make_slack_message, + ): + """ + Test that the method exits early if alert_group is None. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(channel=slack_channel, alert_group=None, organization=organization) + slack_message.update_alert_groups_message(debounce=True) -@patch.object( - SlackClient, - "chat_getPermalink", - side_effect=SlackAPIError(response=build_slack_response({"ok": False, "error": "message_not_found"})), -) -@pytest.mark.django_db -def test_slack_message_permalink_error(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink=None) - assert slack_message.permalink is None - mock_slack_api_call.assert_called_once() + # Ensure no task is scheduled + mock_update_alert_group_slack_message.apply_async.assert_not_called() + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_active_task_exists( + self, + mock_update_alert_group_slack_message, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_slack_channel, + make_slack_message, + ): + """ + Test that the method exits early if a task ID is set in the cache and debounce is True. 
+ """ + task_id = "some-task-id" -@patch.object( - SlackClient, - "chat_getPermalink", - return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), -) -@pytest.mark.django_db -def test_slack_message_permalink_cache(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink="cached_permalink") - assert slack_message.permalink == "cached_permalink" - mock_slack_api_call.assert_not_called() + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(channel=slack_channel, alert_group=alert_group) + slack_message.set_active_update_task_id(task_id) -@patch.object( - SlackClient, - "chat_getPermalink", - return_value=build_slack_response({"ok": False, "error": "account_inactive"}), -) -@pytest.mark.django_db -def test_slack_message_permalink_token_revoked(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink=None) - slack_message.slack_team_identity.detected_token_revoked = timezone.now() - slack_message.slack_team_identity.save() + slack_message.update_alert_groups_message(debounce=True) - assert slack_message.slack_team_identity is not None - assert slack_message.permalink is None + # Ensure no task is scheduled + mock_update_alert_group_slack_message.apply_async.assert_not_called() - mock_slack_api_call.assert_not_called() + # Ensure task ID in the cache remains unchanged + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_last_updated_none( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that the method handles last_updated being None and schedules with default debounce interval. 
+ """ + task_id = "some-task-id" + mock_celery_uuid.return_value = task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(channel=slack_channel, alert_group=alert_group, last_updated=None) + + assert slack_message.get_active_update_task_id() is None + + slack_message.update_alert_groups_message(debounce=True) + + # Verify that apply_async was called with correct countdown + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + countdown=SlackMessage.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS, + task_id=task_id, + ) + + # Verify task ID is set in the cache + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_schedules_task_correctly( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that the method schedules the task with correct countdown and updates the task ID in the cache + """ + task_id = "some-task-id" + mock_celery_uuid.return_value = task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message( + channel=slack_channel, + alert_group=alert_group, + last_updated=timezone.now() - timedelta(seconds=10), + ) + + assert slack_message.get_active_update_task_id() is None + + slack_message.update_alert_groups_message(debounce=True) + + # Verify that apply_async was called with correct countdown + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + countdown=35, + task_id=task_id, + ) + + # Verify the task ID in the cache is updated to new task_id + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_handles_minimum_countdown( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that the countdown is at least 10 seconds when the debounce interval has passed. 
+ """ + task_id = "some-task-id" + mock_celery_uuid.return_value = task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message( + channel=slack_channel, + alert_group=alert_group, + last_updated=timezone.now() + - timedelta(seconds=SlackMessage.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS + 1), + ) + + assert slack_message.get_active_update_task_id() is None + + slack_message.update_alert_groups_message(debounce=True) + + # Verify that apply_async was called with correct countdown + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + # Since the time since last update exceeds the debounce interval, countdown should be 10 + countdown=10, + task_id=task_id, + ) + + # Verify the task ID in the cache is updated to new task_id + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_debounce_false_schedules_immediately( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that when debounce is False, the task is scheduled immediately with countdown=0, + even if a task ID is set in the cache. + """ + new_task_id = "new-task-id" + mock_celery_uuid.return_value = new_task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + slack_channel = make_slack_channel(slack_team_identity) + + # Set up SlackMessage with existing task ID in the cache + slack_message = make_slack_message(channel=slack_channel, alert_group=alert_group) + slack_message.set_active_update_task_id("existing-task-id") + + slack_message.update_alert_groups_message(debounce=False) + + # Verify that apply_async was called with countdown=0 + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + countdown=0, + task_id=new_task_id, + ) + + # Verify the task ID in the cache is updated to new task_id + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == new_task_id diff --git a/engine/apps/slack/utils.py b/engine/apps/slack/utils.py index 8b3515ba30..300b20f01e 100644 --- a/engine/apps/slack/utils.py +++ b/engine/apps/slack/utils.py @@ -103,10 +103,5 @@ def format_datetime_to_slack_with_time(timestamp: float, format: SlackDateFormat return _format_datetime_to_slack(timestamp, f"{{{format}}} {{time}}") -def get_cache_key_update_incident_slack_message(alert_group_pk: str) -> str: - CACHE_KEY_PREFIX = "update_incident_slack_message" - return f"{CACHE_KEY_PREFIX}_{alert_group_pk}" - - def get_populate_slack_channel_task_id_key(slack_team_identity_id: str) -> str: return f"SLACK_CHANNELS_TASK_ID_TEAM_{slack_team_identity_id}" diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 7ef62121dd..08f42631a4 100644 --- a/engine/settings/celery_task_routes.py +++ 
b/engine/settings/celery_task_routes.py @@ -170,7 +170,9 @@ "apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel": {"queue": "slack"}, "apps.slack.tasks.start_update_slack_user_group_for_schedules": {"queue": "slack"}, "apps.slack.tasks.unpopulate_slack_user_identities": {"queue": "slack"}, + # TODO: remove apps.slack.tasks.update_incident_slack_message after current tasks in queue have been processed "apps.slack.tasks.update_incident_slack_message": {"queue": "slack"}, + "apps.slack.tasks.update_alert_group_slack_message": {"queue": "slack"}, "apps.slack.tasks.update_slack_user_group_for_schedules": {"queue": "slack"}, "apps.slack.representatives.alert_group_representative.on_create_alert_slack_representative_async": { "queue": "slack"
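Note for reviewers: the tests above pin down the debounce behaviour of the new `SlackMessage.update_alert_groups_message(debounce=...)` helper without showing the model code itself (it lives in an earlier hunk of this patch). Below is a minimal sketch of that scheduling logic, reconstructed only from the test expectations in this section; the cache key format, the cache timeout, and the helper bodies are assumptions, not the actual implementation, and the 45-second interval is merely implied by the tests.

    # Sketch reconstructed from the tests in this patch; NOT the code added by the patch itself.
    from celery import uuid as celery_uuid
    from django.core.cache import cache
    from django.utils import timezone

    from apps.slack.tasks import update_alert_group_slack_message


    class SlackMessageDebounceSketch:
        # 45s is implied by the tests: last_updated 10 seconds ago => countdown == 35
        ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS = 45

        def _task_id_cache_key(self) -> str:
            # hypothetical cache key; the real key format is not shown in this patch
            return f"update_alert_group_slack_message_{self.pk}"

        def get_active_update_task_id(self):
            return cache.get(self._task_id_cache_key())

        def set_active_update_task_id(self, task_id: str) -> None:
            # the task docstring only says the ID is cached "with a timeout";
            # the exact timeout value here is an assumption
            cache.set(self._task_id_cache_key(), task_id, timeout=self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS * 2)

        def mark_active_update_task_as_complete(self) -> None:
            # the tests check last_updated via refresh_from_db(), so it is persisted on success
            self.last_updated = timezone.now()
            self.save(update_fields=["last_updated"])
            cache.delete(self._task_id_cache_key())

        def update_alert_groups_message(self, debounce: bool) -> None:
            if not self.alert_group:
                # nothing to render without an alert group
                return
            if debounce and self.get_active_update_task_id() is not None:
                # an update task is already scheduled; let it pick up this change
                return

            if not debounce:
                countdown = 0
            elif self.last_updated is None:
                # never updated before: wait the full debounce interval
                countdown = self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS
            else:
                # wait out the remainder of the debounce window, but never less than 10 seconds
                elapsed = (timezone.now() - self.last_updated).total_seconds()
                countdown = max(self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS - elapsed, 10)

            task_id = celery_uuid()
            self.set_active_update_task_id(task_id)
            update_alert_group_slack_message.apply_async((self.pk,), countdown=countdown, task_id=task_id)

Callers that react to incoming alerts (IncomingAlertStep.process_signal) pass debounce=True so bursts of updates collapse into at most one chat.update per debounce window, while user-triggered paths (resolution note changes, the deprecated update_incident_slack_message shim) pass debounce=False to update immediately; the task then re-checks the cached task ID so any stale scheduled update becomes a no-op.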