diff --git a/docs/sources/oncall-api-reference/alertgroups.md b/docs/sources/oncall-api-reference/alertgroups.md index 33c2d46b6b..1bc80a8a8d 100644 --- a/docs/sources/oncall-api-reference/alertgroups.md +++ b/docs/sources/oncall-api-reference/alertgroups.md @@ -71,6 +71,9 @@ The above command returns JSON structured in the following way: ] } }, + "teams": [ + "TE5EF3RQHJQPI" + ] } ], "current_page_number": 1, @@ -78,6 +81,7 @@ The above command returns JSON structured in the following way: "total_pages": 1 } ``` +> **Note**: `team_id` is still provided for each alert group, but it reflects the legacy behavior where the team was taken from the integration. It's recommended to use the new `teams` field instead. > **Note**: The response is [paginated](ref:pagination). You may need to make multiple requests to get all records. diff --git a/engine/apps/alerts/migrations/0065_alertgroup_teams.py b/engine/apps/alerts/migrations/0065_alertgroup_teams.py new file mode 100644 index 0000000000..4d47740838 --- /dev/null +++ b/engine/apps/alerts/migrations/0065_alertgroup_teams.py @@ -0,0 +1,19 @@ +# Generated by Django 4.2.16 on 2024-11-19 06:01 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('user_management', '0026_auto_20241017_1919'), + ('alerts', '0064_migrate_resolutionnoteslackmessage_slack_channel_id'), + ] + + operations = [ + migrations.AddField( + model_name='alertgroup', + name='teams', + field=models.ManyToManyField(to='user_management.team'), + ), + ] diff --git a/engine/apps/alerts/migrations/0066_channelfilter_update_team.py b/engine/apps/alerts/migrations/0066_channelfilter_update_team.py new file mode 100644 index 0000000000..667b25cc46 --- /dev/null +++ b/engine/apps/alerts/migrations/0066_channelfilter_update_team.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.16 on 2024-11-22 00:22 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('alerts', '0065_alertgroup_teams'), + ] + + operations = [ + migrations.AddField( + model_name='channelfilter', + name='update_team', + field=models.BooleanField(default=False, null=True), + ), + ] diff --git a/engine/apps/alerts/models/alert.py b/engine/apps/alerts/models/alert.py index 844cbf6771..eaeb78a9e2 100644 --- a/engine/apps/alerts/models/alert.py +++ b/engine/apps/alerts/models/alert.py @@ -145,6 +145,19 @@ def create( group.log_records.create(type=AlertGroupLogRecord.TYPE_REGISTERED) group.log_records.create(type=AlertGroupLogRecord.TYPE_ROUTE_ASSIGNED) + if group_created and alert_receive_channel.team: + # add the team from the integration if it's available + group.teams.set([alert_receive_channel.team]) + elif ( + group_created + and channel_filter + and channel_filter.escalation_chain + and channel_filter.escalation_chain.team + and channel_filter.update_team + ): + # otherwise, set the team to the one defined on the escalation chain when the route has update_team enabled
+ group.teams.set([channel_filter.escalation_chain.team]) + if group_created or alert.group.pause_escalation: # Build escalation snapshot if needed and start escalation alert.group.start_escalation_if_needed(countdown=TASK_DELAY_SECONDS) diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index f1e5a66d73..f33d032cb8 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -421,6 +421,8 @@ def status(self) -> int: raw_escalation_snapshot = JSONField(null=True, default=None) + teams = models.ManyToManyField(to="user_management.Team") + # This field is used for constraints so we can use get_or_create() in concurrent calls # https://docs.djangoproject.com/en/3.2/ref/models/querysets/#get-or-create # Combined with unique_together below, it allows only one alert group with diff --git a/engine/apps/alerts/models/alert_receive_channel.py b/engine/apps/alerts/models/alert_receive_channel.py index 4fd926ac47..325ff9553a 100644 --- a/engine/apps/alerts/models/alert_receive_channel.py +++ b/engine/apps/alerts/models/alert_receive_channel.py @@ -790,6 +790,6 @@ def listen_for_alertreceivechannel_model_save( # delete connected auth tokens instance.auth_tokens.all().delete() - metrics_remove_deleted_integration_from_cache(instance) + metrics_remove_deleted_integration_from_cache(instance, instance.organization) else: - metrics_update_integration_cache(instance) + metrics_update_integration_cache(instance, instance.organization) diff --git a/engine/apps/alerts/models/channel_filter.py b/engine/apps/alerts/models/channel_filter.py index f7cb302f7a..af88c0185b 100644 --- a/engine/apps/alerts/models/channel_filter.py +++ b/engine/apps/alerts/models/channel_filter.py @@ -67,6 +67,9 @@ class ChannelFilter(OrderedModel): "alerts.EscalationChain", null=True, default=None, on_delete=models.SET_NULL, related_name="channel_filters" ) + # Should we update the alertgroup team when this route is used + update_team = models.BooleanField(null=True, default=False) + notify_in_slack = models.BooleanField(null=True, default=True) notify_in_telegram = models.BooleanField(null=True, default=False) diff --git a/engine/apps/api/serializers/alert_group.py b/engine/apps/api/serializers/alert_group.py index c0882658fb..0e9a6c744d 100644 --- a/engine/apps/api/serializers/alert_group.py +++ b/engine/apps/api/serializers/alert_group.py @@ -20,6 +20,7 @@ from .alert_receive_channel import FastAlertReceiveChannelSerializer from .alerts_field_cache_buster_mixin import AlertsFieldCacheBusterMixin from .user import FastUserSerializer, UserShortSerializer +from .team import FastTeamSerializer logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -125,8 +126,7 @@ class AlertGroupListSerializer( related_users = serializers.SerializerMethodField() dependent_alert_groups = ShortAlertGroupSerializer(many=True) root_alert_group = ShortAlertGroupSerializer() - team = TeamPrimaryKeyRelatedField(source="channel.team", allow_null=True) - + teams = serializers.SerializerMethodField() alerts_count = serializers.IntegerField(read_only=True) render_for_web = serializers.SerializerMethodField() @@ -136,6 +136,7 @@ class AlertGroupListSerializer( "dependent_alert_groups", "log_records__author", "labels", + "teams", Prefetch( "slack_messages", queryset=SlackMessage.objects.select_related("_slack_team_identity").order_by("created_at")[:1], @@ -187,12 +188,24 @@ class Meta: "root_alert_group", "status", "declare_incident_link", - "team", 
"grafana_incident_id", "labels", "permalinks", + "teams" ] + @extend_schema_field(FastTeamSerializer(many=True)) + def get_teams(self, obj: "AlertGroup"): + """ + Handle AlertGroups that haven't been assigned a team yet + """ + + if obj.teams: + teams = obj.teams + elif obj.channel.team: + teams = [obj.channel.team] + return FastTeamSerializer(teams, context=self.context, many=True).data + def get_render_for_web(self, obj: "AlertGroup") -> RenderForWeb | EmptyRenderForWeb: if not obj.last_alert: return {} @@ -231,6 +244,7 @@ def get_related_users(self, obj: "AlertGroup"): return UserShortSerializer(users, context=self.context, many=True).data + class AlertGroupSerializer(AlertGroupListSerializer): alerts = serializers.SerializerMethodField("get_limited_alerts") last_alert_at = serializers.SerializerMethodField() diff --git a/engine/apps/api/serializers/channel_filter.py b/engine/apps/api/serializers/channel_filter.py index a1c796ad05..1ddd193a91 100644 --- a/engine/apps/api/serializers/channel_filter.py +++ b/engine/apps/api/serializers/channel_filter.py @@ -65,6 +65,7 @@ class Meta: "notification_backends", "filtering_term_as_jinja2", "telegram_channel_details", + "update_team" ] read_only_fields = [ "created_at", @@ -165,6 +166,7 @@ class Meta: "notify_in_slack", "notify_in_telegram", "notification_backends", + "update_team" ] read_only_fields = ["created_at", "is_default"] diff --git a/engine/apps/api/views/alert_group.py b/engine/apps/api/views/alert_group.py index 117fb9ce9d..15080f46d7 100644 --- a/engine/apps/api/views/alert_group.py +++ b/engine/apps/api/views/alert_group.py @@ -302,24 +302,21 @@ def get_serializer_class(self): def get_queryset(self, ignore_filtering_by_available_teams=False): # no select_related or prefetch_related is used at this point, it will be done on paginate_queryset. - alert_receive_channels_qs = AlertReceiveChannel.objects_with_deleted.filter( - organization_id=self.request.auth.organization.id - ) - if not ignore_filtering_by_available_teams: - alert_receive_channels_qs = alert_receive_channels_qs.filter(*self.available_teams_lookup_args) - - # Filter by team(s). Since we really filter teams from integrations, this is not an AlertGroup model filter. 
- # This is based on the common.api_helpers.ByTeamModelFieldFilterMixin implementation team_values = self.request.query_params.getlist("team", []) if team_values: - null_team_lookup = Q(team__isnull=True) if NO_TEAM_VALUE in team_values else None - teams_lookup = Q(team__public_primary_key__in=[ppk for ppk in team_values if ppk != NO_TEAM_VALUE]) + null_team_lookup = Q(teams__isnull=True) if NO_TEAM_VALUE in team_values else None + teams_lookup = Q(teams__public_primary_key__in=[ppk for ppk in team_values if ppk != NO_TEAM_VALUE]) if null_team_lookup: teams_lookup = teams_lookup | null_team_lookup - alert_receive_channels_qs = alert_receive_channels_qs.filter(teams_lookup) - alert_receive_channels_ids = list(alert_receive_channels_qs.values_list("id", flat=True)) - queryset = AlertGroup.objects.filter(channel__in=alert_receive_channels_ids) + if not ignore_filtering_by_available_teams: + queryset = AlertGroup.objects.filter(*self.available_teams_lookup_args, channel__organization_id=self.request.auth.organization.id) + else: + queryset = AlertGroup.objects.filter(channel__organization_id=self.request.auth.organization.id) + + if team_values: + queryset = queryset.filter(teams_lookup) + if self.action in ("list", "stats") and not self.request.query_params.get("started_at"): queryset = queryset.filter(started_at__gte=timezone.now() - timezone.timedelta(days=30)) diff --git a/engine/apps/metrics_exporter/helpers.py b/engine/apps/metrics_exporter/helpers.py index 91a0190520..e1571bbe45 100644 --- a/engine/apps/metrics_exporter/helpers.py +++ b/engine/apps/metrics_exporter/helpers.py @@ -141,47 +141,45 @@ def get_default_states_dict() -> AlertGroupStateDict: } -def metrics_update_integration_cache(integration: "AlertReceiveChannel") -> None: +def metrics_update_integration_cache(integration: "AlertReceiveChannel", organization: "Organization") -> None: """Update integration data in metrics cache""" metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id) metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id) metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id) - for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]: - metric_cache = cache.get(metric_key, {}) - integration_metric_cache = metric_cache.get(integration.id) - if integration_metric_cache: - cache_updated = False - if integration_metric_cache["team_id"] != integration.team_id_or_no_team: - integration_metric_cache["team_id"] = integration.team_id_or_no_team - integration_metric_cache["team_name"] = integration.team_name - cache_updated = True - if integration_metric_cache["integration_name"] != integration.emojized_verbal_name: - integration_metric_cache["integration_name"] = integration.emojized_verbal_name - cache_updated = True - if cache_updated: - cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout) + for team in organization.teams.all(): + for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]: + metric_cache = cache.get(metric_key, {}) + integration_metric_cache = metric_cache.get((integration.id, team.team_id)) + if integration_metric_cache: + cache_updated = False + if integration_metric_cache["integration_name"] != integration.emojized_verbal_name: + integration_metric_cache["integration_name"] = integration.emojized_verbal_name + cache_updated = True + if cache_updated: + cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout) -def metrics_remove_deleted_integration_from_cache(integration: "AlertReceiveChannel"): +def
metrics_remove_deleted_integration_from_cache(integration: "AlertReceiveChannel", organization: "Organization"): """Remove data related to deleted integration from metrics cache""" metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id) metric_alert_groups_total_key = get_metric_alert_groups_total_key(integration.organization_id) metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(integration.organization_id) - for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]: - metric_cache = cache.get(metric_key) - if metric_cache: - metric_cache.pop(integration.id, None) - cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout) + for team in organization.teams.all(): + for metric_key in [metric_alert_groups_total_key, metric_alert_groups_response_time_key]: + metric_cache = cache.get(metric_key) + if metric_cache: + metric_cache.pop((integration.id, team.team_id), None) + cache.set(metric_key, metric_cache, timeout=metrics_cache_timeout) def metrics_add_integrations_to_cache(integrations: list["AlertReceiveChannel"], organization: "Organization"): """ Bulk add new integration data to metrics cache. This method is safe to call multiple times on the same integrations. """ - metrics_cache_timeout = get_metrics_cache_timeout(organization.id) - metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.id) + metrics_cache_timeout = get_metrics_cache_timeout(organization.org_id) + metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.org_id) instance_slug = organization.stack_slug instance_id = organization.stack_id @@ -189,40 +187,41 @@ def metrics_add_integrations_to_cache(integrations: list["AlertReceiveChannel"], metric_alert_groups_total: typing.Dict[int, AlertGroupsTotalMetricsDict] = cache.get( metric_alert_groups_total_key, {} ) - - for integration in integrations: - metric_alert_groups_total.setdefault( - integration.id, - { - "integration_name": integration.emojized_verbal_name, - "team_name": integration.team_name, - "team_id": integration.team_id_or_no_team, - "org_id": grafana_org_id, - "slug": instance_slug, - "id": instance_id, - "services": {NO_SERVICE_VALUE: get_default_states_dict()}, - }, - ) + for team in organization.teams.all(): + for integration in integrations: + metric_alert_groups_total.setdefault( + (integration.id,team.team_id), + { + "integration_name": integration.emojized_verbal_name, + "team_name": team.name, + "team_id": team.team_id, + "org_id": grafana_org_id, + "slug": instance_slug, + "id": instance_id, + "services": {NO_SERVICE_VALUE: get_default_states_dict()}, + }, + ) cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout) - metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.id) + metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization.org_id) metric_alert_groups_response_time: typing.Dict[int, AlertGroupsResponseTimeMetricsDict] = cache.get( metric_alert_groups_response_time_key, {} ) - for integration in integrations: - metric_alert_groups_response_time.setdefault( - integration.id, - { - "integration_name": integration.emojized_verbal_name, - "team_name": integration.team_name, - "team_id": integration.team_id_or_no_team, - "org_id": grafana_org_id, - "slug": instance_slug, - "id": instance_id, - "services": {NO_SERVICE_VALUE: []}, - }, - ) + for team in organization.teams.all(): + for 
integration in integrations: + metric_alert_groups_response_time.setdefault( + (integration.id, team.team_id), + { + "integration_name": integration.emojized_verbal_name, + "team_name": team.name, + "team_id": team.team_id, + "org_id": grafana_org_id, + "slug": instance_slug, + "id": instance_id, + "services": {NO_SERVICE_VALUE: []}, + }, + ) cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout) @@ -236,18 +235,21 @@ def metrics_bulk_update_team_label_cache(teams_updated_data: dict, organization_ metric_alert_groups_total = cache.get(metric_alert_groups_total_key, {}) metric_alert_groups_response_time = cache.get(metric_alert_groups_response_time_key, {}) + + # TODO: work out how to handle team changes... or whether we need to. + for team_id, team_data in teams_updated_data.items(): - for integration_id in metric_alert_groups_total: - if metric_alert_groups_total[integration_id]["team_id"] == team_id: + for integration_id, cached_team_id in metric_alert_groups_total: + if metric_alert_groups_total[(integration_id, cached_team_id)]["team_id"] == team_id: - integration_response_time_metrics = metric_alert_groups_response_time.get(integration_id) + integration_response_time_metrics = metric_alert_groups_response_time.get((integration_id, cached_team_id)) if team_data["deleted"]: - metric_alert_groups_total[integration_id]["team_id"] = "no_team" - metric_alert_groups_total[integration_id]["team_name"] = "No team" + metric_alert_groups_total[(integration_id, cached_team_id)]["team_id"] = "no_team" + metric_alert_groups_total[(integration_id, cached_team_id)]["team_name"] = "No team" if integration_response_time_metrics: integration_response_time_metrics["team_id"] = "no_team" integration_response_time_metrics["team_name"] = "No team" else: - metric_alert_groups_total[integration_id]["team_name"] = team_data["team_name"] + metric_alert_groups_total[(integration_id, cached_team_id)]["team_name"] = team_data["team_name"] if integration_response_time_metrics: integration_response_time_metrics["team_name"] = team_data["team_name"] @@ -255,7 +257,7 @@ def metrics_bulk_update_team_label_cache(teams_updated_data: dict, organization_ cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout) -def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id: int): +def metrics_update_alert_groups_state_cache(states_diff: dict, organization: "Organization"): """ Update alert groups state metric cache for each integration in states_diff dict.
states_diff example: @@ -281,24 +283,27 @@ def metrics_update_alert_groups_state_cache(states_diff: dict, organization_id: if not states_diff: return - metrics_cache_timeout = get_metrics_cache_timeout(organization_id) - metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id) + metrics_cache_timeout = get_metrics_cache_timeout(organization.org_id) + metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization.org_id) metric_alert_groups_total = cache.get(metric_alert_groups_total_key, {}) + + if not metric_alert_groups_total: return for integration_id, service_data in states_diff.items(): - integration_alert_groups = metric_alert_groups_total.get(int(integration_id)) - if not integration_alert_groups: - continue - for service_name, service_state_diff in service_data.items(): - states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict()) - for previous_state, counter in service_state_diff["previous_states"].items(): - if states_to_update[previous_state] - counter > 0: - states_to_update[previous_state] -= counter - else: - states_to_update[previous_state] = 0 - for new_state, counter in service_state_diff["new_states"].items(): - states_to_update[new_state] += counter + for team in organization.teams.all(): + integration_alert_groups = metric_alert_groups_total.get((int(integration_id), team.team_id)) + if not integration_alert_groups: + continue + for service_name, service_state_diff in service_data.items(): + states_to_update = integration_alert_groups["services"].setdefault(service_name, get_default_states_dict()) + for previous_state, counter in service_state_diff["previous_states"].items(): + if states_to_update[previous_state] - counter > 0: + states_to_update[previous_state] -= counter + else: + states_to_update[previous_state] = 0 + for new_state, counter in service_state_diff["new_states"].items(): + states_to_update[new_state] += counter cache.set(metric_alert_groups_total_key, metric_alert_groups_total, timeout=metrics_cache_timeout) diff --git a/engine/apps/metrics_exporter/metrics_cache_manager.py b/engine/apps/metrics_exporter/metrics_cache_manager.py index 25f85655f5..fd902aeaf9 100644 --- a/engine/apps/metrics_exporter/metrics_cache_manager.py +++ b/engine/apps/metrics_exporter/metrics_cache_manager.py @@ -54,7 +54,7 @@ def update_integration_states_diff(metrics_dict, integration_id, service_name, p @staticmethod def metrics_update_state_cache_for_alert_group( - integration_id, organization_id, service_name, old_state=None, new_state=None + integration_id, organization, service_name, old_state=None, new_state=None ): """ Update state metric cache for one alert group. 
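The metrics cache refactor above changes how entries are addressed: they are now keyed by `(integration_id, team_id)` tuples rather than by bare integration ids, and the helpers receive an `Organization` so they can iterate its teams. A minimal sketch of that layout, assuming dict-shaped entries like the ones built in `metrics_add_integrations_to_cache`; the `_entry` helper and the literal values are illustrative only, not code from this PR:

```python
from typing import Dict, Tuple

IntegrationId = int
TeamId = int


def _entry(integration_name: str, team_name: str, team_id: int) -> dict:
    # Illustrative shape of one response-time cache entry.
    return {
        "integration_name": integration_name,
        "team_name": team_name,
        "team_id": team_id,
        "services": {"No service": []},  # list of response times (seconds) per service
    }


# One cache entry per (integration, team) pair instead of one per integration.
metric_alert_groups_response_time: Dict[Tuple[IntegrationId, TeamId], dict] = {
    (42, 7): _entry("Grafana Alerting", "SRE", 7),
    (42, 9): _entry("Grafana Alerting", "Platform", 9),
}

# Reads, updates and deletions must therefore use the composite key.
assert metric_alert_groups_response_time.get((42, 7)) is not None
metric_alert_groups_response_time.pop((42, 11), None)  # no-op for an unknown team
```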
@@ -62,7 +62,7 @@ def metrics_update_state_cache_for_alert_group( metrics_state_diff = MetricsCacheManager.update_integration_states_diff( {}, integration_id, service_name, previous_state=old_state, new_state=new_state ) - metrics_update_alert_groups_state_cache(metrics_state_diff, organization_id) + metrics_update_alert_groups_state_cache(metrics_state_diff, organization) @staticmethod def metrics_update_response_time_cache_for_alert_group( @@ -79,7 +79,7 @@ def metrics_update_response_time_cache_for_alert_group( @staticmethod def metrics_update_cache_for_alert_group( integration_id, - organization_id, + organization, old_state=None, new_state=None, response_time=None, @@ -91,9 +91,9 @@ def metrics_update_cache_for_alert_group( if response_time and old_state == AlertGroupState.FIRING and started_at > get_response_time_period(): response_time_seconds = int(response_time.total_seconds()) MetricsCacheManager.metrics_update_response_time_cache_for_alert_group( - integration_id, organization_id, response_time_seconds, service_name + integration_id, organization, response_time_seconds, service_name ) if old_state or new_state: MetricsCacheManager.metrics_update_state_cache_for_alert_group( - integration_id, organization_id, service_name, old_state, new_state + integration_id, organization, service_name, old_state, new_state ) diff --git a/engine/apps/metrics_exporter/metrics_collectors.py b/engine/apps/metrics_exporter/metrics_collectors.py index 87fb15c839..c9fb1f1a6c 100644 --- a/engine/apps/metrics_exporter/metrics_collectors.py +++ b/engine/apps/metrics_exporter/metrics_collectors.py @@ -33,6 +33,9 @@ METRIC_USER_WAS_NOTIFIED_OF_ALERT_GROUPS_NAME, ) +type IntegrationId = int +type TeamId = int + application_metrics_registry = CollectorRegistry() logger = logging.getLogger(__name__) @@ -101,7 +104,7 @@ def _get_alert_groups_total_metric(self, org_ids: set[int]) -> typing.Tuple[Metr ) processed_org_ids = set() alert_groups_total_keys = [get_metric_alert_groups_total_key(org_id) for org_id in org_ids] - org_ag_states: typing.Dict[str, typing.Dict[int, AlertGroupsTotalMetricsDict]] = cache.get_many( + org_ag_states: typing.Dict[str, typing.Dict[(IntegrationId, TeamId), AlertGroupsTotalMetricsDict]] = cache.get_many( alert_groups_total_keys ) for org_key, ag_states in org_ag_states.items(): @@ -146,9 +149,10 @@ def _get_response_time_metric(self, org_ids: set[int]) -> typing.Tuple[Metric, s "Users response time to alert groups in 7 days (seconds)", labels=self._integration_labels, ) + processed_org_ids = set() alert_groups_response_time_keys = [get_metric_alert_groups_response_time_key(org_id) for org_id in org_ids] - org_ag_response_times: typing.Dict[str, typing.Dict[int, AlertGroupsResponseTimeMetricsDict]] = cache.get_many( + org_ag_response_times: typing.Dict[str, typing.Dict[(IntegrationId,TeamId), AlertGroupsResponseTimeMetricsDict]] = cache.get_many( alert_groups_response_time_keys ) for org_key, ag_response_time in org_ag_response_times.items(): diff --git a/engine/apps/metrics_exporter/tasks.py b/engine/apps/metrics_exporter/tasks.py index 1663ddecc1..a238260d0b 100644 --- a/engine/apps/metrics_exporter/tasks.py +++ b/engine/apps/metrics_exporter/tasks.py @@ -83,6 +83,7 @@ def calculate_and_cache_metrics(organization_id, force=False): """ from apps.alerts.models import AlertGroup, AlertReceiveChannel from apps.user_management.models import Organization + from apps.user_management.models import Team ONE_HOUR = 3600 TWO_HOURS = 7200 @@ -103,8 +104,11 @@ def 
calculate_and_cache_metrics(organization_id, force=False): instance_id = organization.stack_id instance_org_id = organization.org_id - metric_alert_group_total: typing.Dict[int, AlertGroupsTotalMetricsDict] = {} - metric_alert_group_response_time: typing.Dict[int, AlertGroupsResponseTimeMetricsDict] = {} + type IntegrationId = int + type TeamId = int + + metric_alert_group_total: typing.Dict[(IntegrationId, TeamId), AlertGroupsTotalMetricsDict] = {} + metric_alert_group_response_time: typing.Dict[(IntegrationId, TeamId), AlertGroupsResponseTimeMetricsDict] = {} states = { AlertGroupState.FIRING.value: AlertGroup.get_new_state_filter(), @@ -114,80 +118,91 @@ def calculate_and_cache_metrics(organization_id, force=False): } for integration in integrations: - metric_alert_group_total_data = { - "integration_name": integration.emojized_verbal_name, - "team_name": integration.team_name, - "team_id": integration.team_id_or_no_team, - "org_id": instance_org_id, - "slug": instance_slug, - "id": instance_id, - "services": { - NO_SERVICE_VALUE: get_default_states_dict(), - }, - } - # calculate states - for state, alert_group_filter in states.items(): - # count alert groups with `service_name` label group by label value - alert_group_count_by_service = ( - integration.alert_groups.filter( - alert_group_filter, - labels__organization=organization, - labels__key_name=SERVICE_LABEL, + # get teams with alerts for this integration + # not sure how performant this approach will be? + alert_group_teams = integration.alert_groups.values_list('teams', flat=True).distinct() + for alert_group_team_id in alert_group_teams: + if alert_group_team_id: + team_name = Team.objects.get(id=alert_group_team_id).name + team_id = alert_group_team_id + else: + team_name = integration.team_name + team_id = integration.team_id_or_no_team + + metric_alert_group_total_data = { + "integration_name": integration.emojized_verbal_name, + "team_name": team_name, + "team_id": team_id, + "org_id": instance_org_id, + "slug": instance_slug, + "id": instance_id, + "services": { + NO_SERVICE_VALUE: get_default_states_dict(), + }, + } + # calculate states + for state, alert_group_filter in states.items(): + # count alert groups with `service_name` label group by label value + alert_group_count_by_service = ( + integration.alert_groups.filter( + alert_group_filter, + labels__organization=organization, + labels__key_name=SERVICE_LABEL, + ) + .values("labels__value_name") + .annotate(count=Count("id")) ) - .values("labels__value_name") - .annotate(count=Count("id")) - ) - for value in alert_group_count_by_service: - metric_alert_group_total_data["services"].setdefault( - value["labels__value_name"], - get_default_states_dict(), - )[state] += value["count"] - # count alert groups without `service_name` label - alert_groups_count_without_service = integration.alert_groups.filter( - alert_group_filter, - ~Q(labels__key_name=SERVICE_LABEL), - ).count() - metric_alert_group_total_data["services"][NO_SERVICE_VALUE][state] += alert_groups_count_without_service - metric_alert_group_total[integration.id] = metric_alert_group_total_data - - # calculate response time metric - metric_response_time_data = { - "integration_name": integration.emojized_verbal_name, - "team_name": integration.team_name, - "team_id": integration.team_id_or_no_team, - "org_id": instance_org_id, - "slug": instance_slug, - "id": instance_id, - "services": {NO_SERVICE_VALUE: []}, - } - - # filter response time by services - response_time_by_service = 
integration.alert_groups.filter( - started_at__gte=response_time_period, - response_time__isnull=False, - labels__organization=organization, - labels__key_name=SERVICE_LABEL, - ).values_list("id", "labels__value_name", "response_time") - for _, service_name, response_time in response_time_by_service: - metric_response_time_data["services"].setdefault(service_name, []) - metric_response_time_data["services"][service_name].append(response_time.total_seconds()) - - no_service_response_time = ( - integration.alert_groups.filter( + for value in alert_group_count_by_service: + metric_alert_group_total_data["services"].setdefault( + value["labels__value_name"], + get_default_states_dict(), + )[state] += value["count"] + # count alert groups without `service_name` label + alert_groups_count_without_service = integration.alert_groups.filter( + alert_group_filter, + ~Q(labels__key_name=SERVICE_LABEL), + ).count() + metric_alert_group_total_data["services"][NO_SERVICE_VALUE][state] += alert_groups_count_without_service + metric_alert_group_total[(integration.id, team_id)] = metric_alert_group_total_data + + # calculate response time metric + metric_response_time_data = { + "integration_name": integration.emojized_verbal_name, + "team_name": team_name, + "team_id": team_id, + "org_id": instance_org_id, + "slug": instance_slug, + "id": instance_id, + "services": {NO_SERVICE_VALUE: []}, + } + + # filter response time by services + response_time_by_service = integration.alert_groups.filter( started_at__gte=response_time_period, response_time__isnull=False, + labels__organization=organization, + labels__key_name=SERVICE_LABEL, + ).values_list("id", "labels__value_name", "response_time") + for _, service_name, response_time in response_time_by_service: + metric_response_time_data["services"].setdefault(service_name, []) + metric_response_time_data["services"][service_name].append(response_time.total_seconds()) + + no_service_response_time = ( + integration.alert_groups.filter( + started_at__gte=response_time_period, + response_time__isnull=False, + ) + .exclude(id__in=[i[0] for i in response_time_by_service]) + .values_list("response_time", flat=True) ) - .exclude(id__in=[i[0] for i in response_time_by_service]) - .values_list("response_time", flat=True) - ) - no_service_response_time_seconds = [ - int(response_time.total_seconds()) for response_time in no_service_response_time - ] - metric_response_time_data["services"][NO_SERVICE_VALUE] = no_service_response_time_seconds + no_service_response_time_seconds = [ + int(response_time.total_seconds()) for response_time in no_service_response_time + ] + metric_response_time_data["services"][NO_SERVICE_VALUE] = no_service_response_time_seconds - metric_alert_group_response_time[integration.id] = metric_response_time_data + metric_alert_group_response_time[(integration.id, team_id)] = metric_response_time_data metric_alert_groups_total_key = get_metric_alert_groups_total_key(organization_id) metric_alert_groups_response_time_key = get_metric_alert_groups_response_time_key(organization_id) @@ -273,7 +288,7 @@ def update_metrics_for_alert_group(alert_group_id, organization_id, previous_sta service_name = service_label.value_name if service_label else NO_SERVICE_VALUE MetricsCacheManager.metrics_update_cache_for_alert_group( integration_id=alert_group.channel_id, - organization_id=organization_id, + organization=alert_group.channel.organization, old_state=previous_state, new_state=new_state, response_time=updated_response_time, diff --git 
a/engine/apps/public_api/serializers/alert_groups.py b/engine/apps/public_api/serializers/alert_groups.py index 5218bd1305..a9349c7925 100644 --- a/engine/apps/public_api/serializers/alert_groups.py +++ b/engine/apps/public_api/serializers/alert_groups.py @@ -13,7 +13,10 @@ class AlertGroupSerializer(EagerLoadingMixin, serializers.ModelSerializer): id = serializers.CharField(read_only=True, source="public_primary_key") integration_id = serializers.CharField(source="channel.public_primary_key") + + # This uses the old integration based teams assignment to retain backwards compatibility in the api team_id = TeamPrimaryKeyRelatedField(source="channel.team", allow_null=True) + route_id = serializers.SerializerMethodField() created_at = serializers.DateTimeField(source="started_at") alerts_count = serializers.SerializerMethodField() @@ -24,6 +27,10 @@ class AlertGroupSerializer(EagerLoadingMixin, serializers.ModelSerializer): labels = AlertGroupLabelSerializer(many=True, read_only=True) last_alert = serializers.SerializerMethodField() + # Unlike the internal API we don't fallback to using the integration team as we don't need to be backwards compatible here + # since we are retaining the .team_id field in the api + teams = serializers.SlugRelatedField(read_only=True, many=True, slug_field="public_primary_key", allow_null=True) + SELECT_RELATED = [ "channel", "channel_filter", @@ -67,6 +74,7 @@ class Meta: "permalinks", "silenced_at", "last_alert", + "teams" ] def get_title(self, obj): diff --git a/engine/apps/public_api/serializers/routes.py b/engine/apps/public_api/serializers/routes.py index c9e4f094af..4bd60c3e1c 100644 --- a/engine/apps/public_api/serializers/routes.py +++ b/engine/apps/public_api/serializers/routes.py @@ -158,6 +158,7 @@ class Meta: "is_the_last_route", "slack", "telegram", + "update_team" ] read_only_fields = ["is_the_last_route"] @@ -228,6 +229,7 @@ class Meta: "slack", "telegram", "escalation_chain_id", + "update_team" ] def update(self, instance, validated_data): diff --git a/grafana-plugin/src/containers/IntegrationContainers/ExpandedIntegrationRouteDisplay/ExpandedIntegrationRouteDisplay.tsx b/grafana-plugin/src/containers/IntegrationContainers/ExpandedIntegrationRouteDisplay/ExpandedIntegrationRouteDisplay.tsx index 30455ffaf5..7322ab58fe 100644 --- a/grafana-plugin/src/containers/IntegrationContainers/ExpandedIntegrationRouteDisplay/ExpandedIntegrationRouteDisplay.tsx +++ b/grafana-plugin/src/containers/IntegrationContainers/ExpandedIntegrationRouteDisplay/ExpandedIntegrationRouteDisplay.tsx @@ -13,6 +13,7 @@ import { RadioButtonGroup, Alert, useStyles2, + InlineSwitch } from '@grafana/ui'; import { UserActions } from 'helpers/authorization/authorization'; import { StackSize } from 'helpers/consts'; @@ -345,6 +346,14 @@ export const ExpandedIntegrationRouteDisplay: React.FC )} + + + {!isEscalationCollapsed && ( @@ -466,6 +475,13 @@ export const ExpandedIntegrationRouteDisplay: React.FC) { + const value = event.target.checked; + await alertReceiveChannelStore.saveChannelFilter(channelFilterId, { + update_team: value, + }); + } + async function onEscalationChainsRefresh() { setState({ isRefreshingEscalationChains: true }); await escalationChainStore.updateItems(); diff --git a/grafana-plugin/src/models/channel_filter/channel_filter.types.ts b/grafana-plugin/src/models/channel_filter/channel_filter.types.ts index f55651ae81..1f2c5e28e5 100644 --- a/grafana-plugin/src/models/channel_filter/channel_filter.types.ts +++ 
b/grafana-plugin/src/models/channel_filter/channel_filter.types.ts @@ -29,4 +29,5 @@ export interface ChannelFilter { [key: string]: any; } | null; escalation_chain: EscalationChain['id']; + update_team: boolean; } diff --git a/grafana-plugin/src/network/oncall-api/autogenerated-api.types.d.ts b/grafana-plugin/src/network/oncall-api/autogenerated-api.types.d.ts index 19836b72cc..ad2dfc90b5 100644 --- a/grafana-plugin/src/network/oncall-api/autogenerated-api.types.d.ts +++ b/grafana-plugin/src/network/oncall-api/autogenerated-api.types.d.ts @@ -1540,7 +1540,7 @@ export interface components { readonly status: number; /** @description Generate a link for AlertGroup to declare Grafana Incident by click */ readonly declare_incident_link: string; - team: string | null; + teams: components['schemas']['FastTeam'][]; grafana_incident_id?: string | null; readonly labels: components['schemas']['AlertGroupLabel'][]; readonly permalinks: { @@ -1652,7 +1652,7 @@ export interface components { readonly status: number; /** @description Generate a link for AlertGroup to declare Grafana Incident by click */ readonly declare_incident_link: string; - team: string | null; + teams: components['schemas']['FastTeam'][]; grafana_incident_id?: string | null; readonly labels: components['schemas']['AlertGroupLabel'][]; readonly permalinks: { diff --git a/grafana-plugin/src/pages/incidents/Incidents.tsx b/grafana-plugin/src/pages/incidents/Incidents.tsx index e8787c277b..00b871b2fb 100644 --- a/grafana-plugin/src/pages/incidents/Incidents.tsx +++ b/grafana-plugin/src/pages/incidents/Incidents.tsx @@ -778,9 +778,11 @@ class _IncidentsPage extends React.Component - - + record.teams.map((team) => ( + + + + )) ); };
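Taken together, the backend changes assign teams to an alert group in a fixed order: the integration's team wins, and only when the integration has no team does the matched route contribute its escalation chain's team, and then only if the route has `update_team` enabled. A minimal, self-contained sketch of that precedence, using hypothetical dataclass stand-ins for the Django models (none of the names below come from the repository):

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class EscalationChain:
    team: Optional[str] = None


@dataclass
class ChannelFilter:
    update_team: bool = False
    escalation_chain: Optional[EscalationChain] = None


def resolve_alert_group_teams(integration_team: Optional[str], route: Optional[ChannelFilter]) -> List[str]:
    """Mirror the precedence used in Alert.create: integration team first,
    then the escalation chain's team when the route enables update_team."""
    if integration_team:
        return [integration_team]
    if route and route.update_team and route.escalation_chain and route.escalation_chain.team:
        return [route.escalation_chain.team]
    return []  # no team could be determined (AlertGroupListSerializer.get_teams falls back to the integration's team)


# A route that promotes its escalation chain's team onto new alert groups.
route = ChannelFilter(update_team=True, escalation_chain=EscalationChain(team="payments"))
assert resolve_alert_group_teams(None, route) == ["payments"]
assert resolve_alert_group_teams("sre", route) == ["sre"]
```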