Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: get alert group related affected services #5407

Draft
wants to merge 1 commit into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
send_alert_group_signal_for_delete,
unsilence_task,
)
from apps.grafana_plugin.helpers import GrafanaAPIClient
from apps.grafana_plugin.ui_url_builder import UIURLBuilder
from apps.metrics_exporter.tasks import update_metrics_for_alert_group
from apps.slack.slack_formatter import SlackFormatter
Expand Down Expand Up @@ -573,6 +574,66 @@ def declare_incident_link(self) -> str:
def happened_while_maintenance(self):
return self.root_alert_group is not None and self.root_alert_group.maintenance_uuid is not None

def get_dependent_services(self, affected_only: bool = False) -> typing.List[str]:
    """Return the names of the services that depend on this alert group's service.

    The alert group's own service is read from its ``service_name`` label and the
    dependent services are fetched through
    ``GrafanaAPIClient.get_services_depending_on``.

    Args:
        affected_only: when True, keep only the dependent services that have at
            least one unresolved, unsilenced alert group in the same organization
            which started no earlier than this alert group and within the last
            7 days.

    Returns:
        Service names in the order the API returned them. The list is empty when
        the alert group has no ``service_name`` label, when the API call does not
        answer with status code 200, or when the payload has no usable items.
    """
    SERVICE_LABEL = "service_name"

    service_label = self.labels.filter(key_name=SERVICE_LABEL).first()
    if not service_label:
        # nothing to look up; bail out before resolving the organization
        return []

    service_name = service_label.value_name
    organization = self.channel.organization

    # query dependent services via aggregated API server
    grafana_api_client = GrafanaAPIClient(api_url=organization.grafana_url, api_token=organization.api_token)
    response_data, call_status = grafana_api_client.get_services_depending_on(
        service_name, stack_id=organization.stack_id
    )
    if call_status["status_code"] != 200:
        # check additional logs from Grafana API client
        logger.info(f"get_dependent_services for alert_group {self.pk} failed")
        return []

    # NOTE(review): presumably the client can hand back a None payload even on a
    # 200 (e.g. undecodable body) - confirm against GrafanaAPIClient.api_get.
    # Either way, never let an unexpected payload shape raise from here.
    items = response_data.get("items") if isinstance(response_data, dict) else None
    deps = []
    for item in items or []:
        try:
            deps.append(item["spec"]["from"]["ref"]["name"])
        except (KeyError, TypeError):
            # a single malformed relation must not hide the valid ones
            logger.warning(f"get_dependent_services for alert_group {self.pk}: skipping malformed relation item")

    if not affected_only:
        return deps

    # TODO: maybe we don't need this?
    force_index = settings.ALERT_GROUPS_DISABLE_PREFER_ORDERING_INDEX
    if force_index:
        # workaround related to MySQL "ORDER BY LIMIT Query Optimizer Bug"
        # read more: https://hackmysql.com/infamous-order-by-limit-query-optimizer-bug/
        from django_mysql.models import add_QuerySetMixin

    # Loop-invariant conditions. They are passed together with the per-service
    # label value in ONE filter() call so that every labels__* condition applies
    # to the same label row.
    active_filters = {
        "channel__organization": organization,
        "labels__organization": organization,
        "labels__key_name": SERVICE_LABEL,
        # check for firing and acknowledged alert groups
        "resolved": False,
        "silenced": False,
        # TODO: is root?
        # TODO: review this period? started around this one?
        # eg. started after? last week? both? tests!
        "started_at__gte": self.started_at,
        "started_at__gt": timezone.now() - timezone.timedelta(days=7),
    }

    # check for dependent services with active alert groups
    affected_deps = []
    for dep_service_name in deps:
        queryset = AlertGroup.objects.filter(labels__value_name=dep_service_name, **active_filters)
        if force_index:
            queryset = add_QuerySetMixin(queryset)
            queryset = queryset.force_index("alert_group_list_index")

        if queryset.exists():
            affected_deps.append(dep_service_name)

    return affected_deps

def get_paged_users(self) -> typing.List[PagedUser]:
from apps.alerts.models import AlertGroupLogRecord

Expand Down
141 changes: 141 additions & 0 deletions engine/apps/alerts/tests/test_alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,3 +867,144 @@ def test_slack_channel_id_no_slack_message_no_channel_filter(

# Assert that slack_channel_id is None
assert alert_group.slack_channel_id is None


@pytest.mark.django_db
def test_alert_group_dependent_services_failed_api_call(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_alert_group_label_association,
):
    """A non-200 answer from the dependency API yields an empty service list."""
    org = make_organization()
    channel = make_alert_receive_channel(org)
    labelled_alert_group = make_alert_group(channel)
    # the alert group needs a service label, otherwise the API is never called
    make_alert_group_label_association(
        org, labelled_alert_group, key_name="service_name", value_name="service-a"
    )

    failed_call = (None, {"status_code": 500})
    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=failed_call,
    ):
        dependents = labelled_alert_group.get_dependent_services()

    assert dependents == []


@pytest.mark.django_db
def test_alert_group_dependent_services_no_service_set(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
):
    """An alert group without a service name label has no dependent services."""
    channel = make_alert_receive_channel(make_organization())
    # deliberately no "service_name" label on this alert group
    unlabelled_alert_group = make_alert_group(channel)

    assert unlabelled_alert_group.get_dependent_services() == []


@pytest.mark.django_db
def test_alert_group_dependent_services_all(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_alert_group_label_association,
):
    """Without ``affected_only`` every dependent service reported by the API is returned."""
    org = make_organization()
    channel = make_alert_receive_channel(org)
    alert_group = make_alert_group(channel)
    # set service name label
    make_alert_group_label_association(org, alert_group, key_name="service_name", value_name="service-a")

    def relation_from(suffix):
        # one "service-<suffix> depends on service-a" relation item
        return {
            "apiVersion": "gamma.ext.grafana.com/v1alpha1",
            "kind": "Relation",
            "metadata": {},
            "spec": {
                "from": {
                    "ref": {"kind": "Component", "name": f"service-{suffix}"},
                    "type": "depends-on",
                },
                "relationType": "dependency",
                "to": {
                    "ref": {"kind": "Component", "name": "service-a"},
                    "type": "dependency-of",
                },
            },
        }

    api_payload = {
        "apiVersion": "gamma.ext.grafana.com/v1alpha1",
        "items": [relation_from(suffix) for suffix in ("b", "c")],
        "kind": "RelationList",
        "metadata": {"continue": "", "resourceVersion": "15552"},
    }

    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=(api_payload, {"status_code": 200}),
    ):
        dependents = alert_group.get_dependent_services()

    assert dependents == ["service-b", "service-c"]


@pytest.mark.django_db
def test_alert_group_dependent_services_filter_affected(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_alert_group_label_association,
):
    """With ``affected_only`` only services with firing/acknowledged alert groups are kept."""
    org = make_organization()
    channel = make_alert_receive_channel(org)
    alert_group = make_alert_group(channel)
    # set service name label
    make_alert_group_label_association(org, alert_group, key_name="service_name", value_name="service-a")

    affected_states = {AlertGroupState.FIRING, AlertGroupState.ACKNOWLEDGED}
    expected_services = []
    # one dependent service (and one alert group) per possible alert group state
    for index, state in enumerate(AlertGroupState):
        dependent_alert_group = make_alert_group(channel)
        if state != AlertGroupState.FIRING:
            # firing is the default; any other state is switched on via its boolean flag
            setattr(dependent_alert_group, state.lower(), True)
            dependent_alert_group.save()
        # set service name label
        dependent_service = f"service-{index}"
        make_alert_group_label_association(
            org, dependent_alert_group, key_name="service_name", value_name=dependent_service
        )
        if state in affected_states:
            expected_services.append(dependent_service)

    relation_items = []
    for index in range(len(AlertGroupState)):
        relation_items.append(
            {
                "apiVersion": "gamma.ext.grafana.com/v1alpha1",
                "kind": "Relation",
                "metadata": {},
                "spec": {
                    "from": {
                        "ref": {"kind": "Component", "name": f"service-{index}"},
                        "type": "depends-on",
                    },
                    "relationType": "dependency",
                    "to": {
                        "ref": {"kind": "Component", "name": "service-a"},
                        "type": "dependency-of",
                    },
                },
            }
        )
    api_payload = {
        "apiVersion": "gamma.ext.grafana.com/v1alpha1",
        "items": relation_items,
        "kind": "RelationList",
        "metadata": {"continue": "", "resourceVersion": "15552"},
    }

    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=(api_payload, {"status_code": 200}),
    ):
        affected = alert_group.get_dependent_services(affected_only=True)

    assert len(affected) == len(affected_states)
    assert affected == expected_services
64 changes: 64 additions & 0 deletions engine/apps/api/tests/test_alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2413,3 +2413,67 @@ def test_filter_default_started_at(
)
assert response.status_code == status.HTTP_200_OK
assert response.json()["pk"] == old_alert_group.public_primary_key


@pytest.mark.django_db
def test_alert_group_affected_services(
    alert_group_internal_api_setup,
    make_user_for_organization,
    make_user_auth_headers,
    make_alert_group_label_association,
):
    """The endpoint lists only dependent services that have active alert groups."""
    _, token, alert_groups = alert_group_internal_api_setup
    resolved_ag, ack_ag, new_ag, silenced_ag = alert_groups
    organization = new_ag.channel.organization
    user = make_user_for_organization(organization)

    # make sure the firing alert group started before the others
    new_ag.started_at = timezone.now() - timezone.timedelta(days=1)
    new_ag.save(update_fields=["started_at"])

    # firing alert group gets the queried service; the others get dependent services
    service_by_alert_group = (
        (new_ag, "service-a"),
        (ack_ag, "service-2"),
        (resolved_ag, "service-3"),
        (silenced_ag, "service-4"),
    )
    for labelled_ag, service in service_by_alert_group:
        make_alert_group_label_association(organization, labelled_ag, key_name="service_name", value_name=service)

    relation_items = []
    for index in range(5):
        relation_items.append(
            {
                "apiVersion": "gamma.ext.grafana.com/v1alpha1",
                "kind": "Relation",
                "metadata": {},
                "spec": {
                    "from": {
                        "ref": {"kind": "Component", "name": f"service-{index}"},
                        "type": "depends-on",
                    },
                    "relationType": "dependency",
                    "to": {
                        "ref": {"kind": "Component", "name": "service-a"},
                        "type": "dependency-of",
                    },
                },
            }
        )
    api_payload = {
        "apiVersion": "gamma.ext.grafana.com/v1alpha1",
        "items": relation_items,
        "kind": "RelationList",
        "metadata": {"continue": "", "resourceVersion": "15552"},
    }

    url = reverse("api-internal:alertgroup-related-affected-services", kwargs={"pk": new_ag.public_primary_key})
    client = APIClient()
    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=(api_payload, {"status_code": 200}),
    ):
        response = client.get(url, format="json", **make_user_auth_headers(user, token))

    assert response.status_code == status.HTTP_200_OK
    # only the acknowledged alert group's service counts as affected
    assert response.json() == [
        {
            "name": "service-2",
            "service_url": "a/grafana-slo-app/service/service-2",
            "alert_groups_url": "a/grafana-oncall-app/alert-groups?status=0&status=1&started_at=now-30d_now&label=service_name:service-2",
        }
    ]
30 changes: 30 additions & 0 deletions engine/apps/api/views/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from apps.api.serializers.team import TeamSerializer
from apps.auth_token.auth import PluginAuthentication
from apps.base.models.user_notification_policy_log_record import UserNotificationPolicyLogRecord
from apps.grafana_plugin.ui_url_builder import UIURLBuilder
from apps.labels.utils import is_labels_feature_enabled
from apps.mobile_app.auth import MobileAppAuthTokenAuthentication
from apps.user_management.models import Team, User
Expand Down Expand Up @@ -283,6 +284,7 @@ class AlertGroupView(
"bulk_action": [RBACPermission.Permissions.ALERT_GROUPS_WRITE],
"preview_template": [RBACPermission.Permissions.INTEGRATIONS_TEST],
"escalation_snapshot": [RBACPermission.Permissions.ALERT_GROUPS_READ],
"related_affected_services": [RBACPermission.Permissions.ALERT_GROUPS_READ],
}

queryset = AlertGroup.objects.none() # needed for drf-spectacular introspection
Expand Down Expand Up @@ -881,3 +883,31 @@ def escalation_snapshot(self, request, pk=None):
escalation_snapshot = alert_group.escalation_snapshot
result = AlertGroupEscalationSnapshotAPISerializer(escalation_snapshot).data if escalation_snapshot else {}
return Response(result)

@extend_schema(
    responses=inline_serializer(
        name="RelatedAffectedServices",
        fields={
            "name": serializers.CharField(),
            "service_url": serializers.CharField(),
            "alert_groups_url": serializers.CharField(),
        },
        many=True,
    )
)
@action(methods=["get"], detail=True)
def related_affected_services(self, request, pk=None):
    """List the dependent services of this alert group that have active alert groups.

    Each entry carries the service name, a link to the service page and a link
    to the alert groups list pre-filtered by that service's ``service_name``
    label, both built with ``UIURLBuilder``.
    """
    # local import: keeps this block self-contained without touching file-level imports
    from urllib.parse import quote

    alert_group = self.get_object()
    affected_deps = alert_group.get_dependent_services(affected_only=True)
    url_builder = UIURLBuilder(alert_group.channel.organization)

    result = []
    for service_name in affected_deps:
        # Service names are external data; percent-encode the value so characters
        # such as "&", "#" or spaces cannot break or extend the query string.
        label_value = quote(service_name, safe="")
        # NOTE(review): this link uses a 30 day window while
        # AlertGroup.get_dependent_services filters on the last 7 days - confirm
        # which period is intended.
        alert_groups_query = f"?status=0&status=1&started_at=now-30d_now&label=service_name:{label_value}"
        result.append(
            {
                "name": service_name,
                "service_url": url_builder.service_page(service_name),
                "alert_groups_url": url_builder.alert_groups(alert_groups_query),
            }
        )
    return Response(result)
11 changes: 11 additions & 0 deletions engine/apps/grafana_plugin/helpers/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,17 @@ def validate_grafana_token_format(grafana_token: str) -> bool:
return False
return True

def get_services_depending_on(self, service_name, stack_id=None):
    """Fetch the relations whose dependency target is ``service_name``.

    The request goes to the ``gamma.ext.grafana.com/v1alpha1`` relations
    endpoint. The namespace is ``stacks-<stack_id>`` when running with the cloud
    license and a truthy ``stack_id`` is given, otherwise ``default``.

    Returns whatever ``self.api_get`` returns for that request.
    """
    use_stack_namespace = settings.LICENSE == settings.CLOUD_LICENSE_NAME and stack_id
    namespace = f"stacks-{stack_id}" if use_stack_namespace else "default"
    endpoint = f"/apis/gamma.ext.grafana.com/v1alpha1/namespaces/{namespace}/relations"
    # select only "dependency-of" relations pointing at the given service
    field_selector = f"spec.to.type=dependency-of,spec.to.ref.name={service_name}"
    return self.api_get(endpoint, params={"fieldSelector": field_selector})


class GcomAPIClient(APIClient):
ACTIVE_INSTANCE_QUERY = "instances?status=active"
Expand Down
34 changes: 34 additions & 0 deletions engine/apps/grafana_plugin/tests/test_grafana_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,37 @@ def test_it_returns_based_on_status_code_of_head_call(

api_client = GrafanaAPIClient(API_URL, API_TOKEN)
assert api_client.is_rbac_enabled_for_organization() == expected


class TestGetServicesDependingOn:
    """Checks the endpoint and query params used by ``get_services_depending_on``."""

    @patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.api_get")
    def test_api_call_cloud(self, mock_grafana_api_client_api_get, settings):
        # cloud license: the namespace is derived from the stack id
        settings.LICENSE = settings.CLOUD_LICENSE_NAME
        stack_id = 42
        service_name = "service-foo"

        client = GrafanaAPIClient(API_URL, API_TOKEN)
        client.get_services_depending_on(service_name, stack_id=stack_id)

        mock_grafana_api_client_api_get.assert_called_with(
            f"/apis/gamma.ext.grafana.com/v1alpha1/namespaces/stacks-{stack_id}/relations",
            params={"fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}"},
        )

    @patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.api_get")
    def test_api_call_oss(self, mock_grafana_api_client_api_get, settings):
        # open source license: the stack id is ignored and "default" is used
        settings.LICENSE = settings.OPEN_SOURCE_LICENSE_NAME
        stack_id = 42
        service_name = "service-foo"

        client = GrafanaAPIClient(API_URL, API_TOKEN)
        client.get_services_depending_on(service_name, stack_id=stack_id)

        mock_grafana_api_client_api_get.assert_called_with(
            "/apis/gamma.ext.grafana.com/v1alpha1/namespaces/default/relations",
            params={"fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}"},
        )
Loading
Loading