Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Close anomalous spans in Datadog middleware #798

Merged
merged 2 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Change Log
Unreleased
~~~~~~~~~~

[4.5.0] - 2024-09-18
~~~~~~~~~~~~~~~~~~~~
Added
-----
* Datadog diagnostics middleware can now attempt to close anomalous spans. Can be enabled via Waffle flag ``datadog.diagnostics.close_anomalous_spans`` (controlled separately from logging feature).

[4.4.0] - 2024-09-10
~~~~~~~~~~~~~~~~~~~~
Changed
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.4.0'
__version__ = '4.5.0'
63 changes: 63 additions & 0 deletions edx_arch_experiments/datadog_diagnostics/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DETECT_ANOMALOUS_TRACE = WaffleFlag('datadog.diagnostics.detect_anomalous_trace', module_name=__name__)

# .. toggle_name: datadog.diagnostics.close_anomalous_spans
# .. toggle_implementation: WaffleFlag
# .. toggle_default: False
# .. toggle_description: Close anomalous spans that are ancestors of the django.request span.
# .. toggle_use_cases: temporary
# .. toggle_creation_date: 2024-09-18
# .. toggle_target_removal_date: 2024-12-01
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
CLOSE_ANOMALOUS_SPANS = WaffleFlag('datadog.diagnostics.close_anomalous_spans', module_name=__name__)

# .. toggle_name: datadog.diagnostics.log_root_span
# .. toggle_implementation: WaffleFlag
# .. toggle_default: False
Expand Down Expand Up @@ -76,6 +86,8 @@ def __call__(self, request):
def process_view(self, request, _view_func, _view_args, _view_kwargs):
try:
self.log_diagnostics(request)
if CLOSE_ANOMALOUS_SPANS.is_enabled():
self.close_anomalous_spans(request)
except BaseException as e:
# If there's an error, it will probably hit every request,
# so let's just log it once.
Expand All @@ -86,6 +98,57 @@ def process_view(self, request, _view_func, _view_args, _view_kwargs):
f"(suppressing further errors): {e!r}"
)

# pylint: disable=protected-access
def close_anomalous_spans(self, request):
"""
Detect anomalous spans and close them.
This closes any open spans that are ancestors of the current
request. The trace will still have two requests concatenated
together, but the problematic spans should not affect
future requests.
Only activates if the root span is itself closed, which is a
cheap thing to check.
"""
# If the root span is still open, probably not an anomalous trace.
if self.dd_tracer.current_root_span().duration is None:
return # nothing to do!

# Walk upwards until we find the django.request span.
walk_span = self.dd_tracer.current_span()
while walk_span.name != 'django.request':
walk_span = walk_span._parent
if walk_span is None:
# If we can't find the django.request root, there's
# something bad about our assumptions and we should
# not attempt a fix.
log.error(
"Did not find django.request span when walking anomalous trace "
"to root. Not attempting a fix."
)
return

# Go "above" the request span
walk_span = walk_span._parent

# Now close everything above the current request span that's
# still open, logging as we go.
while walk_span is not None:
# We call finish() individually rather than
# finish_with_ancestors() because this gives us a chance
# to log each one.
if walk_span.duration is None:
walk_span.finish()
log.info(
f"Closed span in anomalous trace: name={walk_span.name} "
f"id={walk_span.span_id:x} trace={walk_span.trace_id:x}"
)
# Keep walking up even if we discover closed spans; we've
# previously seen multiple contiguous segments of open
# spans separated by closed ones.
walk_span = walk_span._parent

def log_diagnostics(self, request):
"""
Contains all the actual logging logic.
Expand Down
100 changes: 96 additions & 4 deletions edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
"""

import re
from contextlib import ExitStack
from unittest.mock import Mock, patch

import ddt
import ddtrace
from django.test import TestCase, override_settings

from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware
from ..middleware import CLOSE_ANOMALOUS_SPANS, DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware


def fake_view(_request):
Expand All @@ -24,7 +25,7 @@ def make_middleware(self):
"""Make an instance of the middleware with current settings."""
return DatadogDiagnosticMiddleware(fake_view)

def run_middleware(self, middleware=None):
def run_middleware(self, middleware=None, check_error_state=True):
"""Run the middleware using a fake request."""
if middleware is None:
middleware = self.make_middleware()
Expand All @@ -36,6 +37,9 @@ def run_middleware(self, middleware=None):

middleware.process_view(request, None, None, None)

if check_error_state:
assert middleware.error is False

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_log_diagnostics_error_only_once(self, mock_log_error):
"""
Expand All @@ -48,8 +52,9 @@ def test_log_diagnostics_error_only_once(self, mock_log_error):
bad_method = Mock(side_effect=lambda request: 1/0)
middleware.log_diagnostics = bad_method

self.run_middleware(middleware)
self.run_middleware(middleware)
self.run_middleware(middleware, check_error_state=False)
self.run_middleware(middleware, check_error_state=False)
assert middleware.error is True

# Called twice
assert len(bad_method.call_args_list) == 2
Expand All @@ -74,6 +79,7 @@ def test_log_diagnostics_error_only_once(self, mock_log_error):
def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning):
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=enabled),
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
# Need at least two levels of spans in order to fake
# an anomaly. (Otherwise current_root_span returns None.)
Expand Down Expand Up @@ -108,6 +114,7 @@ def test_anomalous_trace_truncation(self, mock_log_warning):
"""
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True),
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
# Need at least two levels of spans in order to fake
# an anomaly. (Otherwise current_root_span returns None.)
Expand All @@ -134,6 +141,7 @@ def test_anomalous_trace_truncation(self, mock_log_warning):
def test_log_root_span(self, mock_log_info):
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False),
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True),
# Need at least two levels of spans for interesting logging
ddtrace.tracer.trace("local_root"),
Expand All @@ -149,3 +157,87 @@ def test_log_root_span(self, mock_log_info):
r"current span = name='inner_span' .*",
log_msg
)

def run_close_with(self, *, enabled, anomalous, ancestors=None):
"""
Run a "close anomalous spans" scenario with supplied settings.
ancestors is a list of span operation names, defaulting to
something reasonable if not supplied.
"""
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False),
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=enabled),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
ExitStack() as stack,
):
if ancestors is None:
ancestors = [
'django.request', 'django.view',
'celery.apply',
# ^ will need to close some of these
'django.request', 'django.view',
]
for ancestor_name in ancestors:
stack.enter_context(ddtrace.tracer.trace(ancestor_name))
# make anomaly readily detectable
if anomalous:
ddtrace.tracer.current_root_span().finish()

self.run_middleware()

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_disabled(self, mock_log_error, mock_log_info):
"""
Confirm that nothing interesting happens when close-spans flag is disabled.
"""
self.run_close_with(enabled=False, anomalous=True)

mock_log_error.assert_not_called()
mock_log_info.assert_not_called()

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_applied(self, mock_log_error, mock_log_info):
"""
Confirm that anomalous spans are closed, at least for future requests.
"""
self.run_close_with(enabled=True, anomalous=True)

mock_log_error.assert_not_called()

# Expect to close celery.apply and the one above it (but we've
# already closed the root, above).
assert len(mock_log_info.call_args_list) == 2
assert [call[0][0].split(' id=')[0] for call in mock_log_info.call_args_list] == [
"Closed span in anomalous trace: name=celery.apply",
"Closed span in anomalous trace: name=django.view",
]

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_not_needed(self, mock_log_error, mock_log_info):
"""
Confirm that no logging when anomalous trace not present.
"""
self.run_close_with(enabled=True, anomalous=False)

mock_log_error.assert_not_called()
mock_log_info.assert_not_called()

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_missing_request(self, mock_log_error, mock_log_info):
"""
Check that we look for the expected ancestor and only close above it.
"""
self.run_close_with(enabled=True, anomalous=True, ancestors=[
# Artificial scenario standing in for something unexpected.
'django.view', 'celery.apply', 'django.view',
])

mock_log_error.assert_called_once_with(
"Did not find django.request span when walking anomalous trace to root. Not attempting a fix."
)
mock_log_info.assert_not_called()
Loading