From 8e2868a791e10011d58e80f37ca27f3e627e9b9d Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Thu, 19 Sep 2024 14:39:30 +0000 Subject: [PATCH] feat: Close anomalous spans in Datadog middleware (#798) Also, add an additional assertion in `run_middleware` to ensure that the middleware hasn't silently entered an error state. (Helped me debug a case where I had forgotten to fake a Waffle flag.) --- CHANGELOG.rst | 6 ++ edx_arch_experiments/__init__.py | 2 +- .../datadog_diagnostics/middleware.py | 63 +++++++++++ .../tests/test_middleware.py | 100 +++++++++++++++++- 4 files changed, 166 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d42b0ec..e7db937 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,12 @@ Change Log Unreleased ~~~~~~~~~~ +[4.5.0] - 2024-09-19 +~~~~~~~~~~~~~~~~~~~~ +Added +----- +* Datadog diagnostics middleware can now attempt to close anomalous spans. Can be enabled via Waffle flag ``datadog.diagnostics.close_anomalous_spans`` (controlled separately from logging feature). + [4.4.0] - 2024-09-10 ~~~~~~~~~~~~~~~~~~~~ Changed diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index 94612b4..72386ec 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '4.4.0' +__version__ = '4.5.0' diff --git a/edx_arch_experiments/datadog_diagnostics/middleware.py b/edx_arch_experiments/datadog_diagnostics/middleware.py index 79c77b1..b249868 100644 --- a/edx_arch_experiments/datadog_diagnostics/middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/middleware.py @@ -27,6 +27,16 @@ # .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692 DETECT_ANOMALOUS_TRACE = WaffleFlag('datadog.diagnostics.detect_anomalous_trace', module_name=__name__) +# .. toggle_name: datadog.diagnostics.close_anomalous_spans +# .. toggle_implementation: WaffleFlag +# .. toggle_default: False +# .. toggle_description: Close anomalous spans that are ancestors of the django.request span. +# .. toggle_use_cases: temporary +# .. toggle_creation_date: 2024-09-19 +# .. toggle_target_removal_date: 2024-12-01 +# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692 +CLOSE_ANOMALOUS_SPANS = WaffleFlag('datadog.diagnostics.close_anomalous_spans', module_name=__name__) + # .. toggle_name: datadog.diagnostics.log_root_span # .. toggle_implementation: WaffleFlag # .. toggle_default: False @@ -76,6 +86,8 @@ def __call__(self, request): def process_view(self, request, _view_func, _view_args, _view_kwargs): try: self.log_diagnostics(request) + if CLOSE_ANOMALOUS_SPANS.is_enabled(): + self.close_anomalous_spans(request) except BaseException as e: # If there's an error, it will probably hit every request, # so let's just log it once. @@ -86,6 +98,57 @@ def process_view(self, request, _view_func, _view_args, _view_kwargs): f"(suppressing further errors): {e!r}" ) + # pylint: disable=protected-access + def close_anomalous_spans(self, request): + """ + Detect anomalous spans and close them. + + This closes any open spans that are ancestors of the current + request. The trace will still have two requests concatenated + together, but the problematic spans should not affect + future requests. + + Only activates if the root span is itself closed, which is a + cheap thing to check. + """ + # If the root span is still open, probably not an anomalous trace. + if self.dd_tracer.current_root_span().duration is None: + return # nothing to do! + + # Walk upwards until we find the django.request span. + walk_span = self.dd_tracer.current_span() + while walk_span.name != 'django.request': + walk_span = walk_span._parent + if walk_span is None: + # If we can't find the django.request root, there's + # something bad about our assumptions and we should + # not attempt a fix. + log.error( + "Did not find django.request span when walking anomalous trace " + "to root. Not attempting a fix." + ) + return + + # Go "above" the request span + walk_span = walk_span._parent + + # Now close everything above the current request span that's + # still open, logging as we go. + while walk_span is not None: + # We call finish() individually rather than + # finish_with_ancestors() because this gives us a chance + # to log each one. + if walk_span.duration is None: + walk_span.finish() + log.info( + f"Closed span in anomalous trace: name={walk_span.name} " + f"id={walk_span.span_id:x} trace={walk_span.trace_id:x}" + ) + # Keep walking up even if we discover closed spans; we've + # previously seen multiple contiguous segments of open + # spans separated by closed ones. + walk_span = walk_span._parent + def log_diagnostics(self, request): """ Contains all the actual logging logic. diff --git a/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py b/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py index ee21888..4ae05c8 100644 --- a/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py @@ -3,13 +3,14 @@ """ import re +from contextlib import ExitStack from unittest.mock import Mock, patch import ddt import ddtrace from django.test import TestCase, override_settings -from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware +from ..middleware import CLOSE_ANOMALOUS_SPANS, DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware def fake_view(_request): @@ -24,7 +25,7 @@ def make_middleware(self): """Make an instance of the middleware with current settings.""" return DatadogDiagnosticMiddleware(fake_view) - def run_middleware(self, middleware=None): + def run_middleware(self, middleware=None, check_error_state=True): """Run the middleware using a fake request.""" if middleware is None: middleware = self.make_middleware() @@ -36,6 +37,9 @@ def run_middleware(self, middleware=None): middleware.process_view(request, None, None, None) + if check_error_state: + assert middleware.error is False + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') def test_log_diagnostics_error_only_once(self, mock_log_error): """ @@ -48,8 +52,9 @@ def test_log_diagnostics_error_only_once(self, mock_log_error): bad_method = Mock(side_effect=lambda request: 1/0) middleware.log_diagnostics = bad_method - self.run_middleware(middleware) - self.run_middleware(middleware) + self.run_middleware(middleware, check_error_state=False) + self.run_middleware(middleware, check_error_state=False) + assert middleware.error is True # Called twice assert len(bad_method.call_args_list) == 2 @@ -74,6 +79,7 @@ def test_log_diagnostics_error_only_once(self, mock_log_error): def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning): with ( patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=enabled), + patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False), patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), # Need at least two levels of spans in order to fake # an anomaly. (Otherwise current_root_span returns None.) @@ -108,6 +114,7 @@ def test_anomalous_trace_truncation(self, mock_log_warning): """ with ( patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True), + patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False), patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), # Need at least two levels of spans in order to fake # an anomaly. (Otherwise current_root_span returns None.) @@ -134,6 +141,7 @@ def test_anomalous_trace_truncation(self, mock_log_warning): def test_log_root_span(self, mock_log_info): with ( patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False), + patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False), patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True), # Need at least two levels of spans for interesting logging ddtrace.tracer.trace("local_root"), @@ -149,3 +157,87 @@ def test_log_root_span(self, mock_log_info): r"current span = name='inner_span' .*", log_msg ) + + def run_close_with(self, *, enabled, anomalous, ancestors=None): + """ + Run a "close anomalous spans" scenario with supplied settings. + + ancestors is a list of span operation names, defaulting to + something reasonable if not supplied. + """ + with ( + patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False), + patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=enabled), + patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), + ExitStack() as stack, + ): + if ancestors is None: + ancestors = [ + 'django.request', 'django.view', + 'celery.apply', + # ^ will need to close some of these + 'django.request', 'django.view', + ] + for ancestor_name in ancestors: + stack.enter_context(ddtrace.tracer.trace(ancestor_name)) + # make anomaly readily detectable + if anomalous: + ddtrace.tracer.current_root_span().finish() + + self.run_middleware() + + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info') + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') + def test_close_disabled(self, mock_log_error, mock_log_info): + """ + Confirm that nothing interesting happens when close-spans flag is disabled. + """ + self.run_close_with(enabled=False, anomalous=True) + + mock_log_error.assert_not_called() + mock_log_info.assert_not_called() + + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info') + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') + def test_close_applied(self, mock_log_error, mock_log_info): + """ + Confirm that anomalous spans are closed, at least for future requests. + """ + self.run_close_with(enabled=True, anomalous=True) + + mock_log_error.assert_not_called() + + # Expect to close celery.apply and the one above it (but we've + # already closed the root, above). + assert len(mock_log_info.call_args_list) == 2 + assert [call[0][0].split(' id=')[0] for call in mock_log_info.call_args_list] == [ + "Closed span in anomalous trace: name=celery.apply", + "Closed span in anomalous trace: name=django.view", + ] + + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info') + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') + def test_close_not_needed(self, mock_log_error, mock_log_info): + """ + Confirm that no logging when anomalous trace not present. + """ + self.run_close_with(enabled=True, anomalous=False) + + mock_log_error.assert_not_called() + mock_log_info.assert_not_called() + + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info') + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') + def test_close_missing_request(self, mock_log_error, mock_log_info): + """ + Check that we look for the expected ancestor and only close above it. + """ + self.run_close_with(enabled=True, anomalous=True, ancestors=[ + # Artificial scenario standing in for something unexpected. + 'django.view', 'celery.apply', 'django.view', + ]) + + mock_log_error.assert_called_once_with( + "Did not find django.request span when walking anomalous trace to root. Not attempting a fix." + ) + mock_log_info.assert_not_called()