From 196e0c2a36f888079227624bc76903b4f7a333fc Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Mon, 9 Sep 2024 21:23:12 +0000 Subject: [PATCH] feat: Log ancestors of current span when anomaly found Datadog believes that a lingering unfinished celery span from a previous request might be becoming the active context for all future Django requests, causing them to become parented to it. To gather evidence, we'll need to learn not just what the trace root span is, but whether there are unexpected intermediary spans as well. There may be a great many ancestors, especially with the trace concatenation in play, so we'll walk upwards from the current span towards the root until a limit is reached. This should allow us to find if the current Django request's span has a parent, and what it is. - New setting `DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH` to control walk depth - Move anomaly logging to its own method. - Log ancestry of N proximal spans, and adjust formatting of logging to be more compact. Changes relating to testing: - Add comment on how to do manual testing -- calling finish() on the root span lets us fake an anomaly. - Lift `current_span` to early in `log_diagnostics`. This is helpful during manual testing when `DD_DJANGO_INSTRUMENT_MIDDLEWARE=false`, because in that case, the current span *is* the root span. But finishing that span means we can no longer call current_root_span to retrieve it! Retrieving it before that point allows this trick to still work with our code. - Need to move middleware setup out of `setUp` method because we need to be able to adjust Django settings before the middleware is constructed. `run_middleware` now takes an optional `middleware` param. - Add intermediary span in unit tests so that truncation can be more clearly observed, and to help demonstrate that walk and print order is as expected. - Rename test local-root span to be in line with other naming. Don't need the "test_" prefix. --- CHANGELOG.rst | 6 ++ edx_arch_experiments/__init__.py | 2 +- .../datadog_diagnostics/middleware.py | 48 ++++++++++--- .../tests/test_middleware.py | 67 +++++++++++++++---- 4 files changed, 101 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5ed7f41..d42b0ec 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,12 @@ Change Log Unreleased ~~~~~~~~~~ +[4.4.0] - 2024-09-10 +~~~~~~~~~~~~~~~~~~~~ +Changed +------- +* Datadog diagnostics now logs ancestor spans when an anomaly is encountered, up to a limit of 10 (controlled by new Django setting ``DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH``). Spans are logged in full and on separate lines, so this logging is now much more verbose; consider only enabling this logging for short periods. Log format of first line has also changed slightly. + [4.3.0] - 2024-08-22 ~~~~~~~~~~~~~~~~~~~~ Added diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index eaffb83..94612b4 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '4.3.0' +__version__ = '4.4.0' diff --git a/edx_arch_experiments/datadog_diagnostics/middleware.py b/edx_arch_experiments/datadog_diagnostics/middleware.py index 844695c..79c77b1 100644 --- a/edx_arch_experiments/datadog_diagnostics/middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/middleware.py @@ -9,6 +9,7 @@ import logging import time +from django.conf import settings from django.core.exceptions import MiddlewareNotUsed from edx_toggles.toggles import WaffleFlag @@ -61,6 +62,13 @@ def __init__(self, get_response): raise MiddlewareNotUsed # pylint: disable=raise-missing-from self.worker_start_epoch = time.time() + # .. setting_name: DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH + # .. setting_default: 10 + # .. setting_description: Controls how many ancestors spans are logged + # when anomalous traces are detected. This limits log size when very + # deep span trees are present (such as in anomalous traces, or even + # just when each middleware is given a span). + self.log_span_ancestors_depth = getattr(settings, "DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH", 10) def __call__(self, request): return self.get_response(request) @@ -82,25 +90,49 @@ def log_diagnostics(self, request): """ Contains all the actual logging logic. """ + current_span = self.dd_tracer.current_span() local_root_span = self.dd_tracer.current_root_span() if DETECT_ANOMALOUS_TRACE.is_enabled(): + # For testing, uncomment this line to fake an anomalous span: + # local_root_span.finish() root_duration_s = local_root_span.duration if root_duration_s is not None: - worker_run_time_s = time.time() - self.worker_start_epoch - log.warning( - f"Anomalous Datadog local root span (duration already set): " - f"id = {local_root_span.trace_id:x}; " - f"duration = {root_duration_s:0.3f} sec; " - f"worker age = {worker_run_time_s:0.3f} sec" - ) + self.log_anomalous_trace(current_span, local_root_span) if LOG_ROOT_SPAN.is_enabled(): route_pattern = getattr(request.resolver_match, 'route', None) - current_span = self.dd_tracer.current_span() # pylint: disable=protected-access log.info( f"Datadog span diagnostics: Route = {route_pattern}; " f"local root span = {local_root_span._pprint()}; " f"current span = {current_span._pprint()}" ) + + def log_anomalous_trace(self, current_span, local_root_span): + worker_run_time_s = time.time() - self.worker_start_epoch + + # Build up a list of spans from current back towards the root, up to a limit. + ancestors = [] + walk_span = current_span + while len(ancestors) < self.log_span_ancestors_depth and walk_span is not None: + ancestors.insert(0, walk_span._pprint()) # pylint: disable=protected-access + walk_span = walk_span._parent # pylint: disable=protected-access + + trunc = "(ancestors truncated)\n" if walk_span else "" + + if local_root_span.duration: + duration_msg = f"duration={local_root_span.duration:0.3f}" + else: + # Should only occur during local testing of this + # middleware, when forcing this code path to run. + duration_msg = "duration not set" + + msg = ( + "Anomalous Datadog local root span: " + f"trace_id={local_root_span.trace_id:x}; " + f"{duration_msg}; " + f"worker_age={worker_run_time_s:0.3f}; span ancestry:" + ) + + log.warning(msg + "\n" + trunc + "\n".join(ancestors)) # pylint: disable=logging-not-lazy diff --git a/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py b/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py index 4e48619..ee21888 100644 --- a/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py @@ -7,7 +7,7 @@ import ddt import ddtrace -from django.test import TestCase +from django.test import TestCase, override_settings from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware @@ -20,16 +20,21 @@ def fake_view(_request): class TestDatadogDiagnosticMiddleware(TestCase): """Tests for DatadogDiagnosticMiddleware.""" - def setUp(self): - self.middleware = DatadogDiagnosticMiddleware(fake_view) + def make_middleware(self): + """Make an instance of the middleware with current settings.""" + return DatadogDiagnosticMiddleware(fake_view) - def run_middleware(self): + def run_middleware(self, middleware=None): """Run the middleware using a fake request.""" + if middleware is None: + middleware = self.make_middleware() + resolver = Mock() resolver.route = "/some/path" request = Mock() request.resolver_match = resolver - self.middleware.process_view(request, None, None, None) + + middleware.process_view(request, None, None, None) @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') def test_log_diagnostics_error_only_once(self, mock_log_error): @@ -38,12 +43,13 @@ def test_log_diagnostics_error_only_once(self, mock_log_error): The method should still be called every time in case it is still doing useful work before the error, though. """ + middleware = self.make_middleware() bad_method = Mock(side_effect=lambda request: 1/0) - self.middleware.log_diagnostics = bad_method + middleware.log_diagnostics = bad_method - self.run_middleware() - self.run_middleware() + self.run_middleware(middleware) + self.run_middleware(middleware) # Called twice assert len(bad_method.call_args_list) == 2 @@ -71,7 +77,8 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning): patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), # Need at least two levels of spans in order to fake # an anomaly. (Otherwise current_root_span returns None.) - ddtrace.tracer.trace("test_local_root"), + ddtrace.tracer.trace("local_root"), + ddtrace.tracer.trace("intermediary_span"), ddtrace.tracer.trace("inner_span"), ): if cause_anomaly: @@ -81,21 +88,55 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning): if enabled and cause_anomaly: mock_log_warning.assert_called_once() log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call + assert re.fullmatch( - r"Anomalous Datadog local root span \(duration already set\): " - r"id = [0-9A-Fa-f]+; duration = [0-9]\.[0-9]{3} sec; worker age = [0-9]\.[0-9]{3} sec", + r"Anomalous Datadog local root span: " + r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n" + r"name='local_root'.*duration=[0-9]+.*\n" + r"name='intermediary_span'.*duration=None.*\n" + r"name='inner_span'.*duration=None.*", log_msg ) else: mock_log_warning.assert_not_called() + @override_settings(DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH=2) + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.warning') + def test_anomalous_trace_truncation(self, mock_log_warning): + """ + Test that truncation works, returning N most proximal spans. + """ + with ( + patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True), + patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), + # Need at least two levels of spans in order to fake + # an anomaly. (Otherwise current_root_span returns None.) + ddtrace.tracer.trace("local_root"), + ddtrace.tracer.trace("intermediary_span"), + ddtrace.tracer.trace("inner_span"), + ): + ddtrace.tracer.current_root_span().finish() # cause anomaly + self.run_middleware() + + mock_log_warning.assert_called_once() + log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call + + assert re.fullmatch( + r"Anomalous Datadog local root span: " + r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n" + r"\(ancestors truncated\)\n" # difference here + r"name='intermediary_span'.*duration=None.*\n" + r"name='inner_span'.*duration=None.*", + log_msg + ) + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info') def test_log_root_span(self, mock_log_info): with ( patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False), patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True), # Need at least two levels of spans for interesting logging - ddtrace.tracer.trace("test_local_root"), + ddtrace.tracer.trace("local_root"), ddtrace.tracer.trace("inner_span"), ): self.run_middleware() @@ -104,7 +145,7 @@ def test_log_root_span(self, mock_log_info): log_msg = mock_log_info.call_args_list[0][0][0] # first arg of first call assert re.fullmatch( r"Datadog span diagnostics: Route = /some/path; " - r"local root span = name='test_local_root' .*; " + r"local root span = name='local_root' .*; " r"current span = name='inner_span' .*", log_msg )