diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5ed7f41..d42b0ec 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,12 @@ Change Log Unreleased ~~~~~~~~~~ +[4.4.0] - 2024-09-10 +~~~~~~~~~~~~~~~~~~~~ +Changed +------- +* Datadog diagnostics now logs ancestor spans when an anomaly is encountered, up to a limit of 10 (controlled by new Django setting ``DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH``). Spans are logged in full and on separate lines, so this logging is now much more verbose; consider only enabling this logging for short periods. Log format of first line has also changed slightly. + [4.3.0] - 2024-08-22 ~~~~~~~~~~~~~~~~~~~~ Added diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index eaffb83..94612b4 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '4.3.0' +__version__ = '4.4.0' diff --git a/edx_arch_experiments/datadog_diagnostics/middleware.py b/edx_arch_experiments/datadog_diagnostics/middleware.py index 844695c..79c77b1 100644 --- a/edx_arch_experiments/datadog_diagnostics/middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/middleware.py @@ -9,6 +9,7 @@ import logging import time +from django.conf import settings from django.core.exceptions import MiddlewareNotUsed from edx_toggles.toggles import WaffleFlag @@ -61,6 +62,13 @@ def __init__(self, get_response): raise MiddlewareNotUsed # pylint: disable=raise-missing-from self.worker_start_epoch = time.time() + # .. setting_name: DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH + # .. setting_default: 10 + # .. setting_description: Controls how many ancestors spans are logged + # when anomalous traces are detected. This limits log size when very + # deep span trees are present (such as in anomalous traces, or even + # just when each middleware is given a span). + self.log_span_ancestors_depth = getattr(settings, "DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH", 10) def __call__(self, request): return self.get_response(request) @@ -82,25 +90,49 @@ def log_diagnostics(self, request): """ Contains all the actual logging logic. """ + current_span = self.dd_tracer.current_span() local_root_span = self.dd_tracer.current_root_span() if DETECT_ANOMALOUS_TRACE.is_enabled(): + # For testing, uncomment this line to fake an anomalous span: + # local_root_span.finish() root_duration_s = local_root_span.duration if root_duration_s is not None: - worker_run_time_s = time.time() - self.worker_start_epoch - log.warning( - f"Anomalous Datadog local root span (duration already set): " - f"id = {local_root_span.trace_id:x}; " - f"duration = {root_duration_s:0.3f} sec; " - f"worker age = {worker_run_time_s:0.3f} sec" - ) + self.log_anomalous_trace(current_span, local_root_span) if LOG_ROOT_SPAN.is_enabled(): route_pattern = getattr(request.resolver_match, 'route', None) - current_span = self.dd_tracer.current_span() # pylint: disable=protected-access log.info( f"Datadog span diagnostics: Route = {route_pattern}; " f"local root span = {local_root_span._pprint()}; " f"current span = {current_span._pprint()}" ) + + def log_anomalous_trace(self, current_span, local_root_span): + worker_run_time_s = time.time() - self.worker_start_epoch + + # Build up a list of spans from current back towards the root, up to a limit. + ancestors = [] + walk_span = current_span + while len(ancestors) < self.log_span_ancestors_depth and walk_span is not None: + ancestors.insert(0, walk_span._pprint()) # pylint: disable=protected-access + walk_span = walk_span._parent # pylint: disable=protected-access + + trunc = "(ancestors truncated)\n" if walk_span else "" + + if local_root_span.duration: + duration_msg = f"duration={local_root_span.duration:0.3f}" + else: + # Should only occur during local testing of this + # middleware, when forcing this code path to run. + duration_msg = "duration not set" + + msg = ( + "Anomalous Datadog local root span: " + f"trace_id={local_root_span.trace_id:x}; " + f"{duration_msg}; " + f"worker_age={worker_run_time_s:0.3f}; span ancestry:" + ) + + log.warning(msg + "\n" + trunc + "\n".join(ancestors)) # pylint: disable=logging-not-lazy diff --git a/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py b/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py index 4e48619..ee21888 100644 --- a/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py @@ -7,7 +7,7 @@ import ddt import ddtrace -from django.test import TestCase +from django.test import TestCase, override_settings from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware @@ -20,16 +20,21 @@ def fake_view(_request): class TestDatadogDiagnosticMiddleware(TestCase): """Tests for DatadogDiagnosticMiddleware.""" - def setUp(self): - self.middleware = DatadogDiagnosticMiddleware(fake_view) + def make_middleware(self): + """Make an instance of the middleware with current settings.""" + return DatadogDiagnosticMiddleware(fake_view) - def run_middleware(self): + def run_middleware(self, middleware=None): """Run the middleware using a fake request.""" + if middleware is None: + middleware = self.make_middleware() + resolver = Mock() resolver.route = "/some/path" request = Mock() request.resolver_match = resolver - self.middleware.process_view(request, None, None, None) + + middleware.process_view(request, None, None, None) @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error') def test_log_diagnostics_error_only_once(self, mock_log_error): @@ -38,12 +43,13 @@ def test_log_diagnostics_error_only_once(self, mock_log_error): The method should still be called every time in case it is still doing useful work before the error, though. """ + middleware = self.make_middleware() bad_method = Mock(side_effect=lambda request: 1/0) - self.middleware.log_diagnostics = bad_method + middleware.log_diagnostics = bad_method - self.run_middleware() - self.run_middleware() + self.run_middleware(middleware) + self.run_middleware(middleware) # Called twice assert len(bad_method.call_args_list) == 2 @@ -71,7 +77,8 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning): patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), # Need at least two levels of spans in order to fake # an anomaly. (Otherwise current_root_span returns None.) - ddtrace.tracer.trace("test_local_root"), + ddtrace.tracer.trace("local_root"), + ddtrace.tracer.trace("intermediary_span"), ddtrace.tracer.trace("inner_span"), ): if cause_anomaly: @@ -81,21 +88,55 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning): if enabled and cause_anomaly: mock_log_warning.assert_called_once() log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call + assert re.fullmatch( - r"Anomalous Datadog local root span \(duration already set\): " - r"id = [0-9A-Fa-f]+; duration = [0-9]\.[0-9]{3} sec; worker age = [0-9]\.[0-9]{3} sec", + r"Anomalous Datadog local root span: " + r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n" + r"name='local_root'.*duration=[0-9]+.*\n" + r"name='intermediary_span'.*duration=None.*\n" + r"name='inner_span'.*duration=None.*", log_msg ) else: mock_log_warning.assert_not_called() + @override_settings(DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH=2) + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.warning') + def test_anomalous_trace_truncation(self, mock_log_warning): + """ + Test that truncation works, returning N most proximal spans. + """ + with ( + patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True), + patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False), + # Need at least two levels of spans in order to fake + # an anomaly. (Otherwise current_root_span returns None.) + ddtrace.tracer.trace("local_root"), + ddtrace.tracer.trace("intermediary_span"), + ddtrace.tracer.trace("inner_span"), + ): + ddtrace.tracer.current_root_span().finish() # cause anomaly + self.run_middleware() + + mock_log_warning.assert_called_once() + log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call + + assert re.fullmatch( + r"Anomalous Datadog local root span: " + r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n" + r"\(ancestors truncated\)\n" # difference here + r"name='intermediary_span'.*duration=None.*\n" + r"name='inner_span'.*duration=None.*", + log_msg + ) + @patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info') def test_log_root_span(self, mock_log_info): with ( patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False), patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True), # Need at least two levels of spans for interesting logging - ddtrace.tracer.trace("test_local_root"), + ddtrace.tracer.trace("local_root"), ddtrace.tracer.trace("inner_span"), ): self.run_middleware() @@ -104,7 +145,7 @@ def test_log_root_span(self, mock_log_info): log_msg = mock_log_info.call_args_list[0][0][0] # first arg of first call assert re.fullmatch( r"Datadog span diagnostics: Route = /some/path; " - r"local root span = name='test_local_root' .*; " + r"local root span = name='local_root' .*; " r"current span = name='inner_span' .*", log_msg )