Skip to content

Commit

Permalink
feat: Log ancestors of current span when anomaly found
Browse files Browse the repository at this point in the history
Datadog believes that a lingering unfinished celery span from a previous
request might be becoming the active context for all future Django
requests, causing them to become parented to it. To gather evidence, we'll
need to learn not just what the trace root span is, but whether there are
unexpected intermediary spans as well.

There may be a great many ancestors, especially with the trace
concatenation in play, so we'll walk upwards from the current span towards
the root until a limit is reached. This should allow us to find if the
current Django request's span has a parent, and what it is.

- New setting `DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH` to control walk depth
- Move anomaly logging to its own method.
- Log ancestry of N proximal spans, and adjust formatting of logging to be
  more compact.

Changes relating to testing:

- Add comment on how to do manual testing -- calling finish() on the root
  span lets us fake an anomaly.
- Lift `current_span` to early in `log_diagnostics`. This is helpful during
  manual testing when `DD_DJANGO_INSTRUMENT_MIDDLEWARE=false`, because in
  that case, the current span *is* the root span. But finishing that span
  means we can no longer call current_root_span to retrieve it! Retrieving
  it before that point allows this trick to still work with our code.
- Need to move middleware setup out of `setUp` method because we need to be
  able to adjust Django settings before the middleware is constructed.
  `run_middleware` now takes an optional `middleware` param.
- Add intermediary span in unit tests so that truncation can be more
  clearly observed, and to help demonstrate that walk and print order is
  as expected.
- Rename test local-root span to be in line with other naming. Don't need
  the "test_" prefix.
  • Loading branch information
timmc-edx committed Sep 10, 2024
1 parent 560a6fb commit 196e0c2
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 22 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Change Log
Unreleased
~~~~~~~~~~

[4.4.0] - 2024-09-10
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics now logs ancestor spans when an anomaly is encountered, up to a limit of 10 (controlled by new Django setting ``DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH``). Spans are logged in full and on separate lines, so this logging is now much more verbose; consider only enabling this logging for short periods. Log format of first line has also changed slightly.

[4.3.0] - 2024-08-22
~~~~~~~~~~~~~~~~~~~~
Added
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.3.0'
__version__ = '4.4.0'
48 changes: 40 additions & 8 deletions edx_arch_experiments/datadog_diagnostics/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import logging
import time

from django.conf import settings
from django.core.exceptions import MiddlewareNotUsed
from edx_toggles.toggles import WaffleFlag

Expand Down Expand Up @@ -61,6 +62,13 @@ def __init__(self, get_response):
raise MiddlewareNotUsed # pylint: disable=raise-missing-from

self.worker_start_epoch = time.time()
# .. setting_name: DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH
# .. setting_default: 10
# .. setting_description: Controls how many ancestors spans are logged
# when anomalous traces are detected. This limits log size when very
# deep span trees are present (such as in anomalous traces, or even
# just when each middleware is given a span).
self.log_span_ancestors_depth = getattr(settings, "DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH", 10)

def __call__(self, request):
return self.get_response(request)
Expand All @@ -82,25 +90,49 @@ def log_diagnostics(self, request):
"""
Contains all the actual logging logic.
"""
current_span = self.dd_tracer.current_span()
local_root_span = self.dd_tracer.current_root_span()

if DETECT_ANOMALOUS_TRACE.is_enabled():
# For testing, uncomment this line to fake an anomalous span:
# local_root_span.finish()
root_duration_s = local_root_span.duration
if root_duration_s is not None:
worker_run_time_s = time.time() - self.worker_start_epoch
log.warning(
f"Anomalous Datadog local root span (duration already set): "
f"id = {local_root_span.trace_id:x}; "
f"duration = {root_duration_s:0.3f} sec; "
f"worker age = {worker_run_time_s:0.3f} sec"
)
self.log_anomalous_trace(current_span, local_root_span)

if LOG_ROOT_SPAN.is_enabled():
route_pattern = getattr(request.resolver_match, 'route', None)
current_span = self.dd_tracer.current_span()
# pylint: disable=protected-access
log.info(
f"Datadog span diagnostics: Route = {route_pattern}; "
f"local root span = {local_root_span._pprint()}; "
f"current span = {current_span._pprint()}"
)

def log_anomalous_trace(self, current_span, local_root_span):
worker_run_time_s = time.time() - self.worker_start_epoch

# Build up a list of spans from current back towards the root, up to a limit.
ancestors = []
walk_span = current_span
while len(ancestors) < self.log_span_ancestors_depth and walk_span is not None:
ancestors.insert(0, walk_span._pprint()) # pylint: disable=protected-access
walk_span = walk_span._parent # pylint: disable=protected-access

trunc = "(ancestors truncated)\n" if walk_span else ""

if local_root_span.duration:
duration_msg = f"duration={local_root_span.duration:0.3f}"
else:
# Should only occur during local testing of this
# middleware, when forcing this code path to run.
duration_msg = "duration not set"

msg = (
"Anomalous Datadog local root span: "
f"trace_id={local_root_span.trace_id:x}; "
f"{duration_msg}; "
f"worker_age={worker_run_time_s:0.3f}; span ancestry:"
)

log.warning(msg + "\n" + trunc + "\n".join(ancestors)) # pylint: disable=logging-not-lazy
67 changes: 54 additions & 13 deletions edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import ddt
import ddtrace
from django.test import TestCase
from django.test import TestCase, override_settings

from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware

Expand All @@ -20,16 +20,21 @@ def fake_view(_request):
class TestDatadogDiagnosticMiddleware(TestCase):
"""Tests for DatadogDiagnosticMiddleware."""

def setUp(self):
self.middleware = DatadogDiagnosticMiddleware(fake_view)
def make_middleware(self):
"""Make an instance of the middleware with current settings."""
return DatadogDiagnosticMiddleware(fake_view)

def run_middleware(self):
def run_middleware(self, middleware=None):
"""Run the middleware using a fake request."""
if middleware is None:
middleware = self.make_middleware()

resolver = Mock()
resolver.route = "/some/path"
request = Mock()
request.resolver_match = resolver
self.middleware.process_view(request, None, None, None)

middleware.process_view(request, None, None, None)

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_log_diagnostics_error_only_once(self, mock_log_error):
Expand All @@ -38,12 +43,13 @@ def test_log_diagnostics_error_only_once(self, mock_log_error):
The method should still be called every time in case it is still doing
useful work before the error, though.
"""
middleware = self.make_middleware()

bad_method = Mock(side_effect=lambda request: 1/0)
self.middleware.log_diagnostics = bad_method
middleware.log_diagnostics = bad_method

self.run_middleware()
self.run_middleware()
self.run_middleware(middleware)
self.run_middleware(middleware)

# Called twice
assert len(bad_method.call_args_list) == 2
Expand Down Expand Up @@ -71,7 +77,8 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning):
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
# Need at least two levels of spans in order to fake
# an anomaly. (Otherwise current_root_span returns None.)
ddtrace.tracer.trace("test_local_root"),
ddtrace.tracer.trace("local_root"),
ddtrace.tracer.trace("intermediary_span"),
ddtrace.tracer.trace("inner_span"),
):
if cause_anomaly:
Expand All @@ -81,21 +88,55 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning):
if enabled and cause_anomaly:
mock_log_warning.assert_called_once()
log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call

assert re.fullmatch(
r"Anomalous Datadog local root span \(duration already set\): "
r"id = [0-9A-Fa-f]+; duration = [0-9]\.[0-9]{3} sec; worker age = [0-9]\.[0-9]{3} sec",
r"Anomalous Datadog local root span: "
r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n"
r"name='local_root'.*duration=[0-9]+.*\n"
r"name='intermediary_span'.*duration=None.*\n"
r"name='inner_span'.*duration=None.*",
log_msg
)
else:
mock_log_warning.assert_not_called()

@override_settings(DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH=2)
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.warning')
def test_anomalous_trace_truncation(self, mock_log_warning):
"""
Test that truncation works, returning N most proximal spans.
"""
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
# Need at least two levels of spans in order to fake
# an anomaly. (Otherwise current_root_span returns None.)
ddtrace.tracer.trace("local_root"),
ddtrace.tracer.trace("intermediary_span"),
ddtrace.tracer.trace("inner_span"),
):
ddtrace.tracer.current_root_span().finish() # cause anomaly
self.run_middleware()

mock_log_warning.assert_called_once()
log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call

assert re.fullmatch(
r"Anomalous Datadog local root span: "
r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n"
r"\(ancestors truncated\)\n" # difference here
r"name='intermediary_span'.*duration=None.*\n"
r"name='inner_span'.*duration=None.*",
log_msg
)

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
def test_log_root_span(self, mock_log_info):
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True),
# Need at least two levels of spans for interesting logging
ddtrace.tracer.trace("test_local_root"),
ddtrace.tracer.trace("local_root"),
ddtrace.tracer.trace("inner_span"),
):
self.run_middleware()
Expand All @@ -104,7 +145,7 @@ def test_log_root_span(self, mock_log_info):
log_msg = mock_log_info.call_args_list[0][0][0] # first arg of first call
assert re.fullmatch(
r"Datadog span diagnostics: Route = /some/path; "
r"local root span = name='test_local_root' .*; "
r"local root span = name='local_root' .*; "
r"current span = name='inner_span' .*",
log_msg
)

0 comments on commit 196e0c2

Please sign in to comment.