Skip to content

Commit

Permalink
fix: Remove early span-start logging; fix loading of Django settings (#…
Browse files Browse the repository at this point in the history
…772)

I still don't understand why this setting loading was broken -- maybe
there's something unusual about the way plugin apps are loaded?

This addresses #771
  • Loading branch information
timmc-edx authored Aug 13, 2024
1 parent 1345db5 commit 6be3a73
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 67 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,22 @@ Change Log
Unreleased
~~~~~~~~~~

[4.2.0] - 2024-08-13
~~~~~~~~~~~~~~~~~~~~
Fixed
-----
* Fixed loading of ``DATADOG_DIAGNOSTICS_ENABLE``, which was previously not loaded properly and therefore was always True. Also fixed loading of ``DATADOG_DIAGNOSTICS_MAX_SPANS``, which was presumably broken as well.

Removed
-------
* Removed early span-start logging. It never worked properly, possibly because workers are continually being destroyed and created, leading to high log volume.

[4.1.0] - 2024-08-09
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics will now log all span-starts for the first minute after server startup
* **WARNING**: Do not use this version; see 4.2.0 release notes.

[4.0.0] - 2024-08-05
~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.1.0'
__version__ = '4.2.0'
54 changes: 17 additions & 37 deletions edx_arch_experiments/datadog_diagnostics/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,13 @@
"""

import logging
import time

from django.apps import AppConfig
from django.conf import settings

log = logging.getLogger(__name__)


# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
# .. toggle_implementation: DjangoSetting
# .. toggle_default: True
# .. toggle_description: Enables logging of Datadog diagnostics information.
# .. toggle_use_cases: circuit_breaker
# .. toggle_creation_date: 2024-07-11
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)

# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
# .. setting_default: 100
# .. setting_description: Limit of how many spans to hold onto and log
# when diagnosing Datadog tracing issues. This limits memory consumption
# avoids logging more data than is actually needed for diagnosis.
DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)

# .. setting_name: DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD
# .. setting_default: 60
# .. setting_description: Log all span starts for this many seconds after worker
# startup.
DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD = getattr(
settings,
'DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD',
60
)


# pylint: disable=missing-function-docstring
class MissingSpanProcessor:
"""Datadog span processor that logs unfinished spans at shutdown."""
Expand All @@ -46,20 +18,19 @@ def __init__(self):
self.spans_started = 0
self.spans_finished = 0
self.open_spans = {}
self.log_all_until = time.time() + DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD

# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
# .. setting_default: 100
# .. setting_description: Limit of how many spans to hold onto and log
# when diagnosing Datadog tracing issues. This limits memory consumption
# avoids logging more data than is actually needed for diagnosis.
self.DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)

def on_span_start(self, span):
self.spans_started += 1
if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS:
if len(self.open_spans) < self.DATADOG_DIAGNOSTICS_MAX_SPANS:
self.open_spans[span.span_id] = span

# We believe that the anomalous traces always come from a
# single span that is created early in the lifetime of a
# gunicorn worker. If we log *every* span-start in this early
# period, we may be able to observe something interesting.
if time.time() <= self.log_all_until:
log.info(f"Early span-start sample: {span._pprint()}") # pylint: disable=protected-access

def on_span_finish(self, span):
self.spans_finished += 1
self.open_spans.pop(span.span_id, None) # "delete if present"
Expand All @@ -80,6 +51,15 @@ class DatadogDiagnostics(AppConfig):
plugin_app = {}

def ready(self):
# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
# .. toggle_implementation: DjangoSetting
# .. toggle_default: True
# .. toggle_description: Enables logging of Datadog diagnostics information.
# .. toggle_use_cases: circuit_breaker
# .. toggle_creation_date: 2024-07-11
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)

if not DATADOG_DIAGNOSTICS_ENABLE:
return

Expand Down
62 changes: 33 additions & 29 deletions edx_arch_experiments/datadog_diagnostics/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
Tests for plugin app.
"""

from unittest.mock import call, patch
from unittest.mock import patch

from django.test import TestCase
from ddtrace import tracer
from django.test import TestCase, override_settings

from .. import apps

Expand All @@ -21,7 +22,35 @@ def _pprint(self):
class TestMissingSpanProcessor(TestCase):
"""Tests for MissingSpanProcessor."""

@patch.object(apps, 'DATADOG_DIAGNOSTICS_MAX_SPANS', new=3)
def test_feature_switch(self):
"""
Regression test -- the use of override_settings ensures that we read
the setting as needed, and not once at module load time (when it's
not guaranteed to be available.)
"""
def initialize():
apps.DatadogDiagnostics('edx_arch_experiments.datadog_diagnostics', apps).ready()

def get_processor_list():
# pylint: disable=protected-access
return [type(sp).__name__ for sp in tracer._span_processors]

with override_settings(DATADOG_DIAGNOSTICS_ENABLE=False):
initialize()
assert sorted(get_processor_list()) == [
'EndpointCallCounterProcessor', 'TopLevelSpanProcessor',
]

# The True case needs to come second because the initializer
# appends to the list and there isn't an immediately obvious
# way of resetting it.
with override_settings(DATADOG_DIAGNOSTICS_ENABLE=True):
initialize()
assert sorted(get_processor_list()) == [
'EndpointCallCounterProcessor', 'MissingSpanProcessor', 'TopLevelSpanProcessor',
]

@override_settings(DATADOG_DIAGNOSTICS_MAX_SPANS=3)
def test_metrics(self):
proc = apps.MissingSpanProcessor()
ids = [2, 4, 6, 8, 10]
Expand All @@ -47,30 +76,5 @@ def test_logging(self, mock_log_error, mock_log_info):
proc.on_span_start(FakeSpan(17))
proc.shutdown(0)

assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=17"),
call("Spans created = 1; spans finished = 0"),
]
mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0")
mock_log_error.assert_called_once_with("Span created but not finished: span_id=17")

@patch('edx_arch_experiments.datadog_diagnostics.apps.log.info')
def test_early_span_logging_cutoff(self, mock_log_info):
with patch('edx_arch_experiments.datadog_diagnostics.apps.time.time', side_effect=[
# Setup
1700000000,
# Span-start time checks
1700000020,
1700000040,
1700010000,
]):
proc = apps.MissingSpanProcessor()
# Three spans are started
proc.on_span_start(FakeSpan(44))
proc.on_span_start(FakeSpan(45))
proc.on_span_start(FakeSpan(46))

# Just two early span-starts are logged
assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=44"),
call("Early span-start sample: span_id=45"),
]

0 comments on commit 6be3a73

Please sign in to comment.