Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Robustness improvements to datadog_diagnostics plugin #723

Merged
merged 1 commit into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ Change Log
Unreleased
~~~~~~~~~~

[3.5.0] - 2024-07-11
~~~~~~~~~~~~~~~~~~~~
Added
-----
* Toggle ``DATADOG_DIAGNOSTICS_ENABLE`` for disabling that plugin quickly if needed. (Feature remains enabled by default.)

Fixed
-----
* Limit the number of spans collected via new setting ``DATADOG_DIAGNOSTICS_MAX_SPANS``, defaulting to 100. This may help avoid memory leaks.
* Make accidental class variables into member variables in ``datadog_diagnostics``

[3.4.0] - 2024-07-10
~~~~~~~~~~~~~~~~~~~~
Added
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '3.4.0'
__version__ = '3.5.0'
34 changes: 29 additions & 5 deletions edx_arch_experiments/datadog_diagnostics/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,44 @@
import logging

from django.apps import AppConfig
from django.conf import settings

log = logging.getLogger(__name__)


# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
# .. toggle_implementation: DjangoSetting
# .. toggle_default: True
# .. toggle_description: Enables logging of Datadog diagnostics information.
# .. toggle_use_cases: circuit_breaker
# .. toggle_creation_date: 2024-07-11
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)

# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
# .. setting_default: 100
# .. setting_description: Limit of how many spans to hold onto and log
# when diagnosing Datadog tracing issues. This limits memory consumption
# avoids logging more data than is actually needed for diagnosis.
DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)


class MissingSpanProcessor:
"""Datadog span processor that logs unfinished spans at shutdown."""
spans_started = 0
spans_finished = 0
open_spans = {}

def __init__(self):
self.spans_started = 0
self.spans_finished = 0
self.open_spans = {}

def on_span_start(self, span):
self.spans_started += 1
self.open_spans[span.span_id] = span
if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS:
self.open_spans[span.span_id] = span

def on_span_finish(self, span):
self.spans_finished += 1
del self.open_spans[span.span_id]
self.open_spans.pop(span.span_id, None) # "delete if present"

def shutdown(self, _timeout):
log.info(f"Spans created = {self.spans_started}; spans finished = {self.spans_finished}")
Expand All @@ -39,6 +60,9 @@ class DatadogDiagnostics(AppConfig):
plugin_app = {}

def ready(self):
if not DATADOG_DIAGNOSTICS_ENABLE:
return

try:
from ddtrace import tracer # pylint: disable=import-outside-toplevel
tracer._span_processors.append(MissingSpanProcessor()) # pylint: disable=protected-access
Expand Down
Empty file.
51 changes: 51 additions & 0 deletions edx_arch_experiments/datadog_diagnostics/tests/test_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Tests for plugin app.
"""

from unittest.mock import patch

from django.test import TestCase

from .. import apps


class FakeSpan:
"""A fake Span instance that just carries a span_id."""
def __init__(self, span_id):
self.span_id = span_id

def _pprint(self):
return f"span_id={self.span_id}"


class TestMissingSpanProcessor(TestCase):
"""Tests for MissingSpanProcessor."""

@patch.object(apps, 'DATADOG_DIAGNOSTICS_MAX_SPANS', new=3)
def test_metrics(self):
proc = apps.MissingSpanProcessor()
ids = [2, 4, 6, 8, 10]

for span_id in ids:
proc.on_span_start(FakeSpan(span_id))

assert {(sk, sv.span_id) for sk, sv in proc.open_spans.items()} == {(2, 2), (4, 4), (6, 6)}
assert proc.spans_started == 5
assert proc.spans_finished == 0

for span_id in ids:
proc.on_span_finish(FakeSpan(span_id))

assert proc.open_spans.keys() == set()
assert proc.spans_started == 5
assert proc.spans_finished == 5

@patch('edx_arch_experiments.datadog_diagnostics.apps.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.apps.log.error')
def test_logging(self, mock_log_error, mock_log_info):
proc = apps.MissingSpanProcessor()
proc.on_span_start(FakeSpan(17))
proc.shutdown(0)

mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0")
mock_log_error.assert_called_once_with("Span created but not finished: span_id=17")
Loading