chore(llmobs): fix behavior for running multiple evaluators (#11986)
There was unexpected behavior when multiple evaluators were turned on: only the last
evaluator in the evaluators list was run, and it was run multiple times for a single
span.

Fix this by replacing `executor.map` with individual calls to `executor.submit`,
which does the same work but avoids the unexpected behavior above.
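
For illustration, here is a minimal sketch of the likely cause, using a hypothetical `Evaluator` class rather than the library's real evaluators: the lambda passed to `executor.map` closes over the loop variable `evaluator`, so by the time a worker thread runs it the loop may already have moved on, and every task ends up calling the last evaluator. `executor.submit(evaluator.run_and_submit_evaluation, span_event)` captures the bound method at submission time, so each task keeps the evaluator it was created for.

```python
import concurrent.futures

# Hypothetical stand-in for an evaluator; only the method name mirrors the real code.
class Evaluator:
    def __init__(self, label):
        self.label = label

    def run_and_submit_evaluation(self, span_event):
        print(self.label, span_event)

evaluators = [Evaluator("1"), Evaluator("2"), Evaluator("3")]
span_events = ["span-a"]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Buggy pattern: the lambda looks up `evaluator` when it runs, not when it is
    # submitted, so every task may see whatever the loop variable holds by then
    # (typically the last evaluator).
    for evaluator in evaluators:
        executor.map(
            lambda span_event: evaluator.run_and_submit_evaluation(span_event),
            span_events,
        )

    # Fixed pattern: submit the bound method and its argument directly, so each
    # task is tied to the evaluator it was created for.
    for evaluator in evaluators:
        for span_event in span_events:
            executor.submit(evaluator.run_and_submit_evaluation, span_event)
```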

To reproduce: the added test case in this PR fails if we switch back to
`executor.map`. [See this CI run for the reproduced
error](a6279df)

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [ ] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
lievan and lievan authored Jan 17, 2025
1 parent f67a358 commit 3aa6921
Showing 3 changed files with 34 additions and 20 deletions.
20 changes: 6 additions & 14 deletions ddtrace/llmobs/_evaluators/runner.py
@@ -115,20 +115,12 @@ def periodic(self, _wait_sync=False) -> None:
             self._buffer = []

         try:
-            if not _wait_sync:
-                for evaluator in self.evaluators:
-                    self.executor.map(
-                        lambda span_event: evaluator.run_and_submit_evaluation(span_event),
-                        [
-                            span_event
-                            for span_event, span in span_events_and_spans
-                            if self.sampler.sample(evaluator.LABEL, span)
-                        ],
-                    )
-            else:
-                for evaluator in self.evaluators:
-                    for span_event, span in span_events_and_spans:
-                        if self.sampler.sample(evaluator.LABEL, span):
+            for evaluator in self.evaluators:
+                for span_event, span in span_events_and_spans:
+                    if self.sampler.sample(evaluator.LABEL, span):
+                        if not _wait_sync:
+                            self.executor.submit(evaluator.run_and_submit_evaluation, span_event)
+                        else:
                             evaluator.run_and_submit_evaluation(span_event)
         except RuntimeError as e:
             logger.debug("failed to run evaluation: %s", e)
11 changes: 5 additions & 6 deletions tests/llmobs/_utils.py
@@ -531,28 +531,27 @@ def _llm_span_with_expected_ragas_inputs_in_messages(ragas_inputs=None):


 class DummyEvaluator:
-    LABEL = "dummy"
-
-    def __init__(self, llmobs_service):
+    def __init__(self, llmobs_service, label="dummy"):
         self.llmobs_service = llmobs_service
+        self.LABEL = label

     def run_and_submit_evaluation(self, span):
         self.llmobs_service.submit_evaluation(
             span_context=span,
-            label=DummyEvaluator.LABEL,
+            label=self.LABEL,
             value=1.0,
             metric_type="score",
         )


-def _dummy_evaluator_eval_metric_event(span_id, trace_id):
+def _dummy_evaluator_eval_metric_event(span_id, trace_id, label=None):
     return LLMObsEvaluationMetricEvent(
         join_on={"span": {"span_id": span_id, "trace_id": trace_id}},
         score_value=1.0,
         ml_app="unnamed-ml-app",
         timestamp_ms=mock.ANY,
         metric_type="score",
-        label=DummyEvaluator.LABEL,
+        label=label or "dummy",
         tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"],
     )

23 changes: 23 additions & 0 deletions tests/llmobs/test_llmobs_evaluator_runner.py
@@ -59,6 +59,29 @@ def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_me
     )


+@pytest.mark.vcr_logs
+def test_evaluator_runner_multiple_evaluators(llmobs, mock_llmobs_eval_metric_writer):
+    evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs)
+    evaluator_runner.evaluators += [
+        DummyEvaluator(llmobs_service=llmobs, label="1"),
+        DummyEvaluator(llmobs_service=llmobs, label="2"),
+        DummyEvaluator(llmobs_service=llmobs, label="3"),
+    ]
+    evaluator_runner.start()
+
+    evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN)
+
+    time.sleep(0.1)
+
+    calls = [call[0][0] for call in mock_llmobs_eval_metric_writer.enqueue.call_args_list]
+    sorted_calls = sorted(calls, key=lambda x: x["label"])
+    assert sorted_calls == [
+        _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="1"),
+        _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="2"),
+        _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="3"),
+    ]
+
+
 def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess):
     env = os.environ.copy()
     pypath = [os.path.dirname(os.path.dirname(os.path.dirname(__file__)))]
