chore(llmobs): fix behavior for running multiple evaluators (#11986)
There was unexpected behavior when multiple evaluators were turned on: only the last
evaluator in the evaluators list was run, and it was run multiple times for a single
span.

Fix this by replacing `executor.map` with individual calls to `executor.submit`,
which does the same work but avoids the unexpected behavior above.
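
For illustration, here is a minimal sketch of the likely cause, using a hypothetical `Evaluator` class rather than the library's real evaluators: the lambda passed to `executor.map` closes over the loop variable `evaluator`, so by the time a worker thread runs it the loop may already have moved on, and every task ends up calling the last evaluator. `executor.submit(evaluator.run_and_submit_evaluation, span_event)` captures the bound method at submission time, so each task keeps the evaluator it was created for.

```python
import concurrent.futures

# Hypothetical stand-in for an evaluator; only the method name mirrors the real code.
class Evaluator:
    def __init__(self, label):
        self.label = label

    def run_and_submit_evaluation(self, span_event):
        print(self.label, span_event)

evaluators = [Evaluator("1"), Evaluator("2"), Evaluator("3")]
span_events = ["span-a"]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Buggy pattern: the lambda looks up `evaluator` when it runs, not when it is
    # submitted, so every task may see whatever the loop variable holds by then
    # (typically the last evaluator).
    for evaluator in evaluators:
        executor.map(
            lambda span_event: evaluator.run_and_submit_evaluation(span_event),
            span_events,
        )

    # Fixed pattern: submit the bound method and its argument directly, so each
    # task is tied to the evaluator it was created for.
    for evaluator in evaluators:
        for span_event in span_events:
            executor.submit(evaluator.run_and_submit_evaluation, span_event)
```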

To reproduce: the added test case in this PR fails if we switch back to
`executor.map`. [See this CI run for the reproduced
error](a6279df)

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [ ] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
lievan and lievan authored Jan 17, 2025
1 parent f67a358 commit 3aa6921
Showing 3 changed files with 34 additions and 20 deletions.
20 changes: 6 additions & 14 deletions ddtrace/llmobs/_evaluators/runner.py
@@ -115,20 +115,12 @@ def periodic(self, _wait_sync=False) -> None:
             self._buffer = []

         try:
-            if not _wait_sync:
-                for evaluator in self.evaluators:
-                    self.executor.map(
-                        lambda span_event: evaluator.run_and_submit_evaluation(span_event),
-                        [
-                            span_event
-                            for span_event, span in span_events_and_spans
-                            if self.sampler.sample(evaluator.LABEL, span)
-                        ],
-                    )
-            else:
-                for evaluator in self.evaluators:
-                    for span_event, span in span_events_and_spans:
-                        if self.sampler.sample(evaluator.LABEL, span):
+            for evaluator in self.evaluators:
+                for span_event, span in span_events_and_spans:
+                    if self.sampler.sample(evaluator.LABEL, span):
+                        if not _wait_sync:
+                            self.executor.submit(evaluator.run_and_submit_evaluation, span_event)
+                        else:
                             evaluator.run_and_submit_evaluation(span_event)
         except RuntimeError as e:
             logger.debug("failed to run evaluation: %s", e)
11 changes: 5 additions & 6 deletions tests/llmobs/_utils.py
@@ -531,28 +531,27 @@ def _llm_span_with_expected_ragas_inputs_in_messages(ragas_inputs=None):


 class DummyEvaluator:
-    LABEL = "dummy"
-
-    def __init__(self, llmobs_service):
+    def __init__(self, llmobs_service, label="dummy"):
         self.llmobs_service = llmobs_service
+        self.LABEL = label

     def run_and_submit_evaluation(self, span):
         self.llmobs_service.submit_evaluation(
             span_context=span,
-            label=DummyEvaluator.LABEL,
+            label=self.LABEL,
             value=1.0,
             metric_type="score",
         )


-def _dummy_evaluator_eval_metric_event(span_id, trace_id):
+def _dummy_evaluator_eval_metric_event(span_id, trace_id, label=None):
     return LLMObsEvaluationMetricEvent(
         join_on={"span": {"span_id": span_id, "trace_id": trace_id}},
         score_value=1.0,
         ml_app="unnamed-ml-app",
         timestamp_ms=mock.ANY,
         metric_type="score",
-        label=DummyEvaluator.LABEL,
+        label=label or "dummy",
         tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"],
     )

23 changes: 23 additions & 0 deletions tests/llmobs/test_llmobs_evaluator_runner.py
@@ -59,6 +59,29 @@ def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_me
     )


+@pytest.mark.vcr_logs
+def test_evaluator_runner_multiple_evaluators(llmobs, mock_llmobs_eval_metric_writer):
+    evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs)
+    evaluator_runner.evaluators += [
+        DummyEvaluator(llmobs_service=llmobs, label="1"),
+        DummyEvaluator(llmobs_service=llmobs, label="2"),
+        DummyEvaluator(llmobs_service=llmobs, label="3"),
+    ]
+    evaluator_runner.start()
+
+    evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN)
+
+    time.sleep(0.1)
+
+    calls = [call[0][0] for call in mock_llmobs_eval_metric_writer.enqueue.call_args_list]
+    sorted_calls = sorted(calls, key=lambda x: x["label"])
+    assert sorted_calls == [
+        _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="1"),
+        _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="2"),
+        _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="3"),
+    ]
+
+
 def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess):
     env = os.environ.copy()
     pypath = [os.path.dirname(os.path.dirname(os.path.dirname(__file__)))]
