From b4cf04edd29d41f5a79ea39e373f3e796950b811 Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 09:59:28 -0500 Subject: [PATCH 1/5] fix ragas version tagging --- ddtrace/llmobs/_evaluators/ragas/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py index 17cf5807af0..798c8e2fccc 100644 --- a/ddtrace/llmobs/_evaluators/ragas/base.py +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -26,8 +26,10 @@ class RagasDependencies: def __init__(self): import ragas - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): + self.ragas_version = ragas.__version__ # type: str + + parsed_version = parse_version(ragas.__version__) + if parsed_version >= (0, 2, 0) or parsed_version < (0, 1, 10): raise NotImplementedError( "Ragas version: {} is not supported".format(self.ragas_version), ) From 1adc1276e1651131c71b0846140bb9c77272cd38 Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 10:46:46 -0500 Subject: [PATCH 2/5] fix behavior for multiple evaluator runners --- ddtrace/llmobs/_evaluators/runner.py | 11 +++------- tests/llmobs/_utils.py | 11 +++++----- tests/llmobs/test_llmobs_evaluator_runner.py | 23 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index 6d23af647ea..64a50baf6cf 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -117,14 +117,9 @@ def periodic(self, _wait_sync=False) -> None: try: if not _wait_sync: for evaluator in self.evaluators: - self.executor.map( - lambda span_event: evaluator.run_and_submit_evaluation(span_event), - [ - span_event - for span_event, span in span_events_and_spans - if self.sampler.sample(evaluator.LABEL, span) - ], - ) + for span_event, span in span_events_and_spans: + if self.sampler.sample(evaluator.LABEL, span): + self.executor.submit(evaluator.run_and_submit_evaluation, span_event) else: for evaluator in self.evaluators: for span_event, span in span_events_and_spans: diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 8049245dbc1..084c3ec327d 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -531,28 +531,27 @@ def _llm_span_with_expected_ragas_inputs_in_messages(ragas_inputs=None): class DummyEvaluator: - LABEL = "dummy" - - def __init__(self, llmobs_service): + def __init__(self, llmobs_service, label="dummy"): self.llmobs_service = llmobs_service + self.LABEL = label def run_and_submit_evaluation(self, span): self.llmobs_service.submit_evaluation( span_context=span, - label=DummyEvaluator.LABEL, + label=self.LABEL, value=1.0, metric_type="score", ) -def _dummy_evaluator_eval_metric_event(span_id, trace_id): +def _dummy_evaluator_eval_metric_event(span_id, trace_id, label=None): return LLMObsEvaluationMetricEvent( join_on={"span": {"span_id": span_id, "trace_id": trace_id}}, score_value=1.0, ml_app="unnamed-ml-app", timestamp_ms=mock.ANY, metric_type="score", - label=DummyEvaluator.LABEL, + label=label or DummyEvaluator().LABEL, tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"], ) diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 96104bb19be..eaf381367d0 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -59,6 +59,29 @@ def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_me ) +@pytest.mark.vcr_logs +def test_evaluator_runner_multiple_evaluators(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators += [ + DummyEvaluator(llmobs_service=llmobs, label="1"), + DummyEvaluator(llmobs_service=llmobs, label="2"), + DummyEvaluator(llmobs_service=llmobs, label="3"), + ] + evaluator_runner.start() + + evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) + + time.sleep(0.1) + + calls = [call[0][0] for call in mock_llmobs_eval_metric_writer.enqueue.call_args_list] + sorted_calls = sorted(calls, key=lambda x: x["label"]) + assert sorted_calls == [ + _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="1"), + _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="2"), + _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234", label="3"), + ] + + def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(__file__)))] From 2fc38af2723b704a87f38e5703bfda9175067354 Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 10:53:01 -0500 Subject: [PATCH 3/5] remove accident change --- ddtrace/llmobs/_evaluators/ragas/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py index 798c8e2fccc..17cf5807af0 100644 --- a/ddtrace/llmobs/_evaluators/ragas/base.py +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -26,10 +26,8 @@ class RagasDependencies: def __init__(self): import ragas - self.ragas_version = ragas.__version__ # type: str - - parsed_version = parse_version(ragas.__version__) - if parsed_version >= (0, 2, 0) or parsed_version < (0, 1, 10): + self.ragas_version = parse_version(ragas.__version__) + if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): raise NotImplementedError( "Ragas version: {} is not supported".format(self.ragas_version), ) From c39c4f954c5989ae5b825d42cb299065de5961da Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 11:08:32 -0500 Subject: [PATCH 4/5] fix test --- tests/llmobs/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 084c3ec327d..3583516538c 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -551,7 +551,7 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id, label=None): ml_app="unnamed-ml-app", timestamp_ms=mock.ANY, metric_type="score", - label=label or DummyEvaluator().LABEL, + label=label or "dummy", tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"], ) From 2dace985c5a18575fb00b2b2a4f2df83fe658cd1 Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 14:38:52 -0500 Subject: [PATCH 5/5] remove dupe code --- ddtrace/llmobs/_evaluators/runner.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index 64a50baf6cf..056a80000e4 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -115,15 +115,12 @@ def periodic(self, _wait_sync=False) -> None: self._buffer = [] try: - if not _wait_sync: - for evaluator in self.evaluators: - for span_event, span in span_events_and_spans: - if self.sampler.sample(evaluator.LABEL, span): + for evaluator in self.evaluators: + for span_event, span in span_events_and_spans: + if self.sampler.sample(evaluator.LABEL, span): + if not _wait_sync: self.executor.submit(evaluator.run_and_submit_evaluation, span_event) - else: - for evaluator in self.evaluators: - for span_event, span in span_events_and_spans: - if self.sampler.sample(evaluator.LABEL, span): + else: evaluator.run_and_submit_evaluation(span_event) except RuntimeError as e: logger.debug("failed to run evaluation: %s", e)