From b20211e1c799b19ab5a95fd93d343b131dd684b5 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Mon, 6 Jan 2025 16:09:11 -0500 Subject: [PATCH] Skip all ragas tests if ragas unavailable --- riotfile.py | 6 +- ...emits_traces_and_evaluations_on_exit.yaml} | 0 ...test_ragas_faithfulness_emits_traces.yaml} | 0 ...agas_faithfulness_submits_evaluation.yaml} | 0 ..._evaluation_on_span_with_custom_keys.yaml} | 0 ...on_on_span_with_question_in_messages.yaml} | 0 ...est_llmobs_ragas_faithfulness_evaluator.py | 405 +++++++++--------- tests/llmobs/test_llmobs_service.py | 2 +- 8 files changed, 204 insertions(+), 209 deletions(-) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml} (100%) diff --git a/riotfile.py b/riotfile.py index 8f36a9ff80b..f274b84bb0a 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2885,11 +2885,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, venvs=[ Venv(pys="3.7"), - Venv( - pys=select_pys(min_version="3.8"), - pkgs={"ragas": "==0.1.21", "langchain": latest}, - env={"RAGAS_AVAILABLE": "True"}, - ), + Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), ], ), Venv( diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index 42a69a4d613..7309d911b31 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -11,215 +11,214 @@ from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt -RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) +pytest.importorskip("ragas", reason="Tests require ragas to be available on user env") def _llm_span_without_io(): return _expected_llmobs_llm_span_event(Span("dummy")) +def test_ragas_evaluator_init(ragas, LLMObs): + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + assert rf_evaluator.llmobs_service == LLMObs + assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness + assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() + + def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, ragas): with pytest.raises(NotImplementedError, match="Failed to load dependencies for `ragas_faithfulness` evaluator"): RagasFaithfulnessEvaluator(LLMObs) -@pytest.mark.skipif(not RAGAS_AVAILABLE, reason="Tests require ragas to be available on user env") -class TestRagasFaithfulnessEvaluator: - def test_ragas_evaluator_init(self, ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs - assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness - assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() - - def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails( - self, ragas, mock_llmobs_submit_evaluation, LLMObs - ): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) - assert failure_msg == "fail_extract_faithfulness_inputs" - assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 - - def test_ragas_faithfulness_has_modified_faithfulness_instance( - self, ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs - ): - """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" - from ragas.llms import BaseRagasLLM - from ragas.metrics import faithfulness - - class FirstDummyLLM(BaseRagasLLM): - def __init__(self): - super().__init__() - - def generate_text(self) -> str: - return "dummy llm" - - def agenerate_text(self) -> str: - return "dummy llm" - - faithfulness.llm = FirstDummyLLM() - - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - - assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" - - class SecondDummyLLM(BaseRagasLLM): - def __init__(self): - super().__init__() - - def generate_text(self, statements) -> str: - raise ValueError("dummy_llm") - - def agenerate_text(self, statements) -> str: - raise ValueError("dummy_llm") - - faithfulness.llm = SecondDummyLLM() - - with pytest.raises(ValueError, match="dummy_llm"): - rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - - @pytest.mark.vcr_logs - def test_ragas_faithfulness_submits_evaluation(self, ragas, LLMObs, mock_llmobs_submit_evaluation): - """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - @pytest.mark.vcr_logs - def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - self, ragas, LLMObs, mock_llmobs_submit_evaluation - ): - """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _llm_span_with_expected_ragas_inputs_in_messages() - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - @pytest.mark.vcr_logs - def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys( - self, ragas, LLMObs, mock_llmobs_submit_evaluation - ): - """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _expected_llmobs_llm_span_event( - Span("dummy"), - prompt={ - "variables": { - "user_input": "Is france part of europe?", - "context_1": "hello, ", - "context_2": "france is ", - "context_3": "part of europe", +def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) + assert failure_msg == "fail_extract_faithfulness_inputs" + assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 + + +def test_ragas_faithfulness_has_modified_faithfulness_instance( + ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs +): + """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" + from ragas.llms import BaseRagasLLM + from ragas.metrics import faithfulness + + class FirstDummyLLM(BaseRagasLLM): + def __init__(self): + super().__init__() + + def generate_text(self) -> str: + return "dummy llm" + + def agenerate_text(self) -> str: + return "dummy llm" + + faithfulness.llm = FirstDummyLLM() + + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + + assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" + + class SecondDummyLLM(BaseRagasLLM): + def __init__(self): + super().__init__() + + def generate_text(self, statements) -> str: + raise ValueError("dummy_llm") + + def agenerate_text(self, statements) -> str: + raise ValueError("dummy_llm") + + faithfulness.llm = SecondDummyLLM() + + with pytest.raises(ValueError, match="dummy_llm"): + rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit_evaluation): + """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( + ragas, LLMObs, mock_llmobs_submit_evaluation +): + """Test that evaluation is submitted for a valid llm span where the last message content is the question""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _llm_span_with_expected_ragas_inputs_in_messages() + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", }, - "_dd_context_variable_keys": ["context_1", "context_2", "context_3"], - "_dd_query_variable_keys": ["user_input"], + ) + ] + ) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): + """Test that evaluation is submitted for a valid llm span where the last message content is the question""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _expected_llmobs_llm_span_event( + Span("dummy"), + prompt={ + "variables": { + "user_input": "Is france part of europe?", + "context_1": "hello, ", + "context_2": "france is ", + "context_3": "part of europe", }, - output_messages=[{"content": "France is indeed part of europe"}], - ) - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - @pytest.mark.vcr_logs - def test_ragas_faithfulness_emits_traces(self, ragas, llmobs, llmobs_events): - """Why are we asserting only 7 spans caught?""" - rf_evaluator = RagasFaithfulnessEvaluator(llmobs) - rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] - ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) - assert len(ragas_spans) == 7 - # check name, io, span kinds match - assert ragas_spans == _expected_ragas_spans() - - # verify the trace structure - root_span = ragas_spans[0] - root_span_id = root_span["span_id"] - assert root_span["parent_id"] == "undefined" - assert root_span["meta"] is not None - assert root_span["meta"]["metadata"] is not None - assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) - assert isinstance(root_span["meta"]["metadata"]["statements"], list) - root_span_trace_id = root_span["trace_id"] - for child_span in ragas_spans[1:]: - assert child_span["trace_id"] == root_span_trace_id - - assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) - assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) - assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) - assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) - - def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( - self, mock_writer_logs, run_python_code_in_subprocess - ): - env = os.environ.copy() - pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] - if "PYTHONPATH" in env: - pypath.append(env["PYTHONPATH"]) - env.update( - { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", - "PYTHONPATH": ":".join(pypath), - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), - "DD_LLMOBS_ML_APP": "unnamed-ml-app", - "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", - "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", - } - ) - out, err, status, pid = run_python_code_in_subprocess( - """ + "_dd_context_variable_keys": ["context_1", "context_2", "context_3"], + "_dd_query_variable_keys": ["user_input"], + }, + output_messages=[{"content": "France is indeed part of europe"}], + ) + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): + """Why are we asserting only 7 spans caught?""" + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) + ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 + # check name, io, span kinds match + assert ragas_spans == _expected_ragas_spans() + + # verify the trace structure + root_span = ragas_spans[0] + root_span_id = root_span["span_id"] + assert root_span["parent_id"] == "undefined" + assert root_span["meta"] is not None + assert root_span["meta"]["metadata"] is not None + assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) + assert isinstance(root_span["meta"]["metadata"]["statements"], list) + root_span_trace_id = root_span["trace_id"] + for child_span in ragas_spans[1:]: + assert child_span["trace_id"] == root_span_trace_id + + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) + + +def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess): + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update( + { + "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), + "DD_SITE": "datad0g.com", + "PYTHONPATH": ":".join(pypath), + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), + "DD_LLMOBS_ML_APP": "unnamed-ml-app", + "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", + "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", + "DD_LLMOBS_AGENTLESS_ENABLED": "true", + } + ) + out, err, status, pid = run_python_code_in_subprocess( + """ import os import time import atexit @@ -230,7 +229,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml" + "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__()) @@ -243,9 +242,9 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( ): LLMObs.enable() LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) - """, - env=env, - ) - assert status == 0, err - assert out == b"" - assert err == b"" +""", + env=env, + ) + assert status == 0, err + assert out == b"" + assert err == b"" diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index f550bf639ac..2e1d5e6035f 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1746,8 +1746,8 @@ async def test_annotation_context_async_nested(llmobs): assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -@pytest.mark.skipif(not RAGAS_AVAILABLE, reason="Test requires ragas to be available on user env") def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): + pytest.importorskip("ragas") with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): dummy_tracer = DummyTracer()