diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py
new file mode 100644
index 00000000000..23aa4cd3caa
--- /dev/null
+++ b/ddtrace/llmobs/_evaluators/ragas/base.py
@@ -0,0 +1,213 @@
+import traceback
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from ddtrace.internal.logger import get_logger
+from ddtrace.internal.telemetry import telemetry_writer
+from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
+from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
+from ddtrace.internal.utils.version import parse_version
+from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
+from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
+from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX
+
+
+logger = get_logger(__name__)
+
+
+class RagasDependencies:
+    """
+    A helper class to store instances of ragas classes and functions
+    that may or may not exist in a user's environment.
+    """
+
+    def __init__(self):
+        import ragas
+
+        self.ragas_version = parse_version(ragas.__version__)
+        if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10):
+            raise NotImplementedError(
+                "Ragas version: {} is not supported".format(self.ragas_version),
+            )
+
+        from ragas.llms import llm_factory
+
+        self.llm_factory = llm_factory
+
+        from ragas.llms.output_parser import RagasoutputParser
+
+        self.RagasoutputParser = RagasoutputParser
+
+        from ragas.metrics import context_precision
+
+        self.context_precision = context_precision
+
+        from ragas.metrics.base import ensembler
+
+        self.ensembler = ensembler
+
+        from ragas.metrics import faithfulness
+
+        self.faithfulness = faithfulness
+
+        from ragas.metrics.base import get_segmenter
+
+        self.get_segmenter = get_segmenter
+
+        from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers
+
+        self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers
+
+        from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers
+
+        self.StatementsAnswers = StatementsAnswers
+
+
+def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
+    """
+    The `ml_app` for spans generated from ragas traces is `dd-ragas-<ml_app>`,
+    or `dd-ragas` if `ml_app` is not present in the span event.
+    """
+    tags: List[str] = span_event.get("tags", [])
+    ml_app = None
+    for tag in tags:
+        if isinstance(tag, str) and tag.startswith("ml_app:"):
+            ml_app = tag.split(":")[1]
+            break
+    if not ml_app:
+        return RAGAS_ML_APP_PREFIX
+    return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
+
+
+class BaseRagasEvaluator:
+    """A class used by EvaluatorRunner to conduct ragas evaluations
+    on LLM Observability span events. The job of an Evaluator is to take a span and
+    submit evaluation metrics based on the span's attributes.
+
+    Extenders of this class should only need to implement the `evaluate` method.
+    """
+
+    LABEL = "ragas"
+    METRIC_TYPE = "score"
+
+    def __init__(self, llmobs_service):
+        """
+        Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans.
+
+        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
+                               submitting evaluation metrics.
+
+        Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
+ """ + self.llmobs_service = llmobs_service + self.ragas_version = "unknown" + telemetry_state = "ok" + try: + self.ragas_dependencies = RagasDependencies() + self.ragas_version = self.ragas_dependencies.ragas_version + except ImportError as e: + telemetry_state = "fail_import_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except AttributeError as e: + telemetry_state = "fail_attribute_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except NotImplementedError as e: + telemetry_state = "fail_not_supported" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except Exception as e: + telemetry_state = "fail_unknown" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + finally: + telemetry_writer.add_count_metric( + namespace=TELEMETRY_APM_PRODUCT.LLMOBS, + name="evaluators.init", + value=1, + tags=( + ("evaluator_label", self.LABEL), + ("state", telemetry_state), + ("evaluator_version", self.ragas_version), + ), + ) + if telemetry_state != "ok": + telemetry_writer.add_log( + level=TELEMETRY_LOG_LEVEL.ERROR, + message="Failed to import Ragas dependencies", + stack_trace=traceback.format_exc(), + tags={"evaluator_version": self.ragas_version}, + ) + + def run_and_submit_evaluation(self, span_event: dict): + if not span_event: + return + score_result_or_failure, metric_metadata = self.evaluate(span_event) + telemetry_writer.add_count_metric( + TELEMETRY_APM_PRODUCT.LLMOBS, + "evaluators.run", + 1, + tags=( + ("evaluator_label", self.LABEL), + ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), + ("evaluator_version", self.ragas_version), + ), + ) + if isinstance(score_result_or_failure, float): + self.llmobs_service.submit_evaluation( + span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, + label=self.LABEL, + metric_type=self.METRIC_TYPE, + value=score_result_or_failure, + metadata=metric_metadata, + ) + + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: + raise NotImplementedError("evaluate method must be implemented by individual evaluators") + + def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]: + """ + Extracts the question, answer, and context used as inputs for a ragas evaluation on a span event. 
+ """ + with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow: + self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) + question, answer, contexts = None, None, None + + meta_io = span_event.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if not (meta_input and meta_output): + return None + + prompt = meta_input.get("prompt") + if prompt is None: + logger.debug("Failed to extract `prompt` from span for ragas evaluation") + return None + prompt_variables = prompt.get("variables") + + input_messages = meta_input.get("messages") + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + if prompt_variables: + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)] + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) + + if not question and input_messages is not None and len(input_messages) > 0: + question = input_messages[-1].get("content") + + self.llmobs_service.annotate( + span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer} + ) + if any(field is None for field in (question, contexts, answer)): + logger.debug("Failed to extract inputs required for ragas evaluation") + return None + + return {"question": question, "contexts": contexts, "answer": answer} diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index d651c2443a4..98725b1f27e 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -1,73 +1,22 @@ import json import math -import traceback from typing import List from typing import Optional from typing import Tuple from typing import Union from ddtrace.internal.logger import get_logger -from ddtrace.internal.telemetry import telemetry_writer -from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT -from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL -from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA -from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS -from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator +from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace logger = get_logger(__name__) -class MiniRagas: - """ - A helper class to store instances of ragas classes and functions - that may or may not exist in a user's environment. - """ - - llm_factory = None - RagasoutputParser = None - faithfulness = None - ensembler = None - get_segmenter = None - StatementFaithfulnessAnswers = None - StatementsAnswers = None - - -def _get_ml_app_for_ragas_trace(span_event: dict) -> str: - """ - The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` - or `dd-ragas` if `ml_app` is not present in the span event. 
- """ - tags = span_event.get("tags", []) # list[str] - ml_app = None - for tag in tags: - if isinstance(tag, str) and tag.startswith("ml_app:"): - ml_app = tag.split(":")[1] - break - if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) - - -def _get_faithfulness_instance() -> Optional[object]: - """ - This helper function ensures the faithfulness instance used in - ragas evaluator is updated with the latest ragas faithfulness - instance AND has an non-null llm - """ - if MiniRagas.faithfulness is None: - return None - ragas_faithfulness_instance = MiniRagas.faithfulness - if not ragas_faithfulness_instance.llm: - ragas_faithfulness_instance.llm = MiniRagas.llm_factory() - return ragas_faithfulness_instance - - -class RagasFaithfulnessEvaluator: +class RagasFaithfulnessEvaluator(BaseRagasEvaluator): """A class used by EvaluatorRunner to conduct ragas faithfulness evaluations on LLM Observability span events. The job of an Evaluator is to take a span and submit evaluation metrics based on the span's attributes. @@ -95,98 +44,30 @@ def __init__(self, llmobs_service): Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. """ - self.llmobs_service = llmobs_service - self.ragas_version = "unknown" - telemetry_state = "ok" - try: - import ragas - - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): - raise NotImplementedError( - "Ragas version: {} is not supported for `ragas_faithfulness` evaluator".format(self.ragas_version), - ) - - from ragas.llms import llm_factory - - MiniRagas.llm_factory = llm_factory - - from ragas.llms.output_parser import RagasoutputParser - - MiniRagas.RagasoutputParser = RagasoutputParser - - from ragas.metrics import faithfulness - - MiniRagas.faithfulness = faithfulness - - from ragas.metrics.base import ensembler - - MiniRagas.ensembler = ensembler - - from ragas.metrics.base import get_segmenter - - MiniRagas.get_segmenter = get_segmenter - - from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers - - MiniRagas.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers - - from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers - - MiniRagas.StatementsAnswers = StatementsAnswers - except Exception as e: - telemetry_state = "fail" - telemetry_writer.add_log( - level=TELEMETRY_LOG_LEVEL.ERROR, - message="Failed to import Ragas dependencies", - stack_trace=traceback.format_exc(), - tags={"ragas_version": self.ragas_version}, - ) - raise NotImplementedError("Failed to load dependencies for `ragas_faithfulness` evaluator") from e - finally: - telemetry_writer.add_count_metric( - namespace=TELEMETRY_APM_PRODUCT.LLMOBS, - name="evaluators.init", - value=1, - tags=( - ("evaluator_label", self.LABEL), - ("state", telemetry_state), - ("ragas_version", self.ragas_version), - ), - ) - - self.ragas_faithfulness_instance = _get_faithfulness_instance() - self.llm_output_parser_for_generated_statements = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementsAnswers + super().__init__(llmobs_service) + self.ragas_faithfulness_instance = self._get_faithfulness_instance() + self.llm_output_parser_for_generated_statements = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementsAnswers ) - self.llm_output_parser_for_faithfulness_score = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementFaithfulnessAnswers + 
+        self.llm_output_parser_for_faithfulness_score = self.ragas_dependencies.RagasoutputParser(
+            pydantic_object=self.ragas_dependencies.StatementFaithfulnessAnswers
         )
 
-        self.split_answer_into_sentences = MiniRagas.get_segmenter(
+        self.split_answer_into_sentences = self.ragas_dependencies.get_segmenter(
             language=self.ragas_faithfulness_instance.nli_statements_message.language, clean=False
         )
 
-    def run_and_submit_evaluation(self, span_event: dict):
-        if not span_event:
-            return
-        score_result_or_failure, metric_metadata = self.evaluate(span_event)
-        telemetry_writer.add_count_metric(
-            TELEMETRY_APM_PRODUCT.LLMOBS,
-            "evaluators.run",
-            1,
-            tags=(
-                ("evaluator_label", self.LABEL),
-                ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"),
-            ),
-        )
-        if isinstance(score_result_or_failure, float):
-            self.llmobs_service.submit_evaluation(
-                span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
-                label=RagasFaithfulnessEvaluator.LABEL,
-                metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE,
-                value=score_result_or_failure,
-                metadata=metric_metadata,
-            )
+    def _get_faithfulness_instance(self) -> Optional[object]:
+        """
+        This helper ensures that the faithfulness instance used by the
+        ragas evaluator is the latest ragas faithfulness instance
+        and has a non-null llm.
+        """
+        if self.ragas_dependencies.faithfulness is None:
+            return None
+        ragas_faithfulness_instance = self.ragas_dependencies.faithfulness
+        if not ragas_faithfulness_instance.llm:
+            ragas_faithfulness_instance.llm = self.ragas_dependencies.llm_factory()
+        return ragas_faithfulness_instance
 
     def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
         """
@@ -196,7 +77,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
         If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()` method
         from ragas which defaults to openai's gpt-4o-turbo.
""" - self.ragas_faithfulness_instance = _get_faithfulness_instance() + self.ragas_faithfulness_instance = self._get_faithfulness_instance() if not self.ragas_faithfulness_instance: return "fail_faithfulness_is_none", {} @@ -220,16 +101,16 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] span=ragas_faithfulness_workflow ) - faithfulness_inputs = self._extract_faithfulness_inputs(span_event) + faithfulness_inputs = self._extract_evaluation_inputs_from_span(span_event) if faithfulness_inputs is None: logger.debug( - "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" + "Failed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation" ) return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] - context = faithfulness_inputs["context"] + context = " ".join(faithfulness_inputs["contexts"]) statements = self._create_statements(question, answer) if statements is None: @@ -318,9 +199,9 @@ def _create_verdicts(self, context: str, statements: List[str]): return None # collapse multiple generations into a single faithfulness list - faithfulness_list = MiniRagas.ensembler.from_discrete(raw_faithfulness_list, "verdict") # type: ignore + faithfulness_list = self.ragas_dependencies.ensembler.from_discrete(raw_faithfulness_list, "verdict") try: - return MiniRagas.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) # type: ignore + return self.ragas_dependencies.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) except Exception as e: logger.debug("Failed to parse faithfulness_list", exc_info=e) return None @@ -330,59 +211,6 @@ def _create_verdicts(self, context: str, statements: List[str]): output_data=faithfulness_list, ) - def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: - """ - Extracts the question, answer, and context used as inputs to faithfulness - evaluation from a span event. 
-
-        question - input.prompt.variables.question OR input.messages[-1].content
-        context - input.prompt.variables.context
-        answer - output.messages[-1].content
-        """
-        with self.llmobs_service.workflow("dd-ragas.extract_faithfulness_inputs") as extract_inputs_workflow:
-            self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event)
-            question, answer, context = None, None, None
-
-            meta_io = span_event.get("meta")
-            if meta_io is None:
-                return None
-
-            meta_input = meta_io.get("input")
-            meta_output = meta_io.get("output")
-
-            if not (meta_input and meta_output):
-                return None
-
-            prompt = meta_input.get("prompt")
-            if prompt is None:
-                logger.debug("Failed to extract `prompt` from span for `ragas_faithfulness` evaluation")
-                return None
-            prompt_variables = prompt.get("variables")
-
-            input_messages = meta_input.get("messages")
-
-            messages = meta_output.get("messages")
-            if messages is not None and len(messages) > 0:
-                answer = messages[-1].get("content")
-
-            if prompt_variables:
-                context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"])
-                question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"])
-                context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)])
-                question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)])
-
-            if not question and input_messages is not None and len(input_messages) > 0:
-                question = input_messages[-1].get("content")
-
-            self.llmobs_service.annotate(
-                span=extract_inputs_workflow, output_data={"question": question, "context": context, "answer": answer}
-            )
-            if any(field is None for field in (question, context, answer)):
-                logger.debug("Failed to extract inputs required for faithfulness evaluation")
-                return None
-
-            return {"question": question, "context": context, "answer": answer}
-
     def _create_statements_prompt(self, answer, question):
         # Returns: `ragas.llms.PromptValue` object
         with self.llmobs_service.task("dd-ragas.create_statements_prompt"):
diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py
index bf45e618e01..3d26998f1b4 100644
--- a/ddtrace/llmobs/_evaluators/runner.py
+++ b/ddtrace/llmobs/_evaluators/runner.py
@@ -64,13 +64,15 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
                             ("state", evaluator_init_state),
                         ),
                     )
+            else:
+                raise ValueError("Parsed unsupported evaluator: {}".format(evaluator))
 
     def start(self, *args, **kwargs):
         if not self.evaluators:
             logger.debug("no evaluators configured, not starting %r", self.__class__.__name__)
             return
         super(EvaluatorRunner, self).start()
-        logger.debug("started %r to %r", self.__class__.__name__)
+        logger.debug("started %r", self.__class__.__name__)
 
     def _stop_service(self) -> None:
         """
diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py
index 0ecdde36ee6..32bbce849db 100644
--- a/tests/llmobs/_utils.py
+++ b/tests/llmobs/_utils.py
@@ -553,7 +553,46 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id):
     )
 
 
-def _expected_ragas_spans(ragas_inputs=None):
+def _expected_ragas_context_precision_spans(ragas_inputs=None):
+    if not ragas_inputs:
+        ragas_inputs = default_ragas_inputs
+    return [
+        {
+            "trace_id": mock.ANY,
+            "span_id": mock.ANY,
+            "parent_id": "undefined",
+            "name": "dd-ragas.context_precision",
+            "start_ns": mock.ANY,
+            "duration": mock.ANY,
+            "status": "ok",
+            "meta": {
+                "span.kind": "workflow",
+                "input": {"value": mock.ANY},
+                "output": {"value": "1.0"},
+            },
+            "metrics": {},
+            "tags": expected_ragas_trace_tags(),
+        },
+        {
+            "trace_id": mock.ANY,
+            "span_id": mock.ANY,
+            "parent_id": mock.ANY,
+            "name": "dd-ragas.extract_evaluation_inputs_from_span",
+            "start_ns": mock.ANY,
+            "duration": mock.ANY,
+            "status": "ok",
+            "meta": {
+                "span.kind": "workflow",
+                "input": {"value": mock.ANY},
+                "output": {"value": mock.ANY},
+            },
+            "metrics": {},
+            "tags": expected_ragas_trace_tags(),
+        },
+    ]
+
+
+def _expected_ragas_faithfulness_spans(ragas_inputs=None):
     if not ragas_inputs:
         ragas_inputs = default_ragas_inputs
     return [
@@ -581,7 +620,7 @@ def _expected_ragas_spans(ragas_inputs=None):
             "trace_id": mock.ANY,
             "span_id": mock.ANY,
             "parent_id": mock.ANY,
-            "name": "dd-ragas.extract_faithfulness_inputs",
+            "name": "dd-ragas.extract_evaluation_inputs_from_span",
             "start_ns": mock.ANY,
             "duration": mock.ANY,
             "status": "ok",
diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml
similarity index 100%
rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml
rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml
diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml
similarity index 100%
rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml
rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml
diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml
similarity index 100%
rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml
rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml
diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml
similarity index 100%
rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml
rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml
diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml
similarity index 100%
rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml
rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml
diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py
index 128c4639946..eb0be25c91b 100644
--- a/tests/llmobs/test_llmobs_evaluator_runner.py
+++ b/tests/llmobs/test_llmobs_evaluator_runner.py
@@ -22,7 +22,7 @@ def test_evaluator_runner_start(mock_evaluator_logs):
     evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
     evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock()))
     evaluator_runner.start()
-    mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r to %r", "EvaluatorRunner")])
+    mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r", "EvaluatorRunner")])
 
 
 def test_evaluator_runner_buffer_limit(mock_evaluator_logs):
@@ -99,6 +99,12 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces
     assert err == b""
 
 
+def test_evaluator_runner_unsupported_evaluator():
+    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+        with pytest.raises(ValueError):
+            EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
+
+
 def test_evaluator_runner_sampler_single_rule(monkeypatch):
     monkeypatch.setenv(
         EvaluatorRunnerSampler.SAMPLING_RULES_ENV_VAR,
diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_evaluators.py
similarity index 97%
rename from tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
rename to tests/llmobs/test_llmobs_ragas_evaluators.py
index 39e315b37e4..9df6c392470 100644
--- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
+++ b/tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -6,7 +6,7 @@
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
 from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
-from tests.llmobs._utils import _expected_ragas_spans
+from tests.llmobs._utils import _expected_ragas_faithfulness_spans
 from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages
 from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt
 
@@ -177,7 +177,8 @@ def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events):
     ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"])
     assert len(ragas_spans) == 7
     # check name, io, span kinds match
-    assert ragas_spans == _expected_ragas_spans()
+    assert ragas_spans == _expected_ragas_faithfulness_spans()
+
     # verify the trace structure
     root_span = ragas_spans[0]
     root_span_id = root_span["span_id"]
@@ -212,7 +213,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
             "DD_LLMOBS_ML_APP": "unnamed-ml-app",
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
             "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
-            "DD_LLMOBS_AGENTLESS_ENABLED": "true",
+            "DD_LLMOBS_AGENTLESS_ENABLED": "1",
         }
     )
     out, err, status, pid = run_python_code_in_subprocess(
@@ -227,7 +228,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
 from tests.llmobs._utils import logs_vcr
 
 ctx = logs_vcr.use_cassette(
-    "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml"
"tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__())