
Commit 5e68823

lievan and lievan authored
chore(llmobs): refactor out ragas base evaluator (#11846)
Creates a `RagasBaseEvaluator` class that ragas evaluators can share. This is a split-out version of [this PR](#11716) that has incorporated changes based on the first round of comments from @Yun-Kim and @Kyle-Verhoog.

This class contains shared logic for:
- checking that ragas dependencies are present, and throwing otherwise
- a `submit_and_evaluate` function, which takes a span event, calls `evaluate` to generate a score, and then submits an evaluation
- extracting the question, contexts, and answer from an LLM span

### Misc changes
- rename tests to specify they are faithfulness-specific tests
- throw when we parse an unsupported evaluator instead of logging a warning

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
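For context, a minimal sketch of how a concrete evaluator would extend the new base class (illustrative only; the subclass name, label, and import path are assumptions, not part of this change):

```python
# Hypothetical subclass; per the base class, only `evaluate` needs implementing.
from typing import Optional, Tuple, Union

# Assumed module path for the base class added in this PR.
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator


class MyFaithfulnessEvaluator(BaseRagasEvaluator):
    LABEL = "my_ragas_faithfulness"  # illustrative label
    METRIC_TYPE = "score"

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        # Shared helper from the base class: pulls question/contexts/answer
        # out of the span event, or returns None if anything is missing.
        inputs = self._extract_evaluation_inputs_from_span(span_event)
        if inputs is None:
            return "fail_extract_evaluation_inputs", None
        # A real evaluator would score the answer with ragas here, e.g. via
        # self.ragas_dependencies.faithfulness; a constant stands in for that.
        return 1.0, {"question": inputs["question"]}
```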
1 parent 1020545 commit 5e68823

11 files changed: +297 -208 lines
Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
import traceback
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.utils.version import parse_version
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX


logger = get_logger(__name__)


class RagasDependencies:
    """
    A helper class to store instances of ragas classes and functions
    that may or may not exist in a user's environment.
    """

    def __init__(self):
        import ragas

        self.ragas_version = parse_version(ragas.__version__)
        if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10):
            raise NotImplementedError(
                "Ragas version: {} is not supported".format(self.ragas_version),
            )

        from ragas.llms import llm_factory

        self.llm_factory = llm_factory

        from ragas.llms.output_parser import RagasoutputParser

        self.RagasoutputParser = RagasoutputParser

        from ragas.metrics import context_precision

        self.context_precision = context_precision

        from ragas.metrics.base import ensembler

        self.ensembler = ensembler

        from ragas.metrics import faithfulness

        self.faithfulness = faithfulness

        from ragas.metrics.base import get_segmenter

        self.get_segmenter = get_segmenter

        from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers

        self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers

        from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers

        self.StatementsAnswers = StatementsAnswers

def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
    """
    The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-<ml_app>`
    or `dd-ragas` if `ml_app` is not present in the span event.
    """
    tags: List[str] = span_event.get("tags", [])
    ml_app = None
    for tag in tags:
        if isinstance(tag, str) and tag.startswith("ml_app:"):
            ml_app = tag.split(":")[1]
            break
    if not ml_app:
        return RAGAS_ML_APP_PREFIX
    return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)

class BaseRagasEvaluator:
    """A class used by EvaluatorRunner to conduct ragas evaluations
    on LLM Observability span events. The job of an Evaluator is to take a span and
    submit evaluation metrics based on the span's attributes.

    Extenders of this class should only need to implement the `evaluate` method.
    """

    LABEL = "ragas"
    METRIC_TYPE = "score"

    def __init__(self, llmobs_service):
        """
        Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans.

        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
            submitting evaluation metrics.

        Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
        """
        self.llmobs_service = llmobs_service
        self.ragas_version = "unknown"
        telemetry_state = "ok"
        try:
            self.ragas_dependencies = RagasDependencies()
            self.ragas_version = self.ragas_dependencies.ragas_version
        except ImportError as e:
            telemetry_state = "fail_import_error"
            raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e
        except AttributeError as e:
            telemetry_state = "fail_attribute_error"
            raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e
        except NotImplementedError as e:
            telemetry_state = "fail_not_supported"
            raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e
        except Exception as e:
            telemetry_state = "fail_unknown"
            raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e
        finally:
            telemetry_writer.add_count_metric(
                namespace=TELEMETRY_APM_PRODUCT.LLMOBS,
                name="evaluators.init",
                value=1,
                tags=(
                    ("evaluator_label", self.LABEL),
                    ("state", telemetry_state),
                    ("evaluator_version", self.ragas_version),
                ),
            )
            if telemetry_state != "ok":
                telemetry_writer.add_log(
                    level=TELEMETRY_LOG_LEVEL.ERROR,
                    message="Failed to import Ragas dependencies",
                    stack_trace=traceback.format_exc(),
                    tags={"evaluator_version": self.ragas_version},
                )

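    # `evaluate` is implemented by subclasses and returns either a float score or a
    # string failure state, plus an optional metadata dict. Failure states are still
    # counted in the `evaluators.run` telemetry metric below, but only float results
    # are submitted as LLM Observability evaluation metrics.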
    def run_and_submit_evaluation(self, span_event: dict):
        if not span_event:
            return
        score_result_or_failure, metric_metadata = self.evaluate(span_event)
        telemetry_writer.add_count_metric(
            TELEMETRY_APM_PRODUCT.LLMOBS,
            "evaluators.run",
            1,
            tags=(
                ("evaluator_label", self.LABEL),
                ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"),
                ("evaluator_version", self.ragas_version),
            ),
        )
        if isinstance(score_result_or_failure, float):
            self.llmobs_service.submit_evaluation(
                span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
                label=self.LABEL,
                metric_type=self.METRIC_TYPE,
                value=score_result_or_failure,
                metadata=metric_metadata,
            )

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        raise NotImplementedError("evaluate method must be implemented by individual evaluators")

    def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]:
        """
        Extracts the question, answer, and context used as inputs for a ragas evaluation on a span event.
        """
        with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow:
            self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event)
            question, answer, contexts = None, None, None

            meta_io = span_event.get("meta")
            if meta_io is None:
                return None

            meta_input = meta_io.get("input")
            meta_output = meta_io.get("output")

            if not (meta_input and meta_output):
                return None

            prompt = meta_input.get("prompt")
            if prompt is None:
                logger.debug("Failed to extract `prompt` from span for ragas evaluation")
                return None
            prompt_variables = prompt.get("variables")

            input_messages = meta_input.get("messages")

            messages = meta_output.get("messages")
            if messages is not None and len(messages) > 0:
                answer = messages[-1].get("content")

            if prompt_variables:
                context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"])
                question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"])
                contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]
                question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)])

            if not question and input_messages is not None and len(input_messages) > 0:
                question = input_messages[-1].get("content")

            self.llmobs_service.annotate(
                span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer}
            )
            if any(field is None for field in (question, contexts, answer)):
                logger.debug("Failed to extract inputs required for ragas evaluation")
                return None

            return {"question": question, "contexts": contexts, "answer": answer}
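For reference (illustrative, not part of this commit): the extraction helper above expects LLM span events shaped roughly as below, taking the question and contexts from prompt variables (falling back to the last input message for the question) and the answer from the last output message. All values here are invented:

# Illustrative span event (values invented) in the shape handled by
# _extract_evaluation_inputs_from_span(); the ml_app tag feeds
# _get_ml_app_for_ragas_trace().
span_event = {
    "trace_id": "123",
    "span_id": "456",
    "tags": ["ml_app:shopist-chatbot"],
    "meta": {
        "input": {
            "prompt": {
                # Defaults to the "question" and "context" variables unless the
                # prompt carries internal keys naming different variables.
                "variables": {
                    "question": "What is our return policy?",
                    "context": "Returns are accepted within 30 days of purchase.",
                },
            },
            "messages": [{"content": "What is our return policy?"}],
        },
        "output": {"messages": [{"content": "You can return items within 30 days."}]},
    },
}

# A subclass such as the hypothetical MyFaithfulnessEvaluator sketched under the
# PR description would then be driven by the evaluator runner roughly as:
#   evaluator = MyFaithfulnessEvaluator(llmobs_service=LLMObs)
#   evaluator.run_and_submit_evaluation(span_event)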
