Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(llmobs): update ragas trace ml app #11952

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
6 changes: 3 additions & 3 deletions ddtrace/llmobs/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@
# Used to differentiate traces of Datadog-run operations vs user-application operations.
RUNNER_IS_INTEGRATION_SPAN_TAG = "runner.integration"

# The ml app of all ragas traces have this prefix that we use to detect
# whether a span is generated from the ragas evaluation itself.
RAGAS_ML_APP_PREFIX = "dd-ragas"
# All ragas traces have this context item set so we can differentiate
# spans generated from the ragas integration vs user application spans.
IS_EVALUATION_SPAN = "_is_evaluation_span"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
IS_EVALUATION_SPAN = "_is_evaluation_span"
IS_EVALUATION_SPAN = "_ml_obs.evaluation_span"

Let's use the _ml_obs prefix to make it clear this is coming from us (for future reviewers/users)


ANNOTATIONS_CONTEXT_ID = "annotations_context_id"
INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys"
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -84,6 +85,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_ar_workflow:
ragas_ar_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)

Expand Down
6 changes: 2 additions & 4 deletions ddtrace/llmobs/_evaluators/ragas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from typing import Tuple
from typing import Union

from ddtrace import config
from ddtrace.internal.logger import get_logger
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.version import parse_version
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX


logger = get_logger(__name__)
Expand Down Expand Up @@ -94,9 +94,7 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
if isinstance(tag, str) and tag.startswith("ml_app:"):
ml_app = tag.split(":")[1]
break
if not ml_app:
return RAGAS_ML_APP_PREFIX
return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
return ml_app or config._llmobs_ml_app or "unknown-ml-app"


class BaseRagasEvaluator:
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/context_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -82,6 +83,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_cp_workflow:
ragas_cp_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)

Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -96,6 +97,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_faithfulness_workflow:
ragas_faithfulness_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(
span=ragas_faithfulness_workflow
Expand Down
26 changes: 13 additions & 13 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from ddtrace.llmobs._constants import INPUT_PARAMETERS
from ddtrace.llmobs._constants import INPUT_PROMPT
from ddtrace.llmobs._constants import INPUT_VALUE
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import ML_APP
Expand All @@ -59,6 +60,7 @@
from ddtrace.llmobs._utils import _get_session_id
from ddtrace.llmobs._utils import _get_span_name
from ddtrace.llmobs._utils import _inject_llmobs_parent_id
from ddtrace.llmobs._utils import _is_evaluation_span
from ddtrace.llmobs._utils import safe_json
from ddtrace.llmobs._utils import validate_prompt
from ddtrace.llmobs._writer import LLMObsEvalMetricWriter
Expand Down Expand Up @@ -123,16 +125,16 @@ def _submit_llmobs_span(self, span: Span) -> None:
"""Generate and submit an LLMObs span event to be sent to LLMObs."""
span_event = None
is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm"
is_ragas_integration_span = False
is_evaluation_span = False
try:
span_event, is_ragas_integration_span = self._llmobs_span_event(span)
span_event, is_evaluation_span = self._llmobs_span_event(span)
self._llmobs_span_writer.enqueue(span_event)
except (KeyError, TypeError):
log.error(
"Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True
)
finally:
if not span_event or not is_llm_span or is_ragas_integration_span:
if not span_event or not is_llm_span or is_evaluation_span:
return
if self._evaluator_runner:
self._evaluator_runner.enqueue(span_event, span)
Expand Down Expand Up @@ -185,10 +187,8 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
metrics = span._get_ctx_item(METRICS) or {}
ml_app = _get_ml_app(span)

is_ragas_integration_span = False

if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX):
is_ragas_integration_span = True
is_evaluation_span = _is_evaluation_span(span)
span._set_ctx_item(IS_EVALUATION_SPAN, is_evaluation_span)
Comment on lines +190 to +191
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this check in this function scope?


span._set_ctx_item(ML_APP, ml_app)
parent_id = str(_get_llmobs_parent_id(span) or "undefined")
Expand All @@ -210,9 +210,9 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
llmobs_span_event["session_id"] = session_id

llmobs_span_event["tags"] = cls._llmobs_tags(
span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span
span, ml_app, session_id, is_ragas_integration_span=is_evaluation_span
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like we don't necessarily need to pass this information in to _llmobs_tags() or even return it from this function. Thoughts on just doing the checks individually when needed instead of passing it around?

)
return llmobs_span_event, is_ragas_integration_span
return llmobs_span_event, is_evaluation_span

@staticmethod
def _llmobs_tags(
Expand Down Expand Up @@ -272,10 +272,6 @@ def _start_service(self) -> None:
log.debug("Error starting evaluator runner")

def _stop_service(self) -> None:
# Remove listener hooks for span events
core.reset_listeners("trace.span_start", self._on_span_start)
core.reset_listeners("trace.span_finish", self._on_span_finish)

try:
self._evaluator_runner.stop()
# flush remaining evaluation spans & evaluations
Expand All @@ -290,6 +286,10 @@ def _stop_service(self) -> None:
except ServiceStatusError:
log.debug("Error stopping LLMObs writers")

# Remove listener hooks for span events
Copy link
Contributor Author

@lievan lievan Jan 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this makes sure hooks aren't removed prematurely while ragas is still running as triggered by self._evaluator_runner.stop()

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this triggering any bugs?

core.reset_listeners("trace.span_start", self._on_span_start)
core.reset_listeners("trace.span_finish", self._on_span_finish)

forksafe.unregister(self._child_after_fork)

@classmethod
Expand Down
18 changes: 18 additions & 0 deletions ddtrace/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ddtrace.llmobs._constants import GEMINI_APM_SPAN_NAME
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import LANGCHAIN_APM_SPAN_NAME
from ddtrace.llmobs._constants import ML_APP
from ddtrace.llmobs._constants import OPENAI_APM_SPAN_NAME
Expand Down Expand Up @@ -127,6 +128,23 @@ def _get_span_name(span: Span) -> str:
return span.name


def _is_evaluation_span(span: Span) -> bool:
"""
Return whether or not a span is an evaluation span by checking the span's
nearest LLMObs span ancestor. Default to 'False'
"""
is_evaluation_span = span._get_ctx_item(IS_EVALUATION_SPAN)
if is_evaluation_span is not None:
return is_evaluation_span
llmobs_parent = _get_nearest_llmobs_ancestor(span)
while llmobs_parent:
is_evaluation_span = llmobs_parent._get_ctx_item(IS_EVALUATION_SPAN)
if is_evaluation_span is not None:
return is_evaluation_span
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's do a bool check to be defensive here

llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent)
return is_evaluation_span or False


def _get_ml_app(span: Span) -> str:
"""
Return the ML app name for a given span, by checking the span's nearest LLMObs span ancestor.
Expand Down
2 changes: 1 addition & 1 deletion tests/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def expected_ragas_trace_tags():
"env:",
"service:tests.llmobs",
"source:integration",
"ml_app:dd-ragas-unnamed-ml-app",
"ml_app:unnamed-ml-app",
"ddtrace.version:{}".format(ddtrace.__version__),
"language:python",
"error:0",
Expand Down
8 changes: 5 additions & 3 deletions tests/llmobs/test_llmobs_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ddtrace.llmobs._constants import INPUT_PARAMETERS
from ddtrace.llmobs._constants import INPUT_PROMPT
from ddtrace.llmobs._constants import INPUT_VALUE
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import MODEL_NAME
Expand All @@ -24,7 +25,6 @@
from ddtrace.llmobs._constants import OUTPUT_MESSAGES
from ddtrace.llmobs._constants import OUTPUT_VALUE
from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX
from ddtrace.llmobs._constants import SESSION_ID
from ddtrace.llmobs._constants import SPAN_KIND
from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
Expand Down Expand Up @@ -1538,8 +1538,10 @@ def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner):


def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs):
with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)):
pass
with llmobs.agent(name="test") as agent:
agent._set_ctx_item(IS_EVALUATION_SPAN, True)
with llmobs.llm(model_name="test_model"):
pass
time.sleep(0.1)
assert llmobs._instance._evaluator_runner.enqueue.call_count == 0

Expand Down
Loading