Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(llmobs): update ragas trace ml app #11952

Merged
merged 20 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ddtrace/llmobs/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@
# Used to differentiate traces of Datadog-run operations vs user-application operations.
RUNNER_IS_INTEGRATION_SPAN_TAG = "runner.integration"

# The ml app of all ragas traces have this prefix that we use to detect
# whether a span is generated from the ragas evaluation itself.
RAGAS_ML_APP_PREFIX = "dd-ragas"
# All ragas traces have this context item set so we can differentiate
# spans generated from the ragas integration vs user application spans.
IS_EVALUATION_SPAN = "_ml_obs.evaluation_span"

ANNOTATIONS_CONTEXT_ID = "annotations_context_id"
INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys"
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -84,6 +85,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_ar_workflow:
ragas_ar_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)

Expand Down
6 changes: 2 additions & 4 deletions ddtrace/llmobs/_evaluators/ragas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from typing import Tuple
from typing import Union

from ddtrace import config
from ddtrace.internal.logger import get_logger
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.telemetry.constants import TELEMETRY_NAMESPACE
from ddtrace.internal.utils.version import parse_version
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX


logger = get_logger(__name__)
Expand Down Expand Up @@ -94,9 +94,7 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
if isinstance(tag, str) and tag.startswith("ml_app:"):
ml_app = tag.split(":")[1]
break
if not ml_app:
return RAGAS_ML_APP_PREFIX
return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
return ml_app or config._llmobs_ml_app or "unknown-ml-app"


class BaseRagasEvaluator:
Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/context_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -82,6 +83,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_cp_workflow:
ragas_cp_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)

Expand Down
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace

Expand Down Expand Up @@ -96,6 +97,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
with self.llmobs_service.workflow(
"dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_faithfulness_workflow:
ragas_faithfulness_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(
span=ragas_faithfulness_workflow
Expand Down
35 changes: 12 additions & 23 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import ddtrace
Expand Down Expand Up @@ -59,6 +58,7 @@
from ddtrace.llmobs._utils import _get_session_id
from ddtrace.llmobs._utils import _get_span_name
from ddtrace.llmobs._utils import _inject_llmobs_parent_id
from ddtrace.llmobs._utils import _is_evaluation_span
from ddtrace.llmobs._utils import safe_json
from ddtrace.llmobs._utils import validate_prompt
from ddtrace.llmobs._writer import LLMObsEvalMetricWriter
Expand Down Expand Up @@ -121,23 +121,21 @@ def _on_span_finish(self, span):
def _submit_llmobs_span(self, span: Span) -> None:
"""Generate and submit an LLMObs span event to be sent to LLMObs."""
span_event = None
is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm"
is_ragas_integration_span = False
try:
span_event, is_ragas_integration_span = self._llmobs_span_event(span)
span_event = self._llmobs_span_event(span)
self._llmobs_span_writer.enqueue(span_event)
except (KeyError, TypeError):
log.error(
"Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True
)
finally:
if not span_event or not is_llm_span or is_ragas_integration_span:
if not span_event or not span._get_ctx_item(SPAN_KIND) == "llm" or _is_evaluation_span(span):
return
if self._evaluator_runner:
self._evaluator_runner.enqueue(span_event, span)

@classmethod
def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
def _llmobs_span_event(cls, span: Span) -> Dict[str, Any]:
"""Span event object structure."""
span_kind = span._get_ctx_item(SPAN_KIND)
if not span_kind:
Expand Down Expand Up @@ -184,11 +182,6 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
metrics = span._get_ctx_item(METRICS) or {}
ml_app = _get_ml_app(span)

is_ragas_integration_span = False

if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX):
is_ragas_integration_span = True

span._set_ctx_item(ML_APP, ml_app)
parent_id = str(_get_llmobs_parent_id(span) or "undefined")

Expand All @@ -208,20 +201,16 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
span._set_ctx_item(SESSION_ID, session_id)
llmobs_span_event["session_id"] = session_id

llmobs_span_event["tags"] = cls._llmobs_tags(
span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span
)
llmobs_span_event["tags"] = cls._llmobs_tags(span, ml_app, session_id)

span_links = span._get_ctx_item(SPAN_LINKS)
if isinstance(span_links, list):
llmobs_span_event["span_links"] = span_links

return llmobs_span_event, is_ragas_integration_span
Yun-Kim marked this conversation as resolved.
Show resolved Hide resolved
return llmobs_span_event

@staticmethod
def _llmobs_tags(
span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False
) -> List[str]:
def _llmobs_tags(span: Span, ml_app: str, session_id: Optional[str] = None) -> List[str]:
tags = {
"version": config.version or "",
"env": config.env or "",
Expand All @@ -237,7 +226,7 @@ def _llmobs_tags(
tags["error_type"] = err_type
if session_id:
tags["session_id"] = session_id
if is_ragas_integration_span:
if _is_evaluation_span(span):
tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas"
existing_tags = span._get_ctx_item(TAGS)
if existing_tags is not None:
Expand Down Expand Up @@ -276,10 +265,6 @@ def _start_service(self) -> None:
log.debug("Error starting evaluator runner")

def _stop_service(self) -> None:
# Remove listener hooks for span events
core.reset_listeners("trace.span_start", self._on_span_start)
core.reset_listeners("trace.span_finish", self._on_span_finish)

try:
self._evaluator_runner.stop()
# flush remaining evaluation spans & evaluations
Expand All @@ -294,6 +279,10 @@ def _stop_service(self) -> None:
except ServiceStatusError:
log.debug("Error stopping LLMObs writers")

# Remove listener hooks for span events
Yun-Kim marked this conversation as resolved.
Show resolved Hide resolved
core.reset_listeners("trace.span_start", self._on_span_start)
core.reset_listeners("trace.span_finish", self._on_span_finish)

forksafe.unregister(self._child_after_fork)

@classmethod
Expand Down
18 changes: 18 additions & 0 deletions ddtrace/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ddtrace.llmobs._constants import GEMINI_APM_SPAN_NAME
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import LANGCHAIN_APM_SPAN_NAME
from ddtrace.llmobs._constants import ML_APP
from ddtrace.llmobs._constants import NAME
Expand Down Expand Up @@ -128,6 +129,23 @@ def _get_span_name(span: Span) -> str:
return span._get_ctx_item(NAME) or span.name


def _is_evaluation_span(span: Span) -> bool:
"""
Return whether or not a span is an evaluation span by checking the span's
nearest LLMObs span ancestor. Default to 'False'
"""
is_evaluation_span = span._get_ctx_item(IS_EVALUATION_SPAN)
if is_evaluation_span:
return is_evaluation_span
llmobs_parent = _get_nearest_llmobs_ancestor(span)
while llmobs_parent:
is_evaluation_span = llmobs_parent._get_ctx_item(IS_EVALUATION_SPAN)
if is_evaluation_span:
return is_evaluation_span
lievan marked this conversation as resolved.
Show resolved Hide resolved
llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent)
return False


def _get_ml_app(span: Span) -> str:
"""
Return the ML app name for a given span, by checking the span's nearest LLMObs span ancestor.
Expand Down
2 changes: 1 addition & 1 deletion tests/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def expected_ragas_trace_tags():
"env:",
"service:tests.llmobs",
"source:integration",
"ml_app:dd-ragas-unnamed-ml-app",
"ml_app:unnamed-ml-app",
"ddtrace.version:{}".format(ddtrace.__version__),
"language:python",
"error:0",
Expand Down
8 changes: 5 additions & 3 deletions tests/llmobs/test_llmobs_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ddtrace.llmobs._constants import INPUT_PARAMETERS
from ddtrace.llmobs._constants import INPUT_PROMPT
from ddtrace.llmobs._constants import INPUT_VALUE
from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
from ddtrace.llmobs._constants import METADATA
from ddtrace.llmobs._constants import METRICS
from ddtrace.llmobs._constants import MODEL_NAME
Expand All @@ -24,7 +25,6 @@
from ddtrace.llmobs._constants import OUTPUT_MESSAGES
from ddtrace.llmobs._constants import OUTPUT_VALUE
from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX
from ddtrace.llmobs._constants import SESSION_ID
from ddtrace.llmobs._constants import SPAN_KIND
from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
Expand Down Expand Up @@ -1538,8 +1538,10 @@ def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner):


def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs):
with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)):
pass
with llmobs.agent(name="test") as agent:
agent._set_ctx_item(IS_EVALUATION_SPAN, True)
with llmobs.llm(model_name="test_model"):
pass
time.sleep(0.1)
assert llmobs._instance._evaluator_runner.enqueue.call_count == 0

Expand Down
Loading