feat(llmobs): support joining custom evaluations via tags #11535

Merged: 27 commits, merged Jan 10, 2025
Changes from 7 commits
160 changes: 158 additions & 2 deletions ddtrace/llmobs/_llmobs.py
@@ -4,6 +4,7 @@
from typing import Any
from typing import Dict
from typing import Optional
from typing import Tuple
from typing import Union

import ddtrace
@@ -22,6 +23,7 @@
from ddtrace.internal.service import ServiceStatusError
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.internal.utils.formats import asbool
from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID
from ddtrace.llmobs._constants import INPUT_DOCUMENTS
@@ -58,6 +60,7 @@
from ddtrace.llmobs.utils import ExportedLLMObsSpan
from ddtrace.llmobs.utils import Messages
from ddtrace.propagation.http import HTTPPropagator
from ddtrace.vendor.debtcollector import deprecate


log = get_logger(__name__)
@@ -774,6 +777,153 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None:
return
span.set_tag_str(METRICS, safe_json(metrics))

@classmethod
def submit_evaluation_for(
cls,
label: str,
metric_type: str,
value: Union[str, int, float],
span: Optional[dict] = None,
span_with_tag: Optional[Tuple[str, str]] = None,
tags: Optional[Dict[str, str]] = None,
ml_app: Optional[str] = None,
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
) -> None:
"""
Submits a custom evaluation metric for a given span.

:param str label: The name of the evaluation metric.
:param str metric_type: The type of the evaluation metric. One of "categorical", "score".
:param value: The value of the evaluation metric.
Must be a string (categorical), integer (score), or float (score).
:param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
the span associated with this evaluation.
:param tuple span_with_tag: A tuple of shape (tag_key, tag_value) uniquely identifying
the span associated with this evaluation.
:param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
        :param str ml_app: The name of the ML application.
:param int timestamp_ms: The timestamp in milliseconds when the evaluation metric result was generated.
:param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
evaluation metric.
"""
if cls.enabled is False:
            log.warning(
                "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. "
                "Evaluation metric data will not be sent."
            )
return
if not config._dd_api_key:
log.warning(
"DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. "
"Ensure this configuration is set before running your application."
)
return

has_exactly_one_joining_key = (span is not None) ^ (span_with_tag is not None)

if not has_exactly_one_joining_key:
log.warning("Exactly one of `span` or `span_with_tag` must be specified to submit an evaluation metric.")
return

join_on = {}
if span is not None:
if (
not isinstance(span, dict)
or "span_id" not in span
or "trace_id" not in span
or not isinstance(span.get("span_id"), str)
or not isinstance(span.get("trace_id"), str)
):
log.warning(
"`span` must be a dictionary containing both span_id and trace_id keys. "
"LLMObs.export_span() can be used to generate this dictionary from a given span."
)
return
join_on["span"] = span
elif span_with_tag is not None:
if (
not isinstance(span_with_tag, tuple)
or len(span_with_tag) != 2
or not all(isinstance(i, str) for i in span_with_tag)
):
log.warning("`span_with_tag` must be a tuple of shape (tag_key, tag_value)")
return
join_on["tag"] = {"tag_key": span_with_tag[0], "tag_value": span_with_tag[1]}

ml_app = ml_app if ml_app else config._llmobs_ml_app
if not ml_app:
log.warning(
"ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
"Ensure this configuration is set before running your application."
)
return

timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)

if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
log.warning("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")
return

if not label:
log.warning("label must be the specified name of the evaluation metric.")
return

if not metric_type or metric_type.lower() not in ("categorical", "numerical", "score"):
log.warning("metric_type must be one of 'categorical' or 'score'.")
return

metric_type = metric_type.lower()
if metric_type == "numerical":
log.warning(
"The evaluation metric type 'numerical' is unsupported. Use 'score' instead. "
"Converting `numerical` metric to `score` type."
)
metric_type = "score"

if metric_type == "categorical" and not isinstance(value, str):
log.warning("value must be a string for a categorical metric.")
return
if metric_type == "score" and not isinstance(value, (int, float)):
log.warning("value must be an integer or float for a score metric.")
return
if tags is not None and not isinstance(tags, dict):
log.warning("tags must be a dictionary of string key-value pairs.")
return

# initialize tags with default values that will be overridden by user-provided tags
evaluation_tags = {
"ddtrace.version": ddtrace.__version__,
"ml_app": ml_app,
}

if tags:
for k, v in tags.items():
try:
evaluation_tags[ensure_text(k)] = ensure_text(v)
except TypeError:
log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

evaluation_metric = {
"join_on": join_on,
"label": str(label),
"metric_type": metric_type.lower(),
"timestamp_ms": timestamp_ms,
"{}_value".format(metric_type): value,
"ml_app": ml_app,
"tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
}

if metadata:
if not isinstance(metadata, dict):
log.warning("metadata must be json serializable dictionary.")
lievan marked this conversation as resolved.
Show resolved Hide resolved
else:
metadata = safe_json(metadata)
if metadata and isinstance(metadata, str):
evaluation_metric["metadata"] = json.loads(metadata)

cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)

@classmethod
def submit_evaluation(
cls,
@@ -786,6 +936,13 @@ def submit_evaluation(
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
) -> None:
deprecate(
"Using `LLMObs.submit_evaluation` is deprecated",
message="Please use `LLMObs.submit_evaluation_for` instead.",
removal_version="3.0.0",
category=DDTraceDeprecationWarning,
)

"""
Submits a custom evaluation metric for a given span ID and trace ID.

@@ -877,8 +1034,7 @@ def submit_evaluation(
log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

evaluation_metric = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
"label": str(label),
"metric_type": metric_type.lower(),
"timestamp_ms": timestamp_ms,
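For reference, a minimal usage sketch of the new `LLMObs.submit_evaluation_for()` method added above. The label, metric values, and the "message_id" tag are placeholder values; `LLMObs.export_span()` is the helper the docstring references for building the span dictionary.

from ddtrace.llmobs import LLMObs

# Join the evaluation to a span by a tag key-value pair that uniquely identifies it.
# The "message_id" tag and its value are hypothetical tags set on the span at trace time.
LLMObs.submit_evaluation_for(
    span_with_tag=("message_id", "dummy-message-id"),
    label="toxicity",
    metric_type="categorical",
    value="low",
    tags={"evaluator": "heuristic"},
)

# Alternatively, join by explicit span/trace IDs exported from the current active span.
span_context = LLMObs.export_span()
LLMObs.submit_evaluation_for(
    span=span_context,
    label="sentiment",
    metric_type="score",
    value=0.9,
)
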
2 changes: 1 addition & 1 deletion ddtrace/llmobs/_writer.py
@@ -158,7 +158,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) ->
super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout)
self._event_type = "evaluation_metric"
self._buffer = []
self._endpoint = "/api/intake/llm-obs/v1/eval-metric"
self._endpoint = "/api/intake/llm-obs/v2/eval-metric"
self._intake = "api.%s" % self._site # type: str

def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None:
12 changes: 12 additions & 0 deletions releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml
@@ -0,0 +1,12 @@
---
features:
- |
    LLM Observability: This introduces the ability to submit custom evaluations joined to a span by a tag key-value pair using the
    `LLMObs.submit_evaluation_for()` method. The tag key-value pair is expected to uniquely identify a span.
Example usage:
- Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag=("message_id", "dummy-message-id"), label="rating", ...)`.
- Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`.
deprecations:
- |
    LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in 3.0.0.
    Use `LLMObs.submit_evaluation_for` instead.
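
A migration sketch for the deprecation above. The `span_context` parameter name of the deprecated method is assumed here, since that part of its signature is collapsed in this diff; the replacement call uses the new `span` argument.

# Deprecated, to be removed in 3.0.0 (parameter name assumed):
LLMObs.submit_evaluation(
    span_context={"span_id": "123", "trace_id": "456"},
    label="rating",
    metric_type="score",
    value=5,
)

# Replacement:
LLMObs.submit_evaluation_for(
    span={"span_id": "123", "trace_id": "456"},
    label="rating",
    metric_type="score",
    value=5,
)
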
13 changes: 9 additions & 4 deletions tests/llmobs/_utils.py
@@ -212,11 +212,13 @@ def _get_llmobs_parent_id(span: Span):


def _expected_llmobs_eval_metric_event(
span_id,
trace_id,
metric_type,
label,
ml_app,
tag_key=None,
tag_val=None,
span_id=None,
trace_id=None,
timestamp_ms=None,
categorical_value=None,
score_value=None,
@@ -225,15 +227,18 @@
metadata=None,
):
eval_metric_event = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {},
"metric_type": metric_type,
"label": label,
"tags": [
"ddtrace.version:{}".format(ddtrace.__version__),
"ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"),
],
}
if tag_key is not None and tag_val is not None:
eval_metric_event["join_on"]["tag"] = {"tag_key": tag_key, "tag_value": tag_val}
if span_id is not None and trace_id is not None:
eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id}
if categorical_value is not None:
eval_metric_event["categorical_value"] = categorical_value
if score_value is not None:
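A sketch of how the updated test helper might be called for a tag-joined evaluation; the keyword names come from the signature above, and the argument values are placeholders.

expected_event = _expected_llmobs_eval_metric_event(
    metric_type="categorical",
    label="toxicity",
    ml_app="dummy-ml-app",
    tag_key="message_id",
    tag_val="dummy-message-id",
    categorical_value="low",
)
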
18 changes: 13 additions & 5 deletions tests/llmobs/test_llmobs_eval_metric_writer.py
@@ -7,15 +7,19 @@
from ddtrace.llmobs._writer import LLMObsEvalMetricWriter


INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric"
INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric"
DD_SITE = "datad0g.com"
dd_api_key = os.getenv("DD_API_KEY", default="<not-a-real-api-key>")


def _categorical_metric_event():
return {
"span_id": "12345678901",
"trace_id": "98765432101",
"join_on": {
"span": {
"span_id": "12345678901",
"trace_id": "98765432101",
},
},
"metric_type": "categorical",
"categorical_value": "very",
"label": "toxicity",
@@ -26,8 +30,12 @@ def _categorical_metric_event():

def _score_metric_event():
return {
"span_id": "12345678902",
"trace_id": "98765432102",
"join_on": {
"span": {
"span_id": "12345678902",
"trace_id": "98765432102",
},
},
"metric_type": "score",
"label": "sentiment",
"score_value": 0.9,
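A sketch of how these fixtures might be exercised against the writer. The constructor arguments follow the `__init__` shown in the `_writer.py` diff; `start()` and `periodic()` are assumed to be available from the writer's periodic-service base class.

writer = LLMObsEvalMetricWriter(site=DD_SITE, api_key=dd_api_key, interval=1.0, timeout=1.0)
writer.start()
writer.enqueue(_categorical_metric_event())
writer.enqueue(_score_metric_event())
writer.periodic()  # flushes buffered events to the v2 eval-metric intake endpoint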