Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(llmobs): support joining custom evaluations via tags #11535

Merged
merged 27 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 158 additions & 2 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any
from typing import Dict
from typing import Optional
from typing import Tuple
from typing import Union

import ddtrace
Expand All @@ -22,6 +23,7 @@
from ddtrace.internal.service import ServiceStatusError
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.internal.utils.formats import asbool
from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID
from ddtrace.llmobs._constants import INPUT_DOCUMENTS
Expand Down Expand Up @@ -58,6 +60,7 @@
from ddtrace.llmobs.utils import ExportedLLMObsSpan
from ddtrace.llmobs.utils import Messages
from ddtrace.propagation.http import HTTPPropagator
from ddtrace.vendor.debtcollector import deprecate


log = get_logger(__name__)
Expand Down Expand Up @@ -774,6 +777,153 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None:
return
span.set_tag_str(METRICS, safe_json(metrics))

@classmethod
def submit_evaluation_for(
    cls,
    label: str,
    metric_type: str,
    value: Union[str, int, float],
    span: Optional[dict] = None,
    span_with_tag: Optional[Tuple[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    ml_app: Optional[str] = None,
    timestamp_ms: Optional[int] = None,
    metadata: Optional[Dict[str, object]] = None,
) -> None:
    """
    Submits a custom evaluation metric for a given span.

    Exactly one of `span` or `span_with_tag` must be provided. Any validation failure is
    logged as a warning and the evaluation metric is dropped; no exception is raised for
    invalid arguments.

    :param str label: The name of the evaluation metric.
    :param str metric_type: The type of the evaluation metric. One of "categorical", "score".
                            The value is matched case-insensitively. "numerical" is accepted
                            but unsupported and is converted to "score" with a warning.
    :param value: The value of the evaluation metric.
                  Must be a string (categorical), integer (score), or float (score).
    :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
                      the span associated with this evaluation.
    :param tuple span_with_tag: A tuple of shape (tag_key, tag_value) uniquely identifying
                                the span associated with this evaluation.
    :param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
    :param str ml_app: The name of the ML application. Falls back to the configured
                       ML app name when not provided.
    :param int timestamp_ms: The timestamp in milliseconds when the evaluation metric result was generated.
                             Defaults to the current time when not provided.
    :param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
                          evaluation metric.
    """
    if cls.enabled is False:
        # The two fragments must be joined by implicit concatenation. Separating them with a
        # comma would pass the second fragment as a %-format argument to a message that has no
        # placeholder, which makes the logging module report a formatting error instead.
        log.warning(
            "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. "
            "Evaluation metric data will not be sent."
        )
        return
    if not config._dd_api_key:
        log.warning(
            "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. "
            "Ensure this configuration is set before running your application."
        )
        return

    # XOR: reject both "neither given" and "both given".
    has_exactly_one_joining_key = (span is not None) ^ (span_with_tag is not None)

    if not has_exactly_one_joining_key:
        log.warning("Exactly one of `span` or `span_with_tag` must be specified to submit an evaluation metric.")
        return

    join_on = {}
    if span is not None:
        # `dict.get` returns None for a missing key, so the isinstance checks also cover
        # the "key not present" case.
        if (
            not isinstance(span, dict)
            or not isinstance(span.get("span_id"), str)
            or not isinstance(span.get("trace_id"), str)
        ):
            log.warning(
                "`span` must be a dictionary containing both span_id and trace_id keys. "
                "LLMObs.export_span() can be used to generate this dictionary from a given span."
            )
            return
        join_on["span"] = span
    elif span_with_tag is not None:
        if (
            not isinstance(span_with_tag, tuple)
            or len(span_with_tag) != 2
            or not all(isinstance(i, str) for i in span_with_tag)
        ):
            log.warning("`span_with_tag` must be a tuple of shape (tag_key, tag_value)")
            return
        join_on["tag"] = {"tag_key": span_with_tag[0], "tag_value": span_with_tag[1]}

    ml_app = ml_app if ml_app else config._llmobs_ml_app
    if not ml_app:
        log.warning(
            "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
            "Ensure this configuration is set before running your application."
        )
        return

    # A falsy timestamp (None or 0) is treated as "not provided" and replaced with the current time.
    timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)

    if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
        log.warning("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")
        return

    if not label:
        log.warning("label must be the specified name of the evaluation metric.")
        return

    # The isinstance guard keeps a non-string metric_type (e.g. an int) on the warn-and-return
    # path instead of raising AttributeError on `.lower()`.
    if not isinstance(metric_type, str) or metric_type.lower() not in ("categorical", "numerical", "score"):
        log.warning("metric_type must be one of 'categorical' or 'score'.")
        return

    metric_type = metric_type.lower()
    if metric_type == "numerical":
        log.warning(
            "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. "
            "Converting `numerical` metric to `score` type."
        )
        metric_type = "score"

    if metric_type == "categorical" and not isinstance(value, str):
        log.warning("value must be a string for a categorical metric.")
        return
    if metric_type == "score" and not isinstance(value, (int, float)):
        log.warning("value must be an integer or float for a score metric.")
        return
    if tags is not None and not isinstance(tags, dict):
        log.warning("tags must be a dictionary of string key-value pairs.")
        return

    # initialize tags with default values that will be overridden by user-provided tags
    evaluation_tags = {
        "ddtrace.version": ddtrace.__version__,
        "ml_app": ml_app,
    }

    if tags:
        for k, v in tags.items():
            try:
                evaluation_tags[ensure_text(k)] = ensure_text(v)
            except TypeError:
                # Best effort: skip the offending tag but still submit the metric.
                log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

    evaluation_metric = {
        "join_on": join_on,
        "label": str(label),
        # metric_type is already lower-cased and normalized above.
        "metric_type": metric_type,
        "timestamp_ms": timestamp_ms,
        # The value key depends on the metric type: "categorical_value" or "score_value".
        "{}_value".format(metric_type): value,
        "ml_app": ml_app,
        "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
    }

    if metadata:
        if not isinstance(metadata, dict):
            log.warning("metadata must be json serializable dictionary.")
        else:
            # Round-trip through safe_json and json.loads; metadata is attached only when
            # safe_json yields a non-empty string.
            metadata = safe_json(metadata)
            if metadata and isinstance(metadata, str):
                evaluation_metric["metadata"] = json.loads(metadata)

    cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)

@classmethod
def submit_evaluation(
cls,
Expand All @@ -786,6 +936,13 @@ def submit_evaluation(
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
) -> None:
deprecate(
"Using `LLMObs.submit_evaluation` is deprecated",
message="Please use `LLMObs.submit_evaluation_for` instead.",
removal_version="3.0.0",
category=DDTraceDeprecationWarning,
)

"""
Submits a custom evaluation metric for a given span ID and trace ID.

Expand Down Expand Up @@ -877,8 +1034,7 @@ def submit_evaluation(
log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

evaluation_metric = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
"label": str(label),
"metric_type": metric_type.lower(),
"timestamp_ms": timestamp_ms,
Expand Down
5 changes: 2 additions & 3 deletions ddtrace/llmobs/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict):


class LLMObsEvaluationMetricEvent(TypedDict, total=False):
span_id: str
trace_id: str
join_on: Dict[str, Dict[str, str]]
metric_type: str
label: str
categorical_value: str
Expand Down Expand Up @@ -158,7 +157,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) ->
super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout)
self._event_type = "evaluation_metric"
self._buffer = []
self._endpoint = "/api/intake/llm-obs/v1/eval-metric"
self._endpoint = "/api/intake/llm-obs/v2/eval-metric"
self._intake = "api.%s" % self._site # type: str

def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None:
Expand Down
12 changes: 12 additions & 0 deletions releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
features:
- |
Format: LLM Observability: This introduces the ability to submit custom evaluations joined to a span by a tag key-value pair using the
lievan marked this conversation as resolved.
Show resolved Hide resolved
`LLMObs.submit_evaluation_for()` method. The tag key-value pair is expected to uniquely identify a span.
Example usage:
- Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag=("message_id", "dummy-message-id"), label="rating", ...)`.
- Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`.
deprecations:
- |
Format: LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in 3.0.0.
As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead.
16 changes: 10 additions & 6 deletions tests/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,13 @@ def _get_llmobs_parent_id(span: Span):


def _expected_llmobs_eval_metric_event(
span_id,
trace_id,
metric_type,
label,
ml_app,
tag_key=None,
tag_val=None,
span_id=None,
trace_id=None,
timestamp_ms=None,
categorical_value=None,
score_value=None,
Expand All @@ -225,15 +227,18 @@ def _expected_llmobs_eval_metric_event(
metadata=None,
):
eval_metric_event = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {},
"metric_type": metric_type,
"label": label,
"tags": [
"ddtrace.version:{}".format(ddtrace.__version__),
"ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"),
],
}
if tag_key is not None and tag_val is not None:
eval_metric_event["join_on"]["tag"] = {"tag_key": tag_key, "tag_value": tag_val}
if span_id is not None and trace_id is not None:
eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id}
if categorical_value is not None:
eval_metric_event["categorical_value"] = categorical_value
if score_value is not None:
Expand Down Expand Up @@ -505,8 +510,7 @@ def run_and_submit_evaluation(self, span):

def _dummy_evaluator_eval_metric_event(span_id, trace_id):
return LLMObsEvaluationMetricEvent(
span_id=span_id,
trace_id=trace_id,
join_on={"span": {"span_id": span_id, "trace_id": trace_id}},
score_value=1.0,
ml_app="unnamed-ml-app",
timestamp_ms=mock.ANY,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment",
"score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type":
"score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568298743}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
headers:
content-length:
- '316'
- '311'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
content-type:
- application/vnd.api+json
date:
- Wed, 21 Aug 2024 14:11:41 GMT
- Mon, 25 Nov 2024 20:58:19 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
"very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568297450}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
headers:
content-length:
- '330'
- '325'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
content-type:
- application/vnd.api+json
date:
- Wed, 21 Aug 2024 14:11:40 GMT
- Mon, 25 Nov 2024 20:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
"very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568297307}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"[email protected]"}'
Expand All @@ -21,7 +22,7 @@ interactions:
content-type:
- application/json
date:
- Wed, 21 Aug 2024 14:11:40 GMT
- Mon, 25 Nov 2024 20:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-content-type-options:
Expand Down
Loading
Loading