diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py index bd94911fa6..e640ff6ed8 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py @@ -139,7 +139,7 @@ async def ascore( def _parse_model_output(self, content: str) -> score_result.ScoreResult: try: dict_content = json.loads(content) - score: float = dict_content["context_precision_score"] + score: float = float(dict_content["context_precision_score"]) if not (0.0 <= score <= 1.0): score = 0.5 diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py index 1d91124c2a..37f92d1c90 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py @@ -138,7 +138,7 @@ async def ascore( def _parse_model_output(self, content: str) -> score_result.ScoreResult: try: dict_content = json.loads(content) - score: float = dict_content["context_recall_score"] + score: float = float(dict_content["context_recall_score"]) if not (0.0 <= score <= 1.0): score = 0.5 diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py index 89459fa448..02f681e08e 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py @@ -130,7 +130,7 @@ async def ascore( def _parse_model_output(self, content: str) -> score_result.ScoreResult: try: dict_content = json.loads(content) - score = dict_content["score"] + score = float(dict_content["score"]) return score_result.ScoreResult( name=self.name, value=score, diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py index 6bee822977..292ea7ff64 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py @@ -108,7 +108,7 @@ async def ascore( def _parse_model_output(self, content: str) -> score_result.ScoreResult: try: dict_content = json.loads(content) - score: float = dict_content["score"] + score: float = float(dict_content["score"]) if not (0.0 <= score <= 1.0): score = 0.5 diff --git a/sdks/python/tests/unit/decorator/test_tracker_outputs.py b/sdks/python/tests/unit/decorator/test_tracker_outputs.py index 06dbde4d02..86452481c0 100644 --- a/sdks/python/tests/unit/decorator/test_tracker_outputs.py +++ b/sdks/python/tests/unit/decorator/test_tracker_outputs.py @@ -1,18 +1,19 @@ -import mock -import threading import asyncio +import threading +from typing import Dict + +import mock import pytest -from opik.decorator import tracker -from opik import context_storage, opik_context -from opik.api_objects import opik_client -from opik.api_objects import trace +from opik import context_storage, opik_context +from opik.api_objects import opik_client, trace +from opik.decorator import tracker from ...testlib import ( - SpanModel, - TraceModel, - FeedbackScoreModel, ANY_BUT_NONE, ANY_STRING, + FeedbackScoreModel, + SpanModel, + TraceModel, assert_equal, ) @@ -477,12 +478,23 @@ async def async_f(x): def test_track__nested_calls_in_separate_threads__3_traces_in_result(fake_backend): + ID_STORAGE: Dict[str, str] = {} + @tracker.track(capture_output=True) def f_inner(y, thread_id): + ID_STORAGE[f"f_inner-trace-id-{thread_id}"] = ( + opik_context.get_current_trace_data().id + ) + ID_STORAGE[f"f_inner-span-id-{thread_id}"] = ( + opik_context.get_current_span_data().id + ) return f"inner-output-from-{thread_id}" @tracker.track(capture_output=True) def f_outer(x): + ID_STORAGE["f_outer-trace-id"] = opik_context.get_current_trace_data().id + ID_STORAGE["f_outer-span-id"] = opik_context.get_current_span_data().id + t1 = threading.Thread(target=f_inner, args=("inner-input-1", "thread-1")) t2 = threading.Thread(target=f_inner, args=("inner-input-2", "thread-2")) t1.start() @@ -497,7 +509,7 @@ def f_outer(x): EXPECTED_TRACE_TREES = [ TraceModel( - id=ANY_BUT_NONE, + id=ID_STORAGE["f_outer-trace-id"], name="f_outer", input={"x": "outer-input"}, output={"output": "outer-output"}, @@ -505,7 +517,7 @@ def f_outer(x): end_time=ANY_BUT_NONE, spans=[ SpanModel( - id=ANY_BUT_NONE, + id=ID_STORAGE["f_outer-span-id"], name="f_outer", input={"x": "outer-input"}, output={"output": "outer-output"}, @@ -516,7 +528,7 @@ def f_outer(x): ], ), TraceModel( - id=ANY_BUT_NONE, + id=ID_STORAGE["f_inner-trace-id-thread-1"], name="f_inner", input={"y": "inner-input-1", "thread_id": "thread-1"}, output={"output": "inner-output-from-thread-1"}, @@ -524,7 +536,7 @@ def f_outer(x): end_time=ANY_BUT_NONE, spans=[ SpanModel( - id=ANY_BUT_NONE, + id=ID_STORAGE["f_inner-span-id-thread-1"], name="f_inner", input={"y": "inner-input-1", "thread_id": "thread-1"}, output={"output": "inner-output-from-thread-1"}, @@ -535,7 +547,7 @@ def f_outer(x): ], ), TraceModel( - id=ANY_BUT_NONE, + id=ID_STORAGE["f_inner-trace-id-thread-2"], name="f_inner", input={"y": "inner-input-2", "thread_id": "thread-2"}, output={"output": "inner-output-from-thread-2"}, @@ -543,7 +555,7 @@ def f_outer(x): end_time=ANY_BUT_NONE, spans=[ SpanModel( - id=ANY_BUT_NONE, + id=ID_STORAGE["f_inner-span-id-thread-2"], name="f_inner", input={"y": "inner-input-2", "thread_id": "thread-2"}, output={"output": "inner-output-from-thread-2"}, @@ -557,9 +569,27 @@ def f_outer(x): assert len(fake_backend.trace_trees) == 3 - assert_equal(EXPECTED_TRACE_TREES[0], fake_backend.trace_trees[0]) - assert_equal(EXPECTED_TRACE_TREES[1], fake_backend.trace_trees[1]) - assert_equal(EXPECTED_TRACE_TREES[2], fake_backend.trace_trees[2]) + trace_outer = EXPECTED_TRACE_TREES[0] + trace_inner_thread1 = EXPECTED_TRACE_TREES[1] + trace_inner_thread2 = EXPECTED_TRACE_TREES[2] + + trace_backend_outer = [ + trace for trace in fake_backend.trace_trees if trace.id == trace_outer.id + ][0] + trace_backend_inner_thread1 = [ + trace + for trace in fake_backend.trace_trees + if trace.id == trace_inner_thread1.id + ][0] + trace_backend_inner_thread2 = [ + trace + for trace in fake_backend.trace_trees + if trace.id == trace_inner_thread2.id + ][0] + + assert_equal(expected=trace_outer, actual=trace_backend_outer) + assert_equal(expected=trace_inner_thread1, actual=trace_backend_inner_thread1) + assert_equal(expected=trace_inner_thread2, actual=trace_backend_inner_thread2) def test_track__single_generator_function_tracked__generator_exhausted__happyflow( diff --git a/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py b/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py index 2d37ca4a18..131fb49926 100644 --- a/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py +++ b/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py @@ -21,7 +21,7 @@ def test_flushing_thread__batcher_is_flushed__every_time_flush_interval_time_pas batcher.add("some-value-to-make-batcher-not-empty") flush_callback.assert_not_called() - time.sleep(FLUSH_INTERVAL + 0.01) + time.sleep(FLUSH_INTERVAL + 0.1) # flush interval has passed after batcher was created, batcher is ready to be flushed # (0.1 is added because thread probation interval is 0.1 and it's already made it first check) flush_callback.assert_called_once()