diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
index bd94911fa6..e640ff6ed8 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py
@@ -139,7 +139,7 @@ async def ascore(
     def _parse_model_output(self, content: str) -> score_result.ScoreResult:
         try:
             dict_content = json.loads(content)
-            score: float = dict_content["context_precision_score"]
+            score: float = float(dict_content["context_precision_score"])
 
             if not (0.0 <= score <= 1.0):
                 score = 0.5
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
index 1d91124c2a..37f92d1c90 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py
@@ -138,7 +138,7 @@ async def ascore(
     def _parse_model_output(self, content: str) -> score_result.ScoreResult:
         try:
             dict_content = json.loads(content)
-            score: float = dict_content["context_recall_score"]
+            score: float = float(dict_content["context_recall_score"])
 
             if not (0.0 <= score <= 1.0):
                 score = 0.5
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
index 89459fa448..02f681e08e 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py
@@ -130,7 +130,7 @@ async def ascore(
     def _parse_model_output(self, content: str) -> score_result.ScoreResult:
         try:
             dict_content = json.loads(content)
-            score = dict_content["score"]
+            score = float(dict_content["score"])
             return score_result.ScoreResult(
                 name=self.name,
                 value=score,
diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
index 6bee822977..292ea7ff64 100644
--- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
+++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py
@@ -108,7 +108,7 @@ async def ascore(
     def _parse_model_output(self, content: str) -> score_result.ScoreResult:
         try:
             dict_content = json.loads(content)
-            score: float = dict_content["score"]
+            score: float = float(dict_content["score"])
 
             if not (0.0 <= score <= 1.0):
                 score = 0.5
diff --git a/sdks/python/tests/unit/decorator/test_tracker_outputs.py b/sdks/python/tests/unit/decorator/test_tracker_outputs.py
index 06dbde4d02..86452481c0 100644
--- a/sdks/python/tests/unit/decorator/test_tracker_outputs.py
+++ b/sdks/python/tests/unit/decorator/test_tracker_outputs.py
@@ -1,18 +1,19 @@
-import mock
-import threading
 import asyncio
+import threading
+from typing import Dict
+
+import mock
 import pytest
-from opik.decorator import tracker
-from opik import context_storage, opik_context
-from opik.api_objects import opik_client
-from opik.api_objects import trace
 
+from opik import context_storage, opik_context
+from opik.api_objects import opik_client, trace
+from opik.decorator import tracker
 from ...testlib import (
-    SpanModel,
-    TraceModel,
-    FeedbackScoreModel,
     ANY_BUT_NONE,
     ANY_STRING,
+    FeedbackScoreModel,
+    SpanModel,
+    TraceModel,
     assert_equal,
 )
 
@@ -477,12 +478,23 @@ async def async_f(x):
 
 
 def test_track__nested_calls_in_separate_threads__3_traces_in_result(fake_backend):
+    ID_STORAGE: Dict[str, str] = {}
+
     @tracker.track(capture_output=True)
     def f_inner(y, thread_id):
+        ID_STORAGE[f"f_inner-trace-id-{thread_id}"] = (
+            opik_context.get_current_trace_data().id
+        )
+        ID_STORAGE[f"f_inner-span-id-{thread_id}"] = (
+            opik_context.get_current_span_data().id
+        )
         return f"inner-output-from-{thread_id}"
 
     @tracker.track(capture_output=True)
     def f_outer(x):
+        ID_STORAGE["f_outer-trace-id"] = opik_context.get_current_trace_data().id
+        ID_STORAGE["f_outer-span-id"] = opik_context.get_current_span_data().id
+
         t1 = threading.Thread(target=f_inner, args=("inner-input-1", "thread-1"))
         t2 = threading.Thread(target=f_inner, args=("inner-input-2", "thread-2"))
         t1.start()
@@ -497,7 +509,7 @@ def f_outer(x):
 
     EXPECTED_TRACE_TREES = [
         TraceModel(
-            id=ANY_BUT_NONE,
+            id=ID_STORAGE["f_outer-trace-id"],
             name="f_outer",
             input={"x": "outer-input"},
             output={"output": "outer-output"},
@@ -505,7 +517,7 @@ def f_outer(x):
             end_time=ANY_BUT_NONE,
             spans=[
                 SpanModel(
-                    id=ANY_BUT_NONE,
+                    id=ID_STORAGE["f_outer-span-id"],
                     name="f_outer",
                     input={"x": "outer-input"},
                     output={"output": "outer-output"},
@@ -516,7 +528,7 @@ def f_outer(x):
             ],
         ),
         TraceModel(
-            id=ANY_BUT_NONE,
+            id=ID_STORAGE["f_inner-trace-id-thread-1"],
             name="f_inner",
             input={"y": "inner-input-1", "thread_id": "thread-1"},
             output={"output": "inner-output-from-thread-1"},
@@ -524,7 +536,7 @@ def f_outer(x):
             end_time=ANY_BUT_NONE,
             spans=[
                 SpanModel(
-                    id=ANY_BUT_NONE,
+                    id=ID_STORAGE["f_inner-span-id-thread-1"],
                     name="f_inner",
                     input={"y": "inner-input-1", "thread_id": "thread-1"},
                     output={"output": "inner-output-from-thread-1"},
@@ -535,7 +547,7 @@ def f_outer(x):
             ],
         ),
         TraceModel(
-            id=ANY_BUT_NONE,
+            id=ID_STORAGE["f_inner-trace-id-thread-2"],
             name="f_inner",
             input={"y": "inner-input-2", "thread_id": "thread-2"},
             output={"output": "inner-output-from-thread-2"},
@@ -543,7 +555,7 @@ def f_outer(x):
             end_time=ANY_BUT_NONE,
             spans=[
                 SpanModel(
-                    id=ANY_BUT_NONE,
+                    id=ID_STORAGE["f_inner-span-id-thread-2"],
                     name="f_inner",
                     input={"y": "inner-input-2", "thread_id": "thread-2"},
                     output={"output": "inner-output-from-thread-2"},
@@ -557,9 +569,27 @@ def f_outer(x):
 
     assert len(fake_backend.trace_trees) == 3
 
-    assert_equal(EXPECTED_TRACE_TREES[0], fake_backend.trace_trees[0])
-    assert_equal(EXPECTED_TRACE_TREES[1], fake_backend.trace_trees[1])
-    assert_equal(EXPECTED_TRACE_TREES[2], fake_backend.trace_trees[2])
+    trace_outer = EXPECTED_TRACE_TREES[0]
+    trace_inner_thread1 = EXPECTED_TRACE_TREES[1]
+    trace_inner_thread2 = EXPECTED_TRACE_TREES[2]
+
+    trace_backend_outer = [
+        trace for trace in fake_backend.trace_trees if trace.id == trace_outer.id
+    ][0]
+    trace_backend_inner_thread1 = [
+        trace
+        for trace in fake_backend.trace_trees
+        if trace.id == trace_inner_thread1.id
+    ][0]
+    trace_backend_inner_thread2 = [
+        trace
+        for trace in fake_backend.trace_trees
+        if trace.id == trace_inner_thread2.id
+    ][0]
+
+    assert_equal(expected=trace_outer, actual=trace_backend_outer)
+    assert_equal(expected=trace_inner_thread1, actual=trace_backend_inner_thread1)
+    assert_equal(expected=trace_inner_thread2, actual=trace_backend_inner_thread2)
 
 
 def test_track__single_generator_function_tracked__generator_exhausted__happyflow(
diff --git a/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py b/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py
index 2d37ca4a18..131fb49926 100644
--- a/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py
+++ b/sdks/python/tests/unit/message_processing/batching/test_flushing_thread.py
@@ -21,7 +21,7 @@ def test_flushing_thread__batcher_is_flushed__every_time_flush_interval_time_pas
     batcher.add("some-value-to-make-batcher-not-empty")
     flush_callback.assert_not_called()
 
-    time.sleep(FLUSH_INTERVAL + 0.01)
+    time.sleep(FLUSH_INTERVAL + 0.1)
     # flush interval has passed after batcher was created, batcher is ready to be flushed
     # (0.1 is added because thread probation interval is 0.1 and it's already made it first check)
     flush_callback.assert_called_once()