Skip to content

Commit

Permalink
[OPIK-277] [SDK] Flaky unit-tests on Windows (#846)
Browse files Browse the repository at this point in the history
* [OPIK-277] [SDK] Flaky unit-tests on Windows

* more accurate comparison

* fix flaky metric evaluation tests
  • Loading branch information
japdubengsub authored Dec 11, 2024
1 parent 2d07e60 commit 0571441
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ async def ascore(
def _parse_model_output(self, content: str) -> score_result.ScoreResult:
try:
dict_content = json.loads(content)
score: float = dict_content["context_precision_score"]
score: float = float(dict_content["context_precision_score"])

if not (0.0 <= score <= 1.0):
score = 0.5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ async def ascore(
def _parse_model_output(self, content: str) -> score_result.ScoreResult:
try:
dict_content = json.loads(content)
score: float = dict_content["context_recall_score"]
score: float = float(dict_content["context_recall_score"])

if not (0.0 <= score <= 1.0):
score = 0.5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ async def ascore(
def _parse_model_output(self, content: str) -> score_result.ScoreResult:
try:
dict_content = json.loads(content)
score = dict_content["score"]
score = float(dict_content["score"])
return score_result.ScoreResult(
name=self.name,
value=score,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ async def ascore(
def _parse_model_output(self, content: str) -> score_result.ScoreResult:
try:
dict_content = json.loads(content)
score: float = dict_content["score"]
score: float = float(dict_content["score"])

if not (0.0 <= score <= 1.0):
score = 0.5
Expand Down
66 changes: 48 additions & 18 deletions sdks/python/tests/unit/decorator/test_tracker_outputs.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import mock
import threading
import asyncio
import threading
from typing import Dict

import mock
import pytest
from opik.decorator import tracker
from opik import context_storage, opik_context
from opik.api_objects import opik_client
from opik.api_objects import trace

from opik import context_storage, opik_context
from opik.api_objects import opik_client, trace
from opik.decorator import tracker
from ...testlib import (
SpanModel,
TraceModel,
FeedbackScoreModel,
ANY_BUT_NONE,
ANY_STRING,
FeedbackScoreModel,
SpanModel,
TraceModel,
assert_equal,
)

Expand Down Expand Up @@ -477,12 +478,23 @@ async def async_f(x):


def test_track__nested_calls_in_separate_threads__3_traces_in_result(fake_backend):
ID_STORAGE: Dict[str, str] = {}

@tracker.track(capture_output=True)
def f_inner(y, thread_id):
ID_STORAGE[f"f_inner-trace-id-{thread_id}"] = (
opik_context.get_current_trace_data().id
)
ID_STORAGE[f"f_inner-span-id-{thread_id}"] = (
opik_context.get_current_span_data().id
)
return f"inner-output-from-{thread_id}"

@tracker.track(capture_output=True)
def f_outer(x):
ID_STORAGE["f_outer-trace-id"] = opik_context.get_current_trace_data().id
ID_STORAGE["f_outer-span-id"] = opik_context.get_current_span_data().id

t1 = threading.Thread(target=f_inner, args=("inner-input-1", "thread-1"))
t2 = threading.Thread(target=f_inner, args=("inner-input-2", "thread-2"))
t1.start()
Expand All @@ -497,15 +509,15 @@ def f_outer(x):

EXPECTED_TRACE_TREES = [
TraceModel(
id=ANY_BUT_NONE,
id=ID_STORAGE["f_outer-trace-id"],
name="f_outer",
input={"x": "outer-input"},
output={"output": "outer-output"},
start_time=ANY_BUT_NONE,
end_time=ANY_BUT_NONE,
spans=[
SpanModel(
id=ANY_BUT_NONE,
id=ID_STORAGE["f_outer-span-id"],
name="f_outer",
input={"x": "outer-input"},
output={"output": "outer-output"},
Expand All @@ -516,15 +528,15 @@ def f_outer(x):
],
),
TraceModel(
id=ANY_BUT_NONE,
id=ID_STORAGE["f_inner-trace-id-thread-1"],
name="f_inner",
input={"y": "inner-input-1", "thread_id": "thread-1"},
output={"output": "inner-output-from-thread-1"},
start_time=ANY_BUT_NONE,
end_time=ANY_BUT_NONE,
spans=[
SpanModel(
id=ANY_BUT_NONE,
id=ID_STORAGE["f_inner-span-id-thread-1"],
name="f_inner",
input={"y": "inner-input-1", "thread_id": "thread-1"},
output={"output": "inner-output-from-thread-1"},
Expand All @@ -535,15 +547,15 @@ def f_outer(x):
],
),
TraceModel(
id=ANY_BUT_NONE,
id=ID_STORAGE["f_inner-trace-id-thread-2"],
name="f_inner",
input={"y": "inner-input-2", "thread_id": "thread-2"},
output={"output": "inner-output-from-thread-2"},
start_time=ANY_BUT_NONE,
end_time=ANY_BUT_NONE,
spans=[
SpanModel(
id=ANY_BUT_NONE,
id=ID_STORAGE["f_inner-span-id-thread-2"],
name="f_inner",
input={"y": "inner-input-2", "thread_id": "thread-2"},
output={"output": "inner-output-from-thread-2"},
Expand All @@ -557,9 +569,27 @@ def f_outer(x):

assert len(fake_backend.trace_trees) == 3

assert_equal(EXPECTED_TRACE_TREES[0], fake_backend.trace_trees[0])
assert_equal(EXPECTED_TRACE_TREES[1], fake_backend.trace_trees[1])
assert_equal(EXPECTED_TRACE_TREES[2], fake_backend.trace_trees[2])
trace_outer = EXPECTED_TRACE_TREES[0]
trace_inner_thread1 = EXPECTED_TRACE_TREES[1]
trace_inner_thread2 = EXPECTED_TRACE_TREES[2]

trace_backend_outer = [
trace for trace in fake_backend.trace_trees if trace.id == trace_outer.id
][0]
trace_backend_inner_thread1 = [
trace
for trace in fake_backend.trace_trees
if trace.id == trace_inner_thread1.id
][0]
trace_backend_inner_thread2 = [
trace
for trace in fake_backend.trace_trees
if trace.id == trace_inner_thread2.id
][0]

assert_equal(expected=trace_outer, actual=trace_backend_outer)
assert_equal(expected=trace_inner_thread1, actual=trace_backend_inner_thread1)
assert_equal(expected=trace_inner_thread2, actual=trace_backend_inner_thread2)


def test_track__single_generator_function_tracked__generator_exhausted__happyflow(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_flushing_thread__batcher_is_flushed__every_time_flush_interval_time_pas
batcher.add("some-value-to-make-batcher-not-empty")
flush_callback.assert_not_called()

time.sleep(FLUSH_INTERVAL + 0.01)
time.sleep(FLUSH_INTERVAL + 0.1)
# flush interval has passed after batcher was created, batcher is ready to be flushed
# (0.1 is added because thread probation interval is 0.1 and it's already made it first check)
flush_callback.assert_called_once()
Expand Down

0 comments on commit 0571441

Please sign in to comment.