diff --git a/CHANGELOG.md b/CHANGELOG.md index 0260fcc18..c948610c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - Log viewer: metadata which contains images will now render the images. - Log viewer: show custom tool call views in messages display. - Bugfix: Correctly read and forward image detail property. +- Bugfix: Correct resolution of global eval override of task or sample sandboxes. ## v0.3.47 (18 November 2024) diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py index 414dd2677..b51898be9 100644 --- a/src/inspect_ai/_eval/eval.py +++ b/src/inspect_ai/_eval/eval.py @@ -383,6 +383,7 @@ async def eval_async( tasks=task_batch, parallel=parallel, eval_config=eval_config, + eval_sandbox=sandbox, recorder=recorder, model_args=model_args, epochs_reducer=epochs_reducer, @@ -407,6 +408,7 @@ async def eval_async( tasks=resolved_tasks, parallel=parallel, eval_config=eval_config, + eval_sandbox=sandbox, recorder=recorder, model_args=model_args, epochs_reducer=epochs_reducer, diff --git a/src/inspect_ai/_eval/run.py b/src/inspect_ai/_eval/run.py index 886e87c98..a6ce6c2d7 100644 --- a/src/inspect_ai/_eval/run.py +++ b/src/inspect_ai/_eval/run.py @@ -22,7 +22,13 @@ from inspect_ai.scorer._reducer import ScoreReducer, reducer_log_names from inspect_ai.scorer._reducer.registry import validate_reducer from inspect_ai.solver._solver import Solver, SolverSpec -from inspect_ai.util._sandbox.environment import TaskCleanup, TaskInit +from inspect_ai.util._sandbox.environment import ( + SandboxEnvironmentSpec, + SandboxEnvironmentType, + TaskCleanup, + TaskInit, + resolve_sandbox_environment, +) from inspect_ai.util._sandbox.registry import registry_find_sandboxenv from .loader import ( @@ -44,6 +50,7 @@ async def eval_run( tasks: list[ResolvedTask], parallel: int, eval_config: EvalConfig, + eval_sandbox: SandboxEnvironmentType | None, recorder: Recorder, model_args: dict[str, Any], epochs_reducer: list[ScoreReducer] | None = None, @@ -66,7 +73,7 @@ async def eval_run( if has_sandbox: cleanup = eval_config.sandbox_cleanup is not False shutdown_sandbox_environments = await startup_sandbox_environments( - tasks, cleanup + resolve_sandbox_environment(eval_sandbox), tasks, cleanup ) # resolve solver and solver spec @@ -319,14 +326,16 @@ async def worker() -> None: async def startup_sandbox_environments( - tasks: list[ResolvedTask], cleanup: bool + eval_sandbox: SandboxEnvironmentSpec | None, + tasks: list[ResolvedTask], + cleanup: bool, ) -> Callable[[], Awaitable[None]]: # find unique sandboxenvs sandboxenvs: Set[TaskSandboxEnvironment] = set() for task in tasks: # resolve each sample and add to sandboxenvs for sample in task.task.dataset: - sandbox = resolve_sandbox_for_task(task.task, sample) + sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample) if sandbox is not None and sandbox not in sandboxenvs: sandboxenvs.add(sandbox) diff --git a/src/inspect_ai/_eval/task/sandbox.py b/src/inspect_ai/_eval/task/sandbox.py index 1a9ee9c1f..ccf2a8d30 100644 --- a/src/inspect_ai/_eval/task/sandbox.py +++ b/src/inspect_ai/_eval/task/sandbox.py @@ -104,10 +104,12 @@ class TaskSandboxEnvironment(NamedTuple): def resolve_sandbox_for_task( + eval_sandbox: SandboxEnvironmentSpec | None, task: Task, sample: Sample, ) -> TaskSandboxEnvironment | None: - sandbox = resolve_sandbox(task.sandbox, sample) + # eval_sandbox overrides task or sample sandbox + sandbox = eval_sandbox or resolve_sandbox(task.sandbox, sample) if sandbox is not None: return TaskSandboxEnvironment(sandbox, task_run_dir(task)) else: