From c019f1b02050f4e194cebe056870543b6322e1c6 Mon Sep 17 00:00:00 2001 From: art-dsit <153507562+art-dsit@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:24:30 +0100 Subject: [PATCH] Add test and fix incorrect assumption that project has a single working dir. (#303) * Add failing test and fix incorrect assumption that project has a single working dir. * ruff and mypy fixes * correct logic in mockllm * more mypy * Moved tool_call_utils into test_helpers package; asserted on tool call read results * remove working_dir from ComposeProject --- src/inspect_ai/model/_providers/mockllm.py | 16 +- src/inspect_ai/util/_sandbox/docker/docker.py | 32 ++-- src/inspect_ai/util/_sandbox/docker/util.py | 11 +- src/inspect_ai/util/_sandbox/self_check.py | 2 + tests/model/test_mock_model_llm.py | 10 ++ .../tool_call_utils.py | 0 tests/tools/test_sandbox_tool_eval.py | 2 +- tests/tools/test_tools.py | 6 +- .../docker-compose.yaml | 21 +++ .../test_docker_compose_multiple_services.py | 164 ++++++++++++++++++ 10 files changed, 231 insertions(+), 33 deletions(-) rename tests/{tools => test_helpers}/tool_call_utils.py (100%) create mode 100644 tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml create mode 100644 tests/util/sandbox/test_docker_compose_multiple_services.py diff --git a/src/inspect_ai/model/_providers/mockllm.py b/src/inspect_ai/model/_providers/mockllm.py index 8a34227b0..89cc4f42a 100644 --- a/src/inspect_ai/model/_providers/mockllm.py +++ b/src/inspect_ai/model/_providers/mockllm.py @@ -1,4 +1,4 @@ -from typing import Any, Iterable, Iterator +from typing import Any, Generator, Iterable, Iterator from inspect_ai.tool import ToolChoice, ToolInfo @@ -27,19 +27,21 @@ def __init__( base_url: str | None = None, api_key: str | None = None, config: GenerateConfig = GenerateConfig(), - custom_outputs: Iterable[ModelOutput] = [], + custom_outputs: Iterable[ModelOutput] + | Generator[ModelOutput, None, None] + | None = None, **model_args: dict[str, Any], ) -> None: super().__init__(model_name, base_url, api_key, [], config) self.model_args = model_args if model_name != "model": raise ValueError(f"Invalid model name: {model_name}") - if custom_outputs: + if custom_outputs is not None: # We cannot rely on the user of this model giving custom_outputs the correct type since they do not call this constructor # Hence this type check and the one in generate. - if not isinstance(custom_outputs, Iterable): + if not isinstance(custom_outputs, Iterable | Generator): raise ValueError( - f"model_args['custom_outputs'] must be an Iterable, got {custom_outputs}" + f"model_args['custom_outputs'] must be an Iterable or a Generator, got {custom_outputs}" ) self.outputs = iter(custom_outputs) else: @@ -65,5 +67,7 @@ async def generate( raise ValueError("custom_outputs ran out of values") if not isinstance(output, ModelOutput): - raise ValueError("output must be an instance of ModelOutput") + raise ValueError( + f"output must be an instance of ModelOutput; got {type(output)}; content: {repr(output)}" + ) return output diff --git a/src/inspect_ai/util/_sandbox/docker/docker.py b/src/inspect_ai/util/_sandbox/docker/docker.py index 3df5cd320..33f309851 100644 --- a/src/inspect_ai/util/_sandbox/docker/docker.py +++ b/src/inspect_ai/util/_sandbox/docker/docker.py @@ -117,10 +117,10 @@ async def sample_init( environments: dict[str, SandboxEnvironment] = {} for service, service_info in services.items(): # update the project w/ the working directory - project.working_dir = await container_working_dir(service, project) + working_dir = await container_working_dir(service, project) # create the docker sandbox environemnt - docker_env = DockerSandboxEnvironment(service, project) + docker_env = DockerSandboxEnvironment(service, project, working_dir) # save reference to environment (mark as default if requested) is_default = service_info.get("x-default", False) is True @@ -166,10 +166,11 @@ async def task_cleanup( async def cli_cleanup(cls, id: str | None) -> None: await cli_cleanup(id) - def __init__(self, service: str, project: ComposeProject) -> None: + def __init__(self, service: str, project: ComposeProject, working_dir: str) -> None: super().__init__() self._service = service self._project = project + self._working_dir = working_dir @override async def exec( @@ -184,9 +185,9 @@ async def exec( # additional args args = [] - final_cwd = Path(self._project.working_dir if cwd is None else cwd) + final_cwd = Path(self._working_dir if cwd is None else cwd) if not final_cwd.is_absolute(): - final_cwd = self._project.working_dir / final_cwd + final_cwd = self._working_dir / final_cwd args.append("--workdir") args.append(str(final_cwd)) @@ -214,7 +215,7 @@ async def write_file(self, file: str, contents: str | bytes) -> None: sandbox_log(f"write_file: {file}") # resolve relative file paths - file = container_file(self._project, file) + file = self.container_file(file) # ensure that the directory exists parent = Path(file).parent.as_posix() @@ -224,7 +225,7 @@ async def write_file(self, file: str, contents: str | bytes) -> None: if "permission denied" in result.stderr.lower(): raise PermissionError(errno.EACCES, "Permission denied.", parent) else: - msg = f"Failed to create container directory {parent}: {result.stderr}" + msg = f"Failed to create container directory {parent}: {result}" raise RuntimeError(msg) # We want to be able to write a file in the container, @@ -259,7 +260,7 @@ async def write_file(self, file: str, contents: str | bytes) -> None: # compose cp will leave the file owned by root await compose_cp( src=local_tmpfile.name, - dest=f"{self._service}:{container_file(self._project,container_tmpfile)}", + dest=f"{self._service}:{self.container_file(container_tmpfile)}", project=self._project, ) @@ -316,7 +317,7 @@ async def read_file(self, file: str, text: bool = True) -> Union[str, bytes]: with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir: # resolve relative file paths original_file = file - file = container_file(self._project, file) + file = self.container_file(file) # copy the file dest_file = os.path.join(temp_dir, os.path.basename(file)) @@ -353,6 +354,12 @@ async def read_file(self, file: str, text: bool = True) -> Union[str, bytes]: async with aiofiles.open(dest_file, "rb") as f: return await f.read() + def container_file(self, file: str) -> str: + path = Path(file) + if not path.is_absolute(): + path = Path(self._working_dir) / path + return path.as_posix() + async def container_working_dir( service: str, project: ComposeProject, default: str = "/" @@ -366,10 +373,3 @@ async def container_working_dir( + f"{result.stderr}" ) return default - - -def container_file(project: ComposeProject, file: str) -> str: - path = Path(file) - if not path.is_absolute(): - path = Path(project.working_dir) / path - return path.as_posix() diff --git a/src/inspect_ai/util/_sandbox/docker/util.py b/src/inspect_ai/util/_sandbox/docker/util.py index c98007e27..d03e7ac79 100644 --- a/src/inspect_ai/util/_sandbox/docker/util.py +++ b/src/inspect_ai/util/_sandbox/docker/util.py @@ -17,15 +17,10 @@ class ComposeProject: name: str config: str | None env: dict[str, str] | None - working_dir: str @classmethod async def create( - cls, - name: str, - config: str | None, - env: dict[str, str] = {}, - working_dir: str = "/", + cls, name: str, config: str | None, env: dict[str, str] = {} ) -> "ComposeProject": # ensure we have an auto-compose file if we need one config = ( @@ -36,19 +31,17 @@ async def create( await ensure_auto_compose_file(config) # return project - return ComposeProject(name, config, env, working_dir) + return ComposeProject(name, config, env) def __init__( self, name: str, config: str | None, env: dict[str, str], - working_dir: str, ) -> None: self.name = name self.config = config self.env = env - self.working_dir = working_dir def __eq__(self, other: object) -> bool: if not isinstance(other, ComposeProject): diff --git a/src/inspect_ai/util/_sandbox/self_check.py b/src/inspect_ai/util/_sandbox/self_check.py index 69ec54950..816e61506 100644 --- a/src/inspect_ai/util/_sandbox/self_check.py +++ b/src/inspect_ai/util/_sandbox/self_check.py @@ -205,6 +205,8 @@ async def test_cwd_absolute(sandbox_env: SandboxEnvironment) -> None: await _cleanup_file(sandbox_env, file_name) +# TODO: write a test for when cwd doesn't exist + # Generic type variable for exceptions E = TypeVar("E", bound=BaseException) diff --git a/tests/model/test_mock_model_llm.py b/tests/model/test_mock_model_llm.py index 4f4ed451c..7cad755f4 100644 --- a/tests/model/test_mock_model_llm.py +++ b/tests/model/test_mock_model_llm.py @@ -72,6 +72,16 @@ async def test_mock_generate_custom_invalid_iterable_string() -> None: assert "must be an instance of ModelOutput" in str(e_info.value) +@pytest.mark.asyncio +async def test_mock_generate_custom_invalid_iterable_number() -> None: + with pytest.raises(ValueError) as e_info: + get_model( + "mockllm/model", + custom_outputs=0, + ) + assert "must be an Iterable or a Generator" in str(e_info.value) + + @pytest.mark.asyncio async def test_mock_generate_not_enough() -> None: model = get_model( diff --git a/tests/tools/tool_call_utils.py b/tests/test_helpers/tool_call_utils.py similarity index 100% rename from tests/tools/tool_call_utils.py rename to tests/test_helpers/tool_call_utils.py diff --git a/tests/tools/test_sandbox_tool_eval.py b/tests/tools/test_sandbox_tool_eval.py index e24a3f0f9..33aa12429 100644 --- a/tests/tools/test_sandbox_tool_eval.py +++ b/tests/tools/test_sandbox_tool_eval.py @@ -1,7 +1,7 @@ import itertools +from test_helpers.tool_call_utils import get_tool_calls, get_tool_response from test_helpers.tools import list_files, read_file -from tool_call_utils import get_tool_calls, get_tool_response from inspect_ai import Task, eval from inspect_ai.dataset import Sample diff --git a/tests/tools/test_tools.py b/tests/tools/test_tools.py index 156faf3c0..6c5289840 100644 --- a/tests/tools/test_tools.py +++ b/tests/tools/test_tools.py @@ -1,6 +1,11 @@ from random import randint from typing import Literal +from test_helpers.tool_call_utils import ( + get_tool_call, + get_tool_calls, + get_tool_response, +) from test_helpers.tools import addition, raise_error, read_file from test_helpers.utils import ( skip_if_no_anthropic, @@ -10,7 +15,6 @@ skip_if_no_openai, skip_if_no_vertex, ) -from tool_call_utils import get_tool_call, get_tool_calls, get_tool_response from inspect_ai import Task, eval from inspect_ai.dataset import Sample diff --git a/tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml b/tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml new file mode 100644 index 000000000..8382db057 --- /dev/null +++ b/tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml @@ -0,0 +1,21 @@ +services: + default: + image: "python:3.12-bookworm" + command: "tail -f /dev/null" + init: true + network_mode: none + stop_grace_period: 1s + service_1: + image: "python:3.12-bookworm" + command: "tail -f /dev/null" + working_dir: /usr/local/dir1 + init: true + network_mode: none + stop_grace_period: 1s + service_2: + image: "python:3.12-bookworm" + command: "tail -f /dev/null" + working_dir: /usr/local/dir2 + init: true + network_mode: none + stop_grace_period: 1s \ No newline at end of file diff --git a/tests/util/sandbox/test_docker_compose_multiple_services.py b/tests/util/sandbox/test_docker_compose_multiple_services.py new file mode 100644 index 000000000..31ca946a8 --- /dev/null +++ b/tests/util/sandbox/test_docker_compose_multiple_services.py @@ -0,0 +1,164 @@ +from pathlib import Path + +from test_helpers.tool_call_utils import get_tool_calls, get_tool_response + +from inspect_ai import Task, eval +from inspect_ai.dataset import Sample +from inspect_ai.model import ModelOutput, get_model +from inspect_ai.scorer import includes +from inspect_ai.solver import generate, use_tools +from inspect_ai.tool import Tool, tool +from inspect_ai.util import sandbox + +CURRENT_DIRECTORY = Path(__file__).resolve().parent + + +@tool +def write_file_service_1() -> Tool: + async def execute(file: str, content: str): + """ + Writes the contents of a file. + + Args: + file (str): File to write. + content(str): Content to write to the file. + + + """ + await sandbox("service_1").write_file(file, content) + + return execute + + +@tool +def write_file_service_2() -> Tool: + async def execute(file: str, content: str): + """ + Writes the contents of a file. + + Args: + file (str): File to write. + content(str): Content to write to the file. + + + """ + await sandbox("service_2").write_file(file, content) + + return execute + + +@tool +def read_file_service_1() -> Tool: + async def execute(file: str) -> str: + """ + Reads the contents of a file. + + Args: + file (str): File to read. + + + Returns: + str: Contents of the file. + + """ + return await sandbox("service_1").read_file(file, text=True) + + return execute + + +@tool +def read_file_service_2() -> Tool: + async def execute(file: str) -> str: + """ + Reads the contents of a file. + + Args: + file (str): File to read. + + + Returns: + str: Contents of the file. + + """ + return await sandbox("service_2").read_file(file, text=True) + + return execute + + +def test_docker_compose_multiple_services_write_file(): + dataset = [ + Sample( + input="Writes some files please", + target="unused", + files={"foo.txt": "contents_of_foo.txt"}, + ) + ] + task = Task( + dataset=dataset, + plan=[ + use_tools( + [ + write_file_service_1(), + write_file_service_2(), + read_file_service_1(), + read_file_service_2(), + ] + ), + generate(), + ], + scorer=includes(), + sandbox=( + "docker", + str( + CURRENT_DIRECTORY + / "docker_compose_multiple_services/docker-compose.yaml" + ), + ), + ) + + def tool_calls(): + yield ModelOutput.for_tool_call( + model="mockllm/model", + tool_name="write_file_service_1", + tool_arguments={"file": "foo.txt", "content": "unused_1"}, + ) + yield ModelOutput.for_tool_call( + model="mockllm/model", + tool_name="write_file_service_2", + tool_arguments={"file": "foo.txt", "content": "unused_2"}, + ) + yield ModelOutput.for_tool_call( + model="mockllm/model", + tool_name="read_file_service_1", + tool_arguments={"file": "foo.txt"}, + ) + yield ModelOutput.for_tool_call( + model="mockllm/model", + tool_name="read_file_service_2", + tool_arguments={"file": "foo.txt"}, + ) + while True: + yield ModelOutput.from_content( + model="mockllm/model", content="nothing left" + ) + + result = eval( + task, + model=get_model("mockllm/model", custom_outputs=tool_calls()), + max_messages=10, # otherwise we can get into an infinite loop if the tools error + )[0] + + assert result.status == "success" + messages = result.samples[0].messages + assert ( + get_tool_response( + messages, get_tool_calls(messages, "read_file_service_1")[0] + ).content + == "unused_1" + ) + assert ( + get_tool_response( + messages, get_tool_calls(messages, "read_file_service_2")[0] + ).content + == "unused_2" + )