From c019f1b02050f4e194cebe056870543b6322e1c6 Mon Sep 17 00:00:00 2001
From: art-dsit <153507562+art-dsit@users.noreply.github.com>
Date: Fri, 30 Aug 2024 14:24:30 +0100
Subject: [PATCH] Add test and fix incorrect assumption that project has a
 single working dir. (#303)

* Add failing test and fix incorrect assumption that project
has a single working dir.

* ruff and mypy fixes

* correct logic in mockllm

* more mypy

* Moved tool_call_utils into test_helpers package; asserted on tool call read results

* remove working_dir from ComposeProject
---
 src/inspect_ai/model/_providers/mockllm.py    |  16 +-
 src/inspect_ai/util/_sandbox/docker/docker.py |  32 ++--
 src/inspect_ai/util/_sandbox/docker/util.py   |  11 +-
 src/inspect_ai/util/_sandbox/self_check.py    |   2 +
 tests/model/test_mock_model_llm.py            |  10 ++
 .../tool_call_utils.py                        |   0
 tests/tools/test_sandbox_tool_eval.py         |   2 +-
 tests/tools/test_tools.py                     |   6 +-
 .../docker-compose.yaml                       |  21 +++
 .../test_docker_compose_multiple_services.py  | 164 ++++++++++++++++++
 10 files changed, 231 insertions(+), 33 deletions(-)
 rename tests/{tools => test_helpers}/tool_call_utils.py (100%)
 create mode 100644 tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml
 create mode 100644 tests/util/sandbox/test_docker_compose_multiple_services.py

diff --git a/src/inspect_ai/model/_providers/mockllm.py b/src/inspect_ai/model/_providers/mockllm.py
index 8a34227b0..89cc4f42a 100644
--- a/src/inspect_ai/model/_providers/mockllm.py
+++ b/src/inspect_ai/model/_providers/mockllm.py
@@ -1,4 +1,4 @@
-from typing import Any, Iterable, Iterator
+from typing import Any, Generator, Iterable, Iterator
 
 from inspect_ai.tool import ToolChoice, ToolInfo
 
@@ -27,19 +27,21 @@ def __init__(
         base_url: str | None = None,
         api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
-        custom_outputs: Iterable[ModelOutput] = [],
+        custom_outputs: Iterable[ModelOutput]
+        | Generator[ModelOutput, None, None]
+        | None = None,
         **model_args: dict[str, Any],
     ) -> None:
         super().__init__(model_name, base_url, api_key, [], config)
         self.model_args = model_args
         if model_name != "model":
             raise ValueError(f"Invalid model name: {model_name}")
-        if custom_outputs:
+        if custom_outputs is not None:
             # We cannot rely on the user of this model giving custom_outputs the correct type since they do not call this constructor
             # Hence this type check and the one in generate.
-            if not isinstance(custom_outputs, Iterable):
+            if not isinstance(custom_outputs, Iterable | Generator):
                 raise ValueError(
-                    f"model_args['custom_outputs'] must be an Iterable, got {custom_outputs}"
+                    f"model_args['custom_outputs'] must be an Iterable or a Generator, got {custom_outputs}"
                 )
             self.outputs = iter(custom_outputs)
         else:
@@ -65,5 +67,7 @@ async def generate(
             raise ValueError("custom_outputs ran out of values")
 
         if not isinstance(output, ModelOutput):
-            raise ValueError("output must be an instance of ModelOutput")
+            raise ValueError(
+                f"output must be an instance of ModelOutput; got {type(output)}; content: {repr(output)}"
+            )
         return output
diff --git a/src/inspect_ai/util/_sandbox/docker/docker.py b/src/inspect_ai/util/_sandbox/docker/docker.py
index 3df5cd320..33f309851 100644
--- a/src/inspect_ai/util/_sandbox/docker/docker.py
+++ b/src/inspect_ai/util/_sandbox/docker/docker.py
@@ -117,10 +117,10 @@ async def sample_init(
         environments: dict[str, SandboxEnvironment] = {}
         for service, service_info in services.items():
             # update the project w/ the working directory
-            project.working_dir = await container_working_dir(service, project)
+            working_dir = await container_working_dir(service, project)
 
             # create the docker sandbox environemnt
-            docker_env = DockerSandboxEnvironment(service, project)
+            docker_env = DockerSandboxEnvironment(service, project, working_dir)
 
             # save reference to environment (mark as default if requested)
             is_default = service_info.get("x-default", False) is True
@@ -166,10 +166,11 @@ async def task_cleanup(
     async def cli_cleanup(cls, id: str | None) -> None:
         await cli_cleanup(id)
 
-    def __init__(self, service: str, project: ComposeProject) -> None:
+    def __init__(self, service: str, project: ComposeProject, working_dir: str) -> None:
         super().__init__()
         self._service = service
         self._project = project
+        self._working_dir = working_dir
 
     @override
     async def exec(
@@ -184,9 +185,9 @@ async def exec(
         # additional args
         args = []
 
-        final_cwd = Path(self._project.working_dir if cwd is None else cwd)
+        final_cwd = Path(self._working_dir if cwd is None else cwd)
         if not final_cwd.is_absolute():
-            final_cwd = self._project.working_dir / final_cwd
+            final_cwd = self._working_dir / final_cwd
 
         args.append("--workdir")
         args.append(str(final_cwd))
@@ -214,7 +215,7 @@ async def write_file(self, file: str, contents: str | bytes) -> None:
         sandbox_log(f"write_file: {file}")
 
         # resolve relative file paths
-        file = container_file(self._project, file)
+        file = self.container_file(file)
 
         # ensure that the directory exists
         parent = Path(file).parent.as_posix()
@@ -224,7 +225,7 @@ async def write_file(self, file: str, contents: str | bytes) -> None:
                 if "permission denied" in result.stderr.lower():
                     raise PermissionError(errno.EACCES, "Permission denied.", parent)
                 else:
-                    msg = f"Failed to create container directory {parent}: {result.stderr}"
+                    msg = f"Failed to create container directory {parent}: {result}"
                     raise RuntimeError(msg)
 
         # We want to be able to write a file in the container,
@@ -259,7 +260,7 @@ async def write_file(self, file: str, contents: str | bytes) -> None:
         # compose cp will leave the file owned by root
         await compose_cp(
             src=local_tmpfile.name,
-            dest=f"{self._service}:{container_file(self._project,container_tmpfile)}",
+            dest=f"{self._service}:{self.container_file(container_tmpfile)}",
             project=self._project,
         )
 
@@ -316,7 +317,7 @@ async def read_file(self, file: str, text: bool = True) -> Union[str, bytes]:
         with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
             # resolve relative file paths
             original_file = file
-            file = container_file(self._project, file)
+            file = self.container_file(file)
 
             # copy the file
             dest_file = os.path.join(temp_dir, os.path.basename(file))
@@ -353,6 +354,12 @@ async def read_file(self, file: str, text: bool = True) -> Union[str, bytes]:
                 async with aiofiles.open(dest_file, "rb") as f:
                     return await f.read()
 
+    def container_file(self, file: str) -> str:
+        path = Path(file)
+        if not path.is_absolute():
+            path = Path(self._working_dir) / path
+        return path.as_posix()
+
 
 async def container_working_dir(
     service: str, project: ComposeProject, default: str = "/"
@@ -366,10 +373,3 @@ async def container_working_dir(
             + f"{result.stderr}"
         )
         return default
-
-
-def container_file(project: ComposeProject, file: str) -> str:
-    path = Path(file)
-    if not path.is_absolute():
-        path = Path(project.working_dir) / path
-    return path.as_posix()
diff --git a/src/inspect_ai/util/_sandbox/docker/util.py b/src/inspect_ai/util/_sandbox/docker/util.py
index c98007e27..d03e7ac79 100644
--- a/src/inspect_ai/util/_sandbox/docker/util.py
+++ b/src/inspect_ai/util/_sandbox/docker/util.py
@@ -17,15 +17,10 @@ class ComposeProject:
     name: str
     config: str | None
     env: dict[str, str] | None
-    working_dir: str
 
     @classmethod
     async def create(
-        cls,
-        name: str,
-        config: str | None,
-        env: dict[str, str] = {},
-        working_dir: str = "/",
+        cls, name: str, config: str | None, env: dict[str, str] = {}
     ) -> "ComposeProject":
         # ensure we have an auto-compose file if we need one
         config = (
@@ -36,19 +31,17 @@ async def create(
         await ensure_auto_compose_file(config)
 
         # return project
-        return ComposeProject(name, config, env, working_dir)
+        return ComposeProject(name, config, env)
 
     def __init__(
         self,
         name: str,
         config: str | None,
         env: dict[str, str],
-        working_dir: str,
     ) -> None:
         self.name = name
         self.config = config
         self.env = env
-        self.working_dir = working_dir
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, ComposeProject):
diff --git a/src/inspect_ai/util/_sandbox/self_check.py b/src/inspect_ai/util/_sandbox/self_check.py
index 69ec54950..816e61506 100644
--- a/src/inspect_ai/util/_sandbox/self_check.py
+++ b/src/inspect_ai/util/_sandbox/self_check.py
@@ -205,6 +205,8 @@ async def test_cwd_absolute(sandbox_env: SandboxEnvironment) -> None:
     await _cleanup_file(sandbox_env, file_name)
 
 
+# TODO: write a test for when cwd doesn't exist
+
 # Generic type variable for exceptions
 E = TypeVar("E", bound=BaseException)
 
diff --git a/tests/model/test_mock_model_llm.py b/tests/model/test_mock_model_llm.py
index 4f4ed451c..7cad755f4 100644
--- a/tests/model/test_mock_model_llm.py
+++ b/tests/model/test_mock_model_llm.py
@@ -72,6 +72,16 @@ async def test_mock_generate_custom_invalid_iterable_string() -> None:
     assert "must be an instance of ModelOutput" in str(e_info.value)
 
 
+@pytest.mark.asyncio
+async def test_mock_generate_custom_invalid_iterable_number() -> None:
+    with pytest.raises(ValueError) as e_info:
+        get_model(
+            "mockllm/model",
+            custom_outputs=0,
+        )
+    assert "must be an Iterable or a Generator" in str(e_info.value)
+
+
 @pytest.mark.asyncio
 async def test_mock_generate_not_enough() -> None:
     model = get_model(
diff --git a/tests/tools/tool_call_utils.py b/tests/test_helpers/tool_call_utils.py
similarity index 100%
rename from tests/tools/tool_call_utils.py
rename to tests/test_helpers/tool_call_utils.py
diff --git a/tests/tools/test_sandbox_tool_eval.py b/tests/tools/test_sandbox_tool_eval.py
index e24a3f0f9..33aa12429 100644
--- a/tests/tools/test_sandbox_tool_eval.py
+++ b/tests/tools/test_sandbox_tool_eval.py
@@ -1,7 +1,7 @@
 import itertools
 
+from test_helpers.tool_call_utils import get_tool_calls, get_tool_response
 from test_helpers.tools import list_files, read_file
-from tool_call_utils import get_tool_calls, get_tool_response
 
 from inspect_ai import Task, eval
 from inspect_ai.dataset import Sample
diff --git a/tests/tools/test_tools.py b/tests/tools/test_tools.py
index 156faf3c0..6c5289840 100644
--- a/tests/tools/test_tools.py
+++ b/tests/tools/test_tools.py
@@ -1,6 +1,11 @@
 from random import randint
 from typing import Literal
 
+from test_helpers.tool_call_utils import (
+    get_tool_call,
+    get_tool_calls,
+    get_tool_response,
+)
 from test_helpers.tools import addition, raise_error, read_file
 from test_helpers.utils import (
     skip_if_no_anthropic,
@@ -10,7 +15,6 @@
     skip_if_no_openai,
     skip_if_no_vertex,
 )
-from tool_call_utils import get_tool_call, get_tool_calls, get_tool_response
 
 from inspect_ai import Task, eval
 from inspect_ai.dataset import Sample
diff --git a/tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml b/tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml
new file mode 100644
index 000000000..8382db057
--- /dev/null
+++ b/tests/util/sandbox/docker_compose_multiple_services/docker-compose.yaml
@@ -0,0 +1,21 @@
+services:
+  default:
+    image: "python:3.12-bookworm"
+    command: "tail -f /dev/null"
+    init: true
+    network_mode: none
+    stop_grace_period: 1s
+  service_1:
+    image: "python:3.12-bookworm"
+    command: "tail -f /dev/null"
+    working_dir: /usr/local/dir1
+    init: true
+    network_mode: none
+    stop_grace_period: 1s
+  service_2:
+    image: "python:3.12-bookworm"
+    command: "tail -f /dev/null"
+    working_dir: /usr/local/dir2
+    init: true
+    network_mode: none
+    stop_grace_period: 1s
\ No newline at end of file
diff --git a/tests/util/sandbox/test_docker_compose_multiple_services.py b/tests/util/sandbox/test_docker_compose_multiple_services.py
new file mode 100644
index 000000000..31ca946a8
--- /dev/null
+++ b/tests/util/sandbox/test_docker_compose_multiple_services.py
@@ -0,0 +1,164 @@
+from pathlib import Path
+
+from test_helpers.tool_call_utils import get_tool_calls, get_tool_response
+
+from inspect_ai import Task, eval
+from inspect_ai.dataset import Sample
+from inspect_ai.model import ModelOutput, get_model
+from inspect_ai.scorer import includes
+from inspect_ai.solver import generate, use_tools
+from inspect_ai.tool import Tool, tool
+from inspect_ai.util import sandbox
+
+CURRENT_DIRECTORY = Path(__file__).resolve().parent
+
+
+@tool
+def write_file_service_1() -> Tool:
+    async def execute(file: str, content: str):
+        """
+        Writes the contents of a file.
+
+        Args:
+          file (str): File to write.
+          content(str): Content to write to the file.
+
+
+        """
+        await sandbox("service_1").write_file(file, content)
+
+    return execute
+
+
+@tool
+def write_file_service_2() -> Tool:
+    async def execute(file: str, content: str):
+        """
+        Writes the contents of a file.
+
+        Args:
+          file (str): File to write.
+          content(str): Content to write to the file.
+
+
+        """
+        await sandbox("service_2").write_file(file, content)
+
+    return execute
+
+
+@tool
+def read_file_service_1() -> Tool:
+    async def execute(file: str) -> str:
+        """
+        Reads the contents of a file.
+
+        Args:
+          file (str): File to read.
+
+
+        Returns:
+            str: Contents of the file.
+
+        """
+        return await sandbox("service_1").read_file(file, text=True)
+
+    return execute
+
+
+@tool
+def read_file_service_2() -> Tool:
+    async def execute(file: str) -> str:
+        """
+        Reads the contents of a file.
+
+        Args:
+          file (str): File to read.
+
+
+        Returns:
+            str: Contents of the file.
+
+        """
+        return await sandbox("service_2").read_file(file, text=True)
+
+    return execute
+
+
+def test_docker_compose_multiple_services_write_file():
+    dataset = [
+        Sample(
+            input="Writes some files please",
+            target="unused",
+            files={"foo.txt": "contents_of_foo.txt"},
+        )
+    ]
+    task = Task(
+        dataset=dataset,
+        plan=[
+            use_tools(
+                [
+                    write_file_service_1(),
+                    write_file_service_2(),
+                    read_file_service_1(),
+                    read_file_service_2(),
+                ]
+            ),
+            generate(),
+        ],
+        scorer=includes(),
+        sandbox=(
+            "docker",
+            str(
+                CURRENT_DIRECTORY
+                / "docker_compose_multiple_services/docker-compose.yaml"
+            ),
+        ),
+    )
+
+    def tool_calls():
+        yield ModelOutput.for_tool_call(
+            model="mockllm/model",
+            tool_name="write_file_service_1",
+            tool_arguments={"file": "foo.txt", "content": "unused_1"},
+        )
+        yield ModelOutput.for_tool_call(
+            model="mockllm/model",
+            tool_name="write_file_service_2",
+            tool_arguments={"file": "foo.txt", "content": "unused_2"},
+        )
+        yield ModelOutput.for_tool_call(
+            model="mockllm/model",
+            tool_name="read_file_service_1",
+            tool_arguments={"file": "foo.txt"},
+        )
+        yield ModelOutput.for_tool_call(
+            model="mockllm/model",
+            tool_name="read_file_service_2",
+            tool_arguments={"file": "foo.txt"},
+        )
+        while True:
+            yield ModelOutput.from_content(
+                model="mockllm/model", content="nothing left"
+            )
+
+    result = eval(
+        task,
+        model=get_model("mockllm/model", custom_outputs=tool_calls()),
+        max_messages=10,  # otherwise we can get into an infinite loop if the tools error
+    )[0]
+
+    assert result.status == "success"
+    messages = result.samples[0].messages
+    assert (
+        get_tool_response(
+            messages, get_tool_calls(messages, "read_file_service_1")[0]
+        ).content
+        == "unused_1"
+    )
+    assert (
+        get_tool_response(
+            messages, get_tool_calls(messages, "read_file_service_2")[0]
+        ).content
+        == "unused_2"
+    )