diff --git a/CHANGELOG.md b/CHANGELOG.md
index 65144684c..1401c2b3e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,11 +2,14 @@

 ## Unreleased

+- [Per-sample](https://inspect.ai-safety-institute.org.uk/agents.html#sec-per-sample-sandbox) sandbox environments can now be specified (e.g. allowing for a distinct Dockerfile or Docker compose file for each sample).
+- [input_screen()](https://inspect.ai-safety-institute.org.uk/interactivity.html) context manager to temporarily clear task display for user input.
 - Add optional user parameter to SandboxEnvironment.exec for specifying the user. Currently only DockerSandboxEnvironment is supported.
-- Sandbox environments can now be specified [per-sample](https://inspect.ai-safety-institute.org.uk/agents.html#sec-per-sample-sandbox) (e.g. allowing for a distinct Dockerfile or Docker compose file for each sample).
-- [input_screen()](https://inspect.ai-safety-institute.org.uk/interactivity.html) context manager to temporairly clear task display for user input.
+- Fix issue with resolving Docker configuration files when not running from the task directory.
+- Treat `cwd` values that are relative paths as relative to the sample working directory.
 - Raise error when a Solver does not return a TaskState.
-- Only run tests that use model APIs when the `--runapi` flag is passed to `pytest` (prevents unintented token usage)
+- Only run tests that use model APIs when the `--runapi` flag is passed to `pytest` (prevents unintended token usage)
+- Added [CommonsenseQA](https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/benchmarks/commonsense_qa) benchmark.

 ## v0.3.25 (25 August 2024)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index de27355b7..0a5ed3ae3 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -16,4 +16,5 @@ This directory contains evals for several benchmarks. Datasets for evals are not
 | HumanEval: Evaluating Large Language Models Trained on Code | | [humaneval.py](humaneval/humaneval.py) | Hugging Face |
 | DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs | | [drop.py](drop/drop.py) | Hugging Face |
 | WINOGRANDE: An Adversarial Winograd Schema Challenge at Scale | | [winogrande.py](winogrande/winogrande.py) | Hugging Face |
-| RACE-H: A benchmark for testing reading comprehension and reasoning abilities of neural models. | | [race-h.py](race-h/race-h.py) | Hugging Face |
\ No newline at end of file
+| RACE-H: A benchmark for testing reading comprehension and reasoning abilities of neural models. | | [race-h.py](race-h/race-h.py) | Hugging Face |
+| CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge | | [commonsense_qa.py](commonsense_qa/commonsense_qa.py) | Hugging Face |
\ No newline at end of file
diff --git a/benchmarks/commonsense_qa/README.md b/benchmarks/commonsense_qa/README.md
new file mode 100644
index 000000000..17eb50b22
--- /dev/null
+++ b/benchmarks/commonsense_qa/README.md
@@ -0,0 +1,19 @@
+# CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge
+
+[CommonsenseQA](https://arxiv.org/pdf/1811.00937) is a dataset designed to evaluate commonsense reasoning capabilities in natural language processing models. It consists of 12,247 multiple-choice questions that require background knowledge and commonsense to answer correctly. The dataset was constructed using CONCEPTNET, a graph-based knowledge base, where crowd-workers authored questions with complex semantics to challenge existing AI models.
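For reference, each record in the Hugging Face `tau/commonsense_qa` dataset pairs a question with five answer choices and an answer key. The sketch below is an editorial illustration (not part of this patch; the field values are made up) of the rough record shape that the `record_to_sample` mapping later in this diff relies on:

```python
# Approximate shape of a tau/commonsense_qa record (values are illustrative);
# record_to_sample() below reads "question", "choices"["text"], "answerKey",
# and "question_concept" from records of this form.
example_record = {
    "question": "Where can I stand on a river to see water falling without getting wet?",
    "question_concept": "river",
    "choices": {
        "label": ["A", "B", "C", "D", "E"],
        "text": ["waterfall", "bridge", "valley", "stream", "bottom"],
    },
    "answerKey": "B",
}
```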
+
+## Execution
+Here is an example from the dataset:
+```
+Question: Where can I stand on a river to see water falling without getting wet?
+Options:
+A) Waterfall
+B) Bridge
+C) Valley
+D) Stream
+E) Bottom
+```
+The model is required to choose the correct answer from the given options. In this case, the correct answer is B) Bridge.
+
+## Evaluation
+The model is prompted with the question and 5 options as input and required to choose one option by generating the corresponding answer choice A, B, C, D or E. The prompt template is based on the multiple choice template in OpenAI's [simple evals](https://github.com/openai/simple-evals/blob/main/mmlu_eval.py).
diff --git a/benchmarks/commonsense_qa/commonsense_qa.py b/benchmarks/commonsense_qa/commonsense_qa.py
new file mode 100644
index 000000000..f2441b6b4
--- /dev/null
+++ b/benchmarks/commonsense_qa/commonsense_qa.py
@@ -0,0 +1,49 @@
+"""
+CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge
+
+Alon Talmor, Jonathan Herzig, Nicholas Lourie, Jonathan Berant
+https://arxiv.org/pdf/1811.00937v2
+
+# eval w/ 500 randomly selected samples
+inspect eval commonsense_qa.py --limit 500
+"""
+
+from inspect_ai import Task, task
+from inspect_ai.dataset import Sample
+from inspect_ai.dataset._sources.hf import hf_dataset
+from inspect_ai.model._generate_config import GenerateConfig
+from inspect_ai.scorer import choice
+from inspect_ai.solver import multiple_choice
+
+
+@task
+def commonsense_qa():
+    dataset = hf_dataset(
+        path="tau/commonsense_qa",
+        split="validation",
+        sample_fields=record_to_sample,
+        trust=True,
+        shuffle=True,
+    )
+
+    return Task(
+        dataset=dataset,
+        plan=multiple_choice(),
+        scorer=choice(),
+        config=GenerateConfig(temperature=0),
+    )
+
+
+def record_to_sample(record):
+    return Sample(
+        input=record["question"],
+        choices=[
+            str(record["choices"]["text"][0]),
+            str(record["choices"]["text"][1]),
+            str(record["choices"]["text"][2]),
+            str(record["choices"]["text"][3]),
+            str(record["choices"]["text"][4]),
+        ],
+        target=record["answerKey"],
+        metadata={"question_concept": record["question_concept"]},
+    )
diff --git a/docs/extensions.qmd b/docs/extensions.qmd
index a301b9a73..2db841b2a 100644
--- a/docs/extensions.qmd
+++ b/docs/extensions.qmd
@@ -118,6 +118,10 @@ The static class methods control the lifecycle of containers and other computing
 @sandboxenv(name="podman")
 class PodmanSandboxEnvironment(SandboxEnvironment):
 
+    @classmethod
+    def config_files(cls) -> list[str]:
+        ...
+
     @classmethod
     async def task_init(
         cls, task_name: str, config: str | None
@@ -160,6 +164,7 @@ The class methods take care of various stages of initialisation, setup, and tear
 
 | Method | Lifecycle | Purpose |
 |-------------------|-------------------|----------------------------------|
+| `config_files()` | Called when a sandbox is specified without a config file. | Provide the names of 'default' config files for this provider (e.g. 'compose.yaml'). |
 | `task_init()` | Called once for each unique sandbox environment config before executing the tasks in an `eval()` run. | Expensive initialisation operations (e.g. pulling or building images) |
 | `sample_init()` | Called at the beginning of each `Sample`. | Create `SandboxEnvironment` instances for the sample. |
 | `sample_cleanup()` | Called at the end of each `Sample` | Cleanup `SandboxEnvironment` instances for the sample. |
diff --git a/pyproject.toml b/pyproject.toml
index 0843029c8..4c070189b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ norecursedirs = [
     "tests/test_package",
     "tests/test_task_list",
 ]
+asyncio_mode = "auto"
 log_level = "warning"
 
 [tool.mypy]
diff --git a/src/inspect_ai/_display/rich.py b/src/inspect_ai/_display/rich.py
index f30d35040..78af37b9c 100644
--- a/src/inspect_ai/_display/rich.py
+++ b/src/inspect_ai/_display/rich.py
@@ -475,9 +475,13 @@ def task_targets(profile: TaskProfile) -> str:
 def task_config(profile: TaskProfile, generate_config: bool = True) -> str:
     # merge config
     theme = rich_theme()
-    config = dict(profile.task_args) | dict(
-        profile.eval_config.model_dump(exclude_none=True)
-    )
+    # wind params back for display
+    task_args = dict(profile.task_args)
+    for key in task_args.keys():
+        value = task_args[key]
+        if isinstance(value, dict) and "plan" in value and "params" in value:
+            task_args[key] = value["plan"]
+    config = task_args | dict(profile.eval_config.model_dump(exclude_none=True))
     if generate_config:
         config = config | dict(profile.generate_config.model_dump(exclude_none=True))
     config_print: list[str] = []
diff --git a/src/inspect_ai/_eval/loader.py b/src/inspect_ai/_eval/loader.py
index 78aaf893d..ebdb01600 100644
--- a/src/inspect_ai/_eval/loader.py
+++ b/src/inspect_ai/_eval/loader.py
@@ -1,13 +1,14 @@
 import ast
 import inspect
+import os
 from dataclasses import dataclass, field
 from importlib.machinery import SourceFileLoader
 from importlib.util import module_from_spec, spec_from_loader
 from pathlib import Path
 from types import ModuleType
-from typing import Any, cast
+from typing import Any, Callable, cast
 
-from inspect_ai._eval.task.util import task_file
+from inspect_ai._eval.task.util import task_file, task_src_dir
 from inspect_ai._util.dotenv import dotenv_environ
 from inspect_ai._util.path import chdir_python
 from inspect_ai._util.registry import (
@@ -19,11 +20,12 @@
 )
 from inspect_ai.model import Model, ModelName
 from inspect_ai.util import SandboxEnvironmentSpec
+from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
 
 from .list import task_files
 from .registry import task_create
 from .task import PreviousTask, Task, TaskInfo, Tasks
-from .task.constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
+from .task.constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR, TASK_SRC_DIR_ATTR
 from .task.run import EvalSampleSource, eval_log_sample_source
@@ -61,13 +63,7 @@ def as_resolved_tasks(tasks: list[Task]) -> list[ResolvedTask]:
                 task_args=resolve_task_args(task),
                 task_file=task_file(task, relative=True),
                 model=model,
-                sandbox=(
-                    (sandbox, None)
-                    if isinstance(sandbox, str)
-                    else sandbox
-                    if sandbox is not None
-                    else task.sandbox
-                ),
+                sandbox=resolve_task_sandbox(task, sandbox),
                 sequence=sequence,
             )
             for sequence, task in enumerate(tasks)
@@ -144,6 +140,49 @@
         return {}
 
 
+def resolve_task_sandbox(
+    task: Task, sandbox: SandboxEnvironmentSpec | None
+) -> tuple[str, str | None] | None:
+    # do the resolution
+    resolved_sandbox = (
+        (sandbox, None)
+        if isinstance(sandbox, str)
+        else sandbox
+        if sandbox is not None
+        else task.sandbox
+    )
+
+    # if we have a sandbox with no config, see if there are implicit
+    # config files available for the provider
+    if resolved_sandbox is not None:
+        # look for default
+        if resolved_sandbox[1] is None:
+            # get config files for this type
+            sandboxenv_type = registry_find_sandboxenv(resolved_sandbox[0])
+            config_files_fn = cast(
Callable[..., list[str]], getattr(sandboxenv_type, "config_files") + ) + config_files = config_files_fn() + + # probe for them in task src dir + src_dir = task_src_dir(task) + for config_file in config_files: + config_file_path = os.path.join(src_dir, config_file) + if os.path.isfile(config_file_path): + resolved_sandbox = (resolved_sandbox[0], config_file) + break + + # resolve relative paths + if resolved_sandbox[1] is not None: + file_path = Path(resolved_sandbox[1]) + if not file_path.is_absolute(): + file_path = Path(task_src_dir(task)) / file_path + resolved_sandbox = (resolved_sandbox[0], file_path.as_posix()) + + # return resolved sandbox + return resolved_sandbox + + def load_tasks( task_specs: list[str] | None, model: Model, task_args: dict[str, Any] = {} ) -> list[Task]: @@ -231,6 +270,7 @@ def create_file_tasks( # (will be used later to ensure it runs in the directory) task = task_create(task_spec, model, **task_args) setattr(task, TASK_FILE_ATTR, file.as_posix()) + setattr(task, TASK_SRC_DIR_ATTR, file.parent.as_posix()) if task.attribs.get("chdir", True): setattr(task, TASK_RUN_DIR_ATTR, file.parent.as_posix()) tasks.append(task) diff --git a/src/inspect_ai/_eval/task/constants.py b/src/inspect_ai/_eval/task/constants.py index e1e7ca594..11a078439 100644 --- a/src/inspect_ai/_eval/task/constants.py +++ b/src/inspect_ai/_eval/task/constants.py @@ -1,2 +1,3 @@ TASK_FILE_ATTR = "__task_file__" TASK_RUN_DIR_ATTR = "__task_run_dir__" +TASK_SRC_DIR_ATTR = "__task_src_dir__" diff --git a/src/inspect_ai/_eval/task/util.py b/src/inspect_ai/_eval/task/util.py index 5d76883af..3f995bd1e 100644 --- a/src/inspect_ai/_eval/task/util.py +++ b/src/inspect_ai/_eval/task/util.py @@ -7,7 +7,7 @@ from inspect_ai.model import ChatMessage, ChatMessageUser from ..task import Task -from .constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR +from .constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR, TASK_SRC_DIR_ATTR def sample_messages(sample: Sample) -> list[ChatMessage]: @@ -24,6 +24,10 @@ def task_run_dir(task: Task) -> str: return getattr(task, TASK_RUN_DIR_ATTR, os.getcwd()) +def task_src_dir(task: Task) -> str: + return getattr(task, TASK_SRC_DIR_ATTR, os.getcwd()) + + def task_file(task: Task, relative: bool = False) -> str | None: file = cast(str | None, getattr(task, TASK_FILE_ATTR, None)) if file: diff --git a/src/inspect_ai/_util/registry.py b/src/inspect_ai/_util/registry.py index 5baf17e51..2e403a22f 100644 --- a/src/inspect_ai/_util/registry.py +++ b/src/inspect_ai/_util/registry.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Literal, cast from pydantic import BaseModel, Field +from pydantic_core import to_jsonable_python from .constants import PKG_NAME @@ -74,14 +75,26 @@ def registry_tag( named_params[params[i]] = arg named_params |= kwargs + # plan objects are serialised with name and params + for param in named_params.keys(): + value = named_params[param] + if is_registry_object(value) and registry_info(value).type == "plan": + named_params[param] = dict( + plan=registry_log_name(value), params=registry_params(value) + ) + # callables are not serializable so use their names for param in named_params.keys(): if is_registry_object(named_params[param]): named_params[param] = registry_info(named_params[param]).name - elif hasattr(named_params[param], "__name__"): + elif callable(named_params[param]): named_params[param] = getattr(named_params[param], "__name__") + elif isinstance(named_params[param], dict | list): + named_params[param] = to_jsonable_python( + named_params[param], 
fallback=lambda x: getattr(x, "__name__", None) + ) else: - named_params[param] = str(named_params[param]) + named_params[param] = named_params[param] # set attribute setattr(o, REGISTRY_INFO, info) @@ -157,6 +170,15 @@ def registry_create(type: RegistryType, name: str, **kwargs: Any) -> object: def with_registry_info(o: object) -> object: return set_registry_info(o, registry_info(obj)) + # instantiate plan objects for tasks + if type == "task": + for param in kwargs.keys(): + value = kwargs[param] + if isinstance(value, dict) and "plan" in value and "params" in value: + kwargs[param] = registry_create( + "plan", value["plan"], **value["params"] + ) + if isclass(obj): return with_registry_info(obj(**kwargs)) elif callable(obj): diff --git a/src/inspect_ai/util/_sandbox/docker/config.py b/src/inspect_ai/util/_sandbox/docker/config.py index d01f55142..8cf52b22f 100644 --- a/src/inspect_ai/util/_sandbox/docker/config.py +++ b/src/inspect_ai/util/_sandbox/docker/config.py @@ -7,10 +7,19 @@ logger = getLogger(__name__) -async def auto_compose(parent: str = "") -> str | None: - # compose file provides all the config we need - if has_compose_file(parent): - return None +CONFIG_FILES = [ + "compose.yaml", + "compose.yml", + "docker-compose.yaml", + "docker-compose.yml", +] + + +async def resolve_compose_file(parent: str = "") -> str | None: + # existing compose file provides all the config we need + compose = find_compose_file(parent) + if compose is not None: + return Path(os.path.join(parent, compose)).resolve().as_posix() # temporary auto-compose if has_auto_compose_file(parent): @@ -25,17 +34,11 @@ async def auto_compose(parent: str = "") -> str | None: return await auto_compose_file(COMPOSE_GENERIC_YAML, parent) -def has_compose_file(parent: str = "") -> bool: - compose_files = [ - "compose.yaml", - "compose.yml", - "docker-compose.yaml", - "docker-compose.yml", - ] - for file in compose_files: +def find_compose_file(parent: str = "") -> str | None: + for file in CONFIG_FILES: if os.path.isfile(os.path.join(parent, file)): - return True - return False + return file + return None def has_dockerfile(parent: str = "") -> bool: @@ -52,7 +55,7 @@ def is_auto_compose_file(file: str) -> bool: async def ensure_auto_compose_file(file: str | None) -> None: if file is not None and is_auto_compose_file(file) and not os.path.exists(file): - await auto_compose(os.path.dirname(file)) + await resolve_compose_file(os.path.dirname(file)) def safe_cleanup_auto_compose(file: str | None) -> None: diff --git a/src/inspect_ai/util/_sandbox/docker/docker.py b/src/inspect_ai/util/_sandbox/docker/docker.py index 567e7ee4d..3df5cd320 100644 --- a/src/inspect_ai/util/_sandbox/docker/docker.py +++ b/src/inspect_ai/util/_sandbox/docker/docker.py @@ -23,12 +23,14 @@ compose_build, compose_check_running, compose_cleanup_images, + compose_command, compose_cp, compose_exec, compose_pull, compose_services, compose_up, ) +from .config import CONFIG_FILES from .prereqs import validate_prereqs from .util import ComposeProject, sandbox_log, task_project_name @@ -37,6 +39,10 @@ @sandboxenv(name="docker") class DockerSandboxEnvironment(SandboxEnvironment): + @classmethod + def config_files(cls) -> list[str]: + return CONFIG_FILES + @classmethod async def task_init(cls, task_name: str, config: str | None) -> None: # validate prereqs @@ -178,10 +184,12 @@ async def exec( # additional args args = [] - # specify working if requested - if cwd: - args.append("--workdir") - args.append(cwd) + final_cwd = Path(self._project.working_dir if 
cwd is None else cwd)
+        if not final_cwd.is_absolute():
+            final_cwd = self._project.working_dir / final_cwd
+
+        args.append("--workdir")
+        args.append(str(final_cwd))
 
         if user:
             args.append("--user")
@@ -206,7 +214,6 @@ async def write_file(self, file: str, contents: str | bytes) -> None:
         sandbox_log(f"write_file: {file}")
 
         # resolve relative file paths
-        original_file = file
         file = container_file(self._project, file)
 
         # ensure that the directory exists
@@ -220,32 +227,79 @@
             msg = f"Failed to create container directory {parent}: {result.stderr}"
             raise RuntimeError(msg)
 
-        # use docker cp for binary files, tee for text files (which will
-        # have higher privs b/c the command runs in the container)
+        # We want to be able to write a file in the container,
+        # but only if the container's user would be allowed to do that.
+        # We need to avoid implicitly trusting the provided "file" string.
+        # For example, it shouldn't be passed as part of a shell command,
+        # because of the risk of shell injection.
+
+        local_tmpfile = tempfile.NamedTemporaryFile()
+
+        # write contents into a local tmp file (not in the container)
         if isinstance(contents, str):
-            # write the file
-            result = await self.exec(["tee", "--", file], input=contents)
-            if not result.success:
-                # PermissionError
-                if "permission denied" in result.stderr.lower():
-                    raise PermissionError(
-                        errno.EACCES, "Permission denied.", original_file
-                    )
-                else:
-                    msg = (
-                        f"Failed to write file '{file}' into container: {result.stderr}"
-                    )
-                    raise RuntimeError(msg)
+            local_tmpfile.write(contents.encode("utf-8"))
         else:
-            with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
-                src_file = os.path.join(temp_dir, os.path.basename(file))
-                async with aiofiles.open(src_file, "wb") as f:
-                    await f.write(contents)
-                await compose_cp(
-                    src=os.path.basename(src_file),
-                    dest=f"{self._service}:{file}",
-                    project=self._project,
-                    cwd=os.path.dirname(src_file),
+            local_tmpfile.write(contents)
+
+        local_tmpfile.flush()
+
+        # Copy the local tmp file into a tmp file on the container.
+        # Both tmp files have safe names as we created them ourselves.
+
+        # Use a custom mktemp target in the default cwd, because there
+        # was much strangeness using mktemp in /tmp within GitHub CI:
+        # the temp files were created with the wrong ownership.
+        mktemp_result = await self.exec(["mktemp", ".tmp_inspect_sandbox_XXXXXX"])
+        if not mktemp_result.success:
+            raise RuntimeError(
+                f"failed to create temporary file in container: {mktemp_result}"
+            )
+        container_tmpfile = mktemp_result.stdout.strip()
+
+        # compose cp will leave the file owned by root
+        await compose_cp(
+            src=local_tmpfile.name,
+            dest=f"{self._service}:{container_file(self._project,container_tmpfile)}",
+            project=self._project,
+        )
+
+        local_tmpfile.close()  # this will also delete the file
+
+        if not hasattr(self, "_docker_user"):
+            uid = (await self.exec(["id", "--user"])).stdout.strip()
+            gid = (await self.exec(["id", "--group"])).stdout.strip()
+            self._docker_user = (uid, gid)
+
+        await compose_command(
+            [
+                "exec",
+                "--user",
+                "root",
+                self._service,
+                "chown",
+                f"{self._docker_user[0]}:{self._docker_user[1]}",
+                container_tmpfile,
+            ],
+            project=self._project,
+        )
+
+        res_cp = await self.exec(
+            ["cp", "--no-target-directory", "--", container_tmpfile, file]
+        )
+
+        await self.exec(["rm", container_tmpfile])
+
+        if res_cp.returncode != 0:
+            if "Permission denied" in res_cp.stderr:
+                error_string = f"Permission was denied. 
Failed to copy temporary file. Error details: {res_cp.stderr};" + raise PermissionError(error_string) + elif "cannot overwrite directory" in res_cp.stderr: + raise IsADirectoryError( + f"Failed to write file: {file} because it is a directory already" + ) + else: + raise RuntimeError( + f"failed to copy temporary file during write_file: {res_cp}" ) @overload diff --git a/src/inspect_ai/util/_sandbox/docker/util.py b/src/inspect_ai/util/_sandbox/docker/util.py index d42e0e964..c98007e27 100644 --- a/src/inspect_ai/util/_sandbox/docker/util.py +++ b/src/inspect_ai/util/_sandbox/docker/util.py @@ -7,7 +7,7 @@ from inspect_ai._util.constants import SANDBOX -from .config import auto_compose, ensure_auto_compose_file +from .config import ensure_auto_compose_file, resolve_compose_file logger = getLogger(__name__) @@ -28,7 +28,11 @@ async def create( working_dir: str = "/", ) -> "ComposeProject": # ensure we have an auto-compose file if we need one - config = Path(config).resolve().as_posix() if config else await auto_compose() + config = ( + Path(config).resolve().as_posix() + if config + else await resolve_compose_file() + ) await ensure_auto_compose_file(config) # return project diff --git a/src/inspect_ai/util/_sandbox/environment.py b/src/inspect_ai/util/_sandbox/environment.py index e031cd951..6ba9c59f9 100644 --- a/src/inspect_ai/util/_sandbox/environment.py +++ b/src/inspect_ai/util/_sandbox/environment.py @@ -22,6 +22,11 @@ class SandboxEnvironment(abc.ABC): filesystem context to copy samples files into and resolve relative paths to. """ + @classmethod + def config_files(cls) -> list[str]: + """Standard config files for this provider (used for automatic discovery)""" + return [] + @classmethod async def task_init(cls, task_name: str, config: str | None) -> None: """Called at task startup initialize resources. @@ -108,7 +113,7 @@ async def exec( Args: cmd (str | list[str]): Command or command and arguments to execute. input (str | bytes | None): Standard input (optional). - cwd (str | None): Current working dir (optional). + cwd (str | None): Current working dir (optional). If relative, will be relative to the per-sample filesystem context. env (dict[str,str]): Environment variables for execution. user (str | None): Optional username or UID to run the command as. timeout (int | None): Optional execution timeout (seconds). 
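As a usage illustration of the `cwd` semantics documented in the `exec()` docstring above (an editorial sketch, not part of this patch; it assumes a tool or solver running inside an eval that has a sandbox configured), relative and absolute working directories behave differently:

```python
from inspect_ai.util import sandbox


async def list_directories() -> str:
    # relative cwd: resolved against the per-sample working directory
    # (i.e. the sandbox's default working dir for this sample)
    relative = await sandbox().exec(["ls", "-1"], cwd="subdir")

    # absolute cwd: used exactly as given
    absolute = await sandbox().exec(["ls", "-1"], cwd="/etc")

    return relative.stdout + absolute.stdout
```

The same resolution rule is applied by both the Docker sandbox above and the local sandbox changed below.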
diff --git a/src/inspect_ai/util/_sandbox/local.py b/src/inspect_ai/util/_sandbox/local.py index 94a60d108..b386170cf 100644 --- a/src/inspect_ai/util/_sandbox/local.py +++ b/src/inspect_ai/util/_sandbox/local.py @@ -52,10 +52,14 @@ async def exec( UserWarning, ) + final_cwd = Path(self.directory.name if cwd is None else cwd) + if not final_cwd.is_absolute(): + final_cwd = self.directory.name / final_cwd + return await subprocess( args=cmd, input=input, - cwd=cwd if cwd else self.directory.name, + cwd=final_cwd, env=env, timeout=timeout, ) diff --git a/src/inspect_ai/util/_sandbox/self_check.py b/src/inspect_ai/util/_sandbox/self_check.py new file mode 100644 index 000000000..ce25ae5a8 --- /dev/null +++ b/src/inspect_ai/util/_sandbox/self_check.py @@ -0,0 +1,239 @@ +from typing import Any, Callable, Coroutine, Generic, Optional, Type, TypeVar + +from inspect_ai.util import SandboxEnvironment + + +async def check_test_fn( + fn: Callable[[SandboxEnvironment], Coroutine[Any, Any, None]], + sandbox_env: SandboxEnvironment, +) -> bool | str: + try: + await fn(sandbox_env) + return True + except AssertionError as e: + return f"FAILED: {str(e)}" + except Exception as e: + return f"ERROR: {str(e)}" + + +async def self_check(sandbox_env: SandboxEnvironment) -> dict[str, bool | str]: + results = {} + + for fn in [ + test_read_and_write_file_text, + test_read_and_write_file_binary, + test_read_and_write_file_including_directory_absolute, + test_read_and_write_file_including_directory_relative, + test_read_file_zero_length, + test_read_file_not_found, + test_read_file_not_allowed, + test_read_file_is_directory, + test_read_file_nonsense_name, + test_write_file_zero_length, + test_write_file_is_directory, + test_write_file_without_permissions, + test_exec_timeout, + test_cwd_unspecified, + test_cwd_custom, + test_cwd_relative, + test_cwd_absolute, + ]: + results[fn.__name__] = await check_test_fn(fn, sandbox_env) + + return results + + +async def _cleanup_file(sandbox_env: SandboxEnvironment, filename: str) -> None: + res = await sandbox_env.exec(["/usr/bin/rm", filename]) + assert res.success + + +async def test_read_and_write_file_text(sandbox_env: SandboxEnvironment) -> None: + await sandbox_env.write_file("test_read_and_write_file_text.file", "great #content") + written_file_string = await sandbox_env.read_file( + "test_read_and_write_file_text.file", text=True + ) + assert ( + "great #content" == written_file_string + ), f"unexpected content: [{written_file_string}]" + await _cleanup_file(sandbox_env, "test_read_and_write_file_text.file") + + +async def test_read_and_write_file_binary(sandbox_env: SandboxEnvironment) -> None: + await sandbox_env.write_file( + "test_read_and_write_file_binary.file", b"\xc3\x28" + ) # invalid UTF-8 from https://stackoverflow.com/a/17199164/116509 + + written_file_bytes = await sandbox_env.read_file( + "test_read_and_write_file_binary.file", text=False + ) + assert b"\xc3\x28" == written_file_bytes + await _cleanup_file(sandbox_env, "test_read_and_write_file_binary.file") + + +async def test_read_and_write_file_including_directory_absolute( + sandbox_env: SandboxEnvironment, +) -> None: + file_name = "/tmp/test_rw_including_directory_absolute/test.file" + await sandbox_env.write_file(file_name, "absolutely enjoying being in a directory") + written_file_string = await sandbox_env.read_file(file_name, text=True) + assert "absolutely enjoying being in a directory" == written_file_string + await _cleanup_file(sandbox_env, file_name) + + +async def 
test_read_and_write_file_including_directory_relative( + sandbox_env: SandboxEnvironment, +) -> None: + file_name = "test_rw_including_directory_relative/test.file" + await sandbox_env.write_file(file_name, "relatively enjoying being in a directory") + written_file_string = await sandbox_env.read_file(file_name, text=True) + assert "relatively enjoying being in a directory" == written_file_string + await _cleanup_file(sandbox_env, file_name) + + +async def test_read_file_zero_length(sandbox_env: SandboxEnvironment) -> None: + await sandbox_env.exec(["touch", "zero_length_file.file"]) + zero_length = await sandbox_env.read_file("zero_length_file.file", text=True) + assert isinstance(zero_length, str) + assert zero_length == "" + + +async def test_read_file_not_found(sandbox_env: SandboxEnvironment) -> None: + file = "nonexistent" + with Raises(FileNotFoundError) as e_info: + await sandbox_env.read_file(file, text=True) + assert file in str(e_info.value) + + +async def test_read_file_not_allowed(sandbox_env: SandboxEnvironment) -> None: + file_name = "test_read_file_not_allowed.file" + await sandbox_env.write_file(file_name, "inaccessible #content") + await sandbox_env.exec(["chmod", "-r", file_name]) + with Raises(PermissionError) as e_info: + await sandbox_env.read_file(file_name, text=True) + assert file_name in str(e_info.value) + await _cleanup_file(sandbox_env, file_name) + + +async def test_read_file_is_directory(sandbox_env: SandboxEnvironment) -> None: + file = "/etc" + with Raises(IsADirectoryError) as e_info: + await sandbox_env.read_file(file, text=True) + assert "directory" in str(e_info.value) + + +async def test_read_file_nonsense_name( + sandbox_env: SandboxEnvironment, +) -> None: + file = "https:/en.wikipedia.org/wiki/Bart%C5%82omiej_Kasprzykowski" + with Raises(FileNotFoundError) as e_info: + await sandbox_env.read_file(file, text=True) + assert "wikipedia" in str(e_info.value) + + +async def test_write_file_zero_length(sandbox_env: SandboxEnvironment) -> None: + await sandbox_env.write_file("zero_length_file.file", "") + zero_length = await sandbox_env.read_file("zero_length_file.file", text=True) + assert isinstance(zero_length, str) + assert zero_length == "" + + +async def test_write_file_is_directory( + sandbox_env: SandboxEnvironment, +) -> None: + # ensure /tmp/directory exists + await sandbox_env.write_file( + "/tmp/inspect_ai_test_write_file_is_directory/file", "unused content" + ) + with Raises(IsADirectoryError) as e_info: + await sandbox_env.write_file( + "/tmp/inspect_ai_test_write_file_is_directory", + "content cannot go in a directory, dummy", + ) + assert "directory" in str(e_info.value) + + +async def test_write_file_without_permissions( + sandbox_env: SandboxEnvironment, +) -> None: + file_name = "test_write_file_without_permissions.file" + await sandbox_env.write_file(file_name, "impervious #content") + await sandbox_env.exec(["chmod", "-w", file_name]) + with Raises(PermissionError) as e_info: + await sandbox_env.write_file(file_name, "this won't stick") + assert file_name in str(e_info.value) + + +async def test_exec_timeout(sandbox_env: SandboxEnvironment) -> None: + with Raises(TimeoutError): + await sandbox_env.exec(["sleep", "2"], timeout=1) + + +async def test_cwd_unspecified(sandbox_env: SandboxEnvironment) -> None: + file_name = "test_cwd_unspecified.file" + await sandbox_env.write_file(file_name, "ls me plz") + current_dir_contents = (await sandbox_env.exec(["/usr/bin/ls", "-1"])).stdout + assert file_name in current_dir_contents + await 
_cleanup_file(sandbox_env, file_name) + + +async def test_cwd_custom(sandbox_env: SandboxEnvironment) -> None: + current_dir_contents = (await sandbox_env.exec(["/usr/bin/ls"], cwd="/etc")).stdout + assert "passwd" in current_dir_contents + + +async def test_cwd_relative(sandbox_env: SandboxEnvironment) -> None: + cwd_subdirectory = "subdir" + await sandbox_env.exec(["mkdir", cwd_subdirectory]) + file_name = "test_cwd_relative.file" + file_path = cwd_subdirectory + "/" + file_name + await sandbox_env.write_file(file_path, "ls me plz") + current_dir_contents = ( + await sandbox_env.exec(["/usr/bin/ls"], cwd=cwd_subdirectory) + ).stdout + assert ( + file_name in current_dir_contents + ), f"{file_name} not found in {current_dir_contents}" + await _cleanup_file(sandbox_env, file_path) + + +async def test_cwd_absolute(sandbox_env: SandboxEnvironment) -> None: + cwd_directory = "/tmp/test_cwd_absolute" + await sandbox_env.exec(["mkdir", cwd_directory]) + file_name = "/tmp/test_cwd_absolute/test_cwd_absolute.file" + await sandbox_env.write_file(file_name, "ls me plz") + current_dir_contents = ( + await sandbox_env.exec(["/usr/bin/ls"], cwd=cwd_directory) + ).stdout + assert "test_cwd_absolute.file" in current_dir_contents + await _cleanup_file(sandbox_env, file_name) + + +# Generic type variable for exceptions +E = TypeVar("E", bound=BaseException) + + +class Raises(Generic[E]): + def __init__(self, expected_exception: Type[E]): + self.expected_exception = expected_exception + self.value: Optional[E] = None # Store the caught exception + + def __enter__(self) -> "Raises[E]": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[Any], + ) -> bool: + if exc_type is None: + raise AssertionError( + f"Expected exception {self.expected_exception.__name__} but no exception was raised." + ) + if not issubclass(exc_type, self.expected_exception): + raise AssertionError( + f"Expected exception {self.expected_exception.__name__}, but got {exc_type.__name__}." 
+            )
+        self.value = exc_value  # type: ignore
+        return True
diff --git a/tests/test_helpers/tools.py b/tests/test_helpers/tools.py
index 3c466c4c7..19623f643 100644
--- a/tests/test_helpers/tools.py
+++ b/tests/test_helpers/tools.py
@@ -34,10 +34,7 @@ async def execute(file: str):
         Returns:
             File contents
         """
-        try:
-            return await sandbox().read_file(file)
-        except FileNotFoundError:
-            raise ToolError(f"File {file} not found.")
+        return await sandbox().read_file(file)
 
     return execute
diff --git a/tests/tools/docker-compose-context/Dockerfile b/tests/tools/docker-compose-context/Dockerfile
new file mode 100644
index 000000000..9a32d483d
--- /dev/null
+++ b/tests/tools/docker-compose-context/Dockerfile
@@ -0,0 +1,2 @@
+FROM python:3.12-bookworm
+RUN useradd --create-home --uid 1111 nonroot
diff --git a/tests/tools/test_sandbox_compose.yaml b/tests/tools/test_sandbox_compose.yaml
new file mode 100644
index 000000000..818dd29aa
--- /dev/null
+++ b/tests/tools/test_sandbox_compose.yaml
@@ -0,0 +1,11 @@
+services:
+  default:
+    build:
+      context: docker-compose-context
+      dockerfile: Dockerfile
+    command: "tail -f /dev/null"
+    user: nonroot
+    working_dir: /home/nonroot
+    init: true
+    network_mode: none
+    stop_grace_period: 1s
\ No newline at end of file
diff --git a/tests/tools/test_sandbox_docker_and_local.py b/tests/tools/test_sandbox_docker_and_local.py
new file mode 100644
index 000000000..58ed0a43f
--- /dev/null
+++ b/tests/tools/test_sandbox_docker_and_local.py
@@ -0,0 +1,66 @@
+from pathlib import Path
+
+from inspect_ai.util._sandbox.docker.docker import DockerSandboxEnvironment
+from inspect_ai.util._sandbox.local import LocalSandboxEnvironment
+from inspect_ai.util._sandbox.self_check import self_check
+
+
+async def test_self_check_local(request) -> None:
+    task_name = f"{__name__}_{request.node.name}_local"
+
+    await LocalSandboxEnvironment.task_init(task_name=task_name, config=None)
+    envs_dict = await LocalSandboxEnvironment.sample_init(
+        task_name=task_name, config=None, metadata={}
+    )
+
+    return await check_results_of_self_check(task_name, envs_dict)
+
+
+async def test_self_check_docker_custom_nonroot(request) -> None:
+    task_name = f"{__name__}_{request.node.name}_docker_nonroot"
+
+    # The default docker-compose used in Inspect uses the root user in the container.
+    # The root user is allowed to overwrite files even if they're read-only.
+    # This breaks the semantics of the sandbox, so we use a non-root user for these tests.
+    config_file = str(Path(__file__) / ".." 
/ "test_sandbox_compose.yaml") + + await DockerSandboxEnvironment.task_init(task_name=task_name, config=config_file) + envs_dict = await DockerSandboxEnvironment.sample_init( + task_name=task_name, config=config_file, metadata={} + ) + + return await check_results_of_self_check(task_name, envs_dict) + + +async def test_self_check_docker_default_root(request) -> None: + task_name = f"{__name__}_{request.node.name}_docker_root" + + await DockerSandboxEnvironment.task_init(task_name=task_name, config=None) + envs_dict = await DockerSandboxEnvironment.sample_init( + task_name=task_name, config=None, metadata={} + ) + + return await check_results_of_self_check( + task_name, envs_dict, ["test_write_file_without_permissions"] + ) + + +async def check_results_of_self_check(task_name, envs_dict, known_failures=[]): + sandbox_env = envs_dict["default"] + + try: + self_check_results = await self_check(sandbox_env) + failures = [] + for test_name, result in self_check_results.items(): + if result is not True and test_name not in known_failures: + failures.append(f"Test {test_name} failed: {result}") + if failures: + assert False, "\n".join(failures) + finally: + await sandbox_env.sample_cleanup( + task_name=task_name, + config=None, + environments=envs_dict, + interrupted=False, + ) + await sandbox_env.task_cleanup(task_name=task_name, config=None, cleanup=True) diff --git a/tests/tools/test_tool_environment.py b/tests/tools/test_sandbox_tool_eval.py similarity index 100% rename from tests/tools/test_tool_environment.py rename to tests/tools/test_sandbox_tool_eval.py