swe_bench: misc mypy lint #453

Merged
merged 2 commits on Sep 19, 2024
40 changes: 25 additions & 15 deletions evals/swe_bench/swe_bench.py
@@ -14,17 +14,17 @@
from textwrap import dedent
from typing import Callable

from docker import DockerClient
from swebench.harness.constants import (
from docker import DockerClient # type: ignore
from swebench.harness.constants import ( # type: ignore
APPLY_PATCH_FAIL,
MAP_REPO_TO_INSTALL,
MAP_REPO_VERSION_TO_SPECS,
RESET_FAILED,
TESTS_ERROR,
TESTS_TIMEOUT,
)
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
from swebench.harness.utils import get_test_directives
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER # type: ignore
from swebench.harness.utils import get_test_directives # type: ignore

from inspect_ai import Task, task # noqa: E402
from inspect_ai.dataset import FieldSpec, Sample, hf_dataset
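The `# type: ignore` comments added to the docker and swebench imports silence mypy for packages that ship without type information. A narrower variant, sketched below on the assumption that missing stubs are the only problem and that the mypy in use understands the `import-untyped` error code, keeps other diagnostics on those lines visible:

```python
# Hypothetical narrower suppression: only the missing-stubs diagnostic is ignored,
# so mypy can still report genuine type errors on these lines.
from docker import DockerClient  # type: ignore[import-untyped]
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER  # type: ignore[import-untyped]
```

A project-wide alternative is to set `ignore_missing_imports` for these packages in the mypy configuration.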
@@ -37,6 +37,7 @@
)
from inspect_ai.tool import bash
from inspect_ai.util import sandbox
from inspect_ai.util._subprocess import ExecResult

getLogger().handlers = [] # Swe-bench adds a global logger, which we disable.

@@ -102,17 +103,21 @@ def swe_bench(

for sample in samples:
# Turn the saved strings into list objects
sample.metadata = sample.metadata or {}
sample.metadata["PASS_TO_PASS"] = json.loads(sample.metadata["PASS_TO_PASS"])
sample.metadata["FAIL_TO_PASS"] = json.loads(sample.metadata["FAIL_TO_PASS"])

if filter:
samples = samples.filter(filter)

for sample in samples:
sample.metadata = sample.metadata or {}
sample.input = INPUT_PROMPT.format(issue_text=sample.input)
sample.sandbox = (
"docker",
get_compose_file(sample.metadata["environment_setup_commit"], sample.id),
get_compose_file(
sample.metadata["environment_setup_commit"], str(sample.id)
),
)
sample.setup = get_setup_script(
sample.metadata["repo"],
@@ -131,6 +136,7 @@

def get_setup_script(repo: str, version: str, base_commit: str) -> str:
"""Create a list of bash commands to set up the repository for testing. These are ran at the start of the sample, clone the repository, and do some extra repository-specific installation steps over and above what is in the environment images."""
newline = "\n"
setup_script = dedent(
f"""#!/bin/bash
set -euo pipefail -x
@@ -148,7 +154,7 @@ def get_setup_script(repo: str, version: str, base_commit: str) -> str:

# We then do any repo-specific install scripts
{MAP_REPO_TO_INSTALL.get(repo,"")}
{'\n'.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
{newline.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
{MAP_REPO_VERSION_TO_SPECS[repo][version].get('install','')}
"""
)
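Hoisting the separator into a `newline` variable works around a pre-3.12 f-string restriction: until PEP 701 (Python 3.12), a backslash escape such as `'\n'` cannot appear inside the expression part of an f-string, so `{'\n'.join(...)}` is a syntax error on older interpreters and trips some linters. A small self-contained illustration with made-up pre-install commands:

```python
# Runs on Python < 3.12 because the backslash lives outside the f-string braces.
pre_install = ["apt-get update", "apt-get install -y make"]  # illustrative values

newline = "\n"
script = f"""#!/bin/bash
{newline.join(pre_install)}
"""
print(script)
```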
@@ -181,9 +187,10 @@ def get_eval_script(test_patch: str, repo: str, version: str, base_commit: str)
test_patch_files = re.findall(r"--- a/(.*)", test_patch)

# Find all the files which contain tests. Ugly interface is due to swebench
test_files = get_test_directives({"repo": repo, "test_patch": test_patch}) # type: ignore
test_files = get_test_directives({"repo": repo, "test_patch": test_patch})

# Reset test files to the state they should be in before the patch.
newline = "\n"
eval_script = dedent(
f"""#!/bin/bash
set -uo pipefail -x
@@ -196,7 +203,7 @@
set -x

#We run all of the repo-specific setup commands (If any exist)
{"\n".join(repo_specific_setup_command)}
{newline.join(repo_specific_setup_command)}

#We make sure we're back in the correct cwd and environment, in case repo setup caused issues.
cd {repo_directory}
@@ -253,8 +260,11 @@ async def scorer(state: TaskState, target: Target) -> Score:
try:
agent_patch = await sandbox().exec(["bash", "-c", GET_AGENT_PATCH])
except UnicodeDecodeError:
agent_patch = (
"Agent patch could not be decoded due to having a binary input."
agent_patch = ExecResult(
True,
0,
"Agent patch could not be decoded due to having a binary input.",
"",
)

# Run the evaluation script
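Building an `ExecResult` for the decode-failure fallback keeps `agent_patch` a single type, so the later code that reads the exec result does not need a union of `ExecResult` and `str`. The sketch below assumes the positional field order implied by the call in this diff (success, return code, stdout, stderr) rather than documented API:

```python
# Sketch of the fallback object; the field order (success, returncode, stdout,
# stderr) is inferred from the call above, not taken from documentation.
from inspect_ai.util._subprocess import ExecResult

fallback = ExecResult(
    True,   # success: treat the fallback as a completed exec
    0,      # returncode
    "Agent patch could not be decoded due to having a binary input.",  # stdout
    "",     # stderr
)
print(fallback.stdout)
```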
@@ -334,10 +344,10 @@ def swebench_baseline_scorer(path_to_baseline: str, name: str | None = None) ->
def _swebench_baseline_scorer() -> Scorer:
async def scorer(state: TaskState, target: Target) -> Score:
if state.sample_id in results_per_instance_id:
results = results_per_instance_id[state.sample_id]
results = results_per_instance_id[str(state.sample_id)]
return Score(
value=results["resolved"],
explanation=f"Model Patch:\n\n {results["patch"]}",
explanation=f"Model Patch:\n\n {results['patch']}",
)
else:
return Score(
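The switch from `results["patch"]` to `results['patch']` inside the f-string follows the same pre-3.12 grammar rule as the `newline` change above: reusing the outer quote character inside a replacement field only became legal with PEP 701 in Python 3.12. A tiny illustration with assumed result values:

```python
# Legal on Python < 3.12 because the inner quotes differ from the outer ones.
results = {"resolved": True, "patch": "diff --git a/setup.py b/setup.py"}
explanation = f"Model Patch:\n\n {results['patch']}"
print(explanation)
```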
@@ -404,13 +414,13 @@ def get_compose_file(environment_commit_id: Sample, instance_id: str) -> str:
)

# If the image is found, we can now create the compose file.
compose_file_path = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml"
with compose_file_path.open(mode="w+") as f:
image_compose_file = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml"
with image_compose_file.open(mode="w+") as f:
f.write(f"""services:
default:
image: {environment_image_name}
command: "sleep infinity"
working_dir: /testbed
x-local: true""")

return str(compose_file_path)
return str(image_compose_file)
24 changes: 12 additions & 12 deletions evals/swe_bench/test_swe_bench.py
@@ -3,7 +3,7 @@
from uuid import uuid1

from build_images import build_images
from datasets import load_dataset
from datasets import load_dataset # type: ignore
from swe_bench import swe_bench, swebench_baseline_scorer

from inspect_ai import Task, eval
@@ -25,7 +25,7 @@

if not Path(SLOW_TEST_DATASET).exists():
raise FileNotFoundError(
f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / "tests"/"create_test_repos.py").absolute()} to run the tests."
f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / 'tests'/'create_test_repos.py').absolute()} to run the tests."
)

if not Path(SWEAGENT_BASELINE).exists():
@@ -78,7 +78,7 @@ async def _apply_patch_solver(state: TaskState, generate: Generate) -> TaskState
@solver
def delete_readme_solver():
async def _delete_readme_solver(state: TaskState, generate: Generate) -> TaskState:
sandbox().exec(["rm", "/testbed/README.md"])
await sandbox().exec(["rm", "/testbed/README.md"])
return state

return _delete_readme_solver
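The added `await` in `delete_readme_solver` is more than a lint fix: `sandbox().exec(...)` returns a coroutine, and without `await` the `rm` command never actually runs; Python only emits a "coroutine was never awaited" warning. A minimal stand-in, with a fake `exec_cmd` in place of the real sandbox call:

```python
# Stand-in demo (exec_cmd is a fake replacing the real sandbox call): the command
# only executes once the coroutine is awaited.
import asyncio

async def exec_cmd(cmd: list[str]) -> str:
    await asyncio.sleep(0)  # pretend round trip to the sandbox
    return f"ran: {' '.join(cmd)}"

async def main() -> None:
    print(await exec_cmd(["rm", "/testbed/README.md"]))  # awaited, so it runs

asyncio.run(main())
```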
@@ -95,21 +95,20 @@ def test_correct_patch_succeeds() -> None:
result = eval(test_task, "mockllm/model", max_messages=4, debug_errors=True)[0]

assert (
result.results.scores[0].metrics["mean"].value == 1.0
result.results and result.results.scores[0].metrics["mean"].value == 1.0
), "SWE-bench should mark a correct application successfully."


def test_incorrect_patch_fails() -> None:
dataset = get_dataset_single_instance(GOLDEN_PATCH_TEST_ID)
build_swebench_images(dataset, "train")

test_task = swe_bench(dataset, "train")
test_task.plan = Plan([delete_readme_solver()])
test_task = swe_bench(dataset, "train", solver=delete_readme_solver())

result = eval(test_task, "mockllm/model", max_messages=2, debug_errors=True)[0]

assert (
result.results.scores[0].metrics["mean"].value == 0.0
result.results and result.results.scores[0].metrics["mean"].value == 0.0
), "SWE-bench should mark an incorrect application as a failure."


@@ -147,7 +146,7 @@ def test_same_scores_for_swe_agent() -> None:
"mockllm/model",
max_tasks=MAX_CONCURRENCY,
max_samples=MAX_CONCURRENCY,
max_subprocess=MAX_CONCURRENCY,
max_subprocesses=MAX_CONCURRENCY,
fail_on_error=False,
)

@@ -165,11 +164,12 @@ def test_same_scores_for_swe_agent() -> None:
error_str += f"Error occurred while evaluating task. Error:\n\n {sample.error}"
continue

score = result.samples[0].scores["swebench_scorer"]
swe_agent_score = result.samples[0].scores["sweagent_baseline"]
if result.samples[0].scores:
score = result.samples[0].scores["swebench_scorer"]
swe_agent_score = result.samples[0].scores["sweagent_baseline"]

if score.value != swe_agent_score.value:
error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}"
if score.value != swe_agent_score.value:
error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}"

assert error_str == "", error_str
