swe_bench: misc mypy lint (#453)
* swe bench mypy cleanup

* test_swe_bench mypy
jjallaire authored Sep 19, 2024
1 parent 25bd6bf commit 1913a04
Showing 2 changed files with 37 additions and 27 deletions.
40 changes: 25 additions & 15 deletions evals/swe_bench/swe_bench.py
@@ -14,17 +14,17 @@
from textwrap import dedent
from typing import Callable

from docker import DockerClient
from swebench.harness.constants import (
from docker import DockerClient # type: ignore
from swebench.harness.constants import ( # type: ignore
APPLY_PATCH_FAIL,
MAP_REPO_TO_INSTALL,
MAP_REPO_VERSION_TO_SPECS,
RESET_FAILED,
TESTS_ERROR,
TESTS_TIMEOUT,
)
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
from swebench.harness.utils import get_test_directives
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER # type: ignore
from swebench.harness.utils import get_test_directives # type: ignore

from inspect_ai import Task, task # noqa: E402
from inspect_ai.dataset import FieldSpec, Sample, hf_dataset
@@ -37,6 +37,7 @@
)
from inspect_ai.tool import bash
from inspect_ai.util import sandbox
from inspect_ai.util._subprocess import ExecResult

getLogger().handlers = [] # Swe-bench adds a global logger, which we disable.
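The import changes in this hunk append "# type: ignore", the per-line way of telling mypy to skip third-party packages (docker, swebench) that ship without type stubs. A minimal sketch of the pattern, kept hypothetical and self-contained so it runs on its own:

# Sketch of the per-line suppression pattern used in the hunk above: appending
# "# type: ignore" silences mypy only for that import line. Typed modules
# (stdlib, or packages shipping a py.typed marker) need no suppression.
import json  # typed: no suppression needed

# Untyped third-party imports, as in the diff above (commented out here so the
# sketch runs without the packages installed):
# from docker import DockerClient  # type: ignore
# from swebench.harness.utils import get_test_directives  # type: ignore

print(json.dumps({"pattern": "# type: ignore on the import line"}))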

@@ -102,17 +103,21 @@ def swe_bench(

for sample in samples:
# Turn the saved strings into list objects
sample.metadata = sample.metadata or {}
sample.metadata["PASS_TO_PASS"] = json.loads(sample.metadata["PASS_TO_PASS"])
sample.metadata["FAIL_TO_PASS"] = json.loads(sample.metadata["FAIL_TO_PASS"])

if filter:
samples = samples.filter(filter)

for sample in samples:
sample.metadata = sample.metadata or {}
sample.input = INPUT_PROMPT.format(issue_text=sample.input)
sample.sandbox = (
"docker",
get_compose_file(sample.metadata["environment_setup_commit"], sample.id),
get_compose_file(
sample.metadata["environment_setup_commit"], str(sample.id)
),
)
sample.setup = get_setup_script(
sample.metadata["repo"],
@@ -131,6 +136,7 @@

def get_setup_script(repo: str, version: str, base_commit: str) -> str:
"""Create a list of bash commands to set up the repository for testing. These are ran at the start of the sample, clone the repository, and do some extra repository-specific installation steps over and above what is in the environment images."""
newline = "\n"
setup_script = dedent(
f"""#!/bin/bash
set -euo pipefail -x
@@ -148,7 +154,7 @@ def get_setup_script(repo: str, version: str, base_commit: str) -> str:
# We then do any repo-specific install scripts
{MAP_REPO_TO_INSTALL.get(repo,"")}
{'\n'.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
{newline.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
{MAP_REPO_VERSION_TO_SPECS[repo][version].get('install','')}
"""
)
@@ -181,9 +187,10 @@ def get_eval_script(test_patch: str, repo: str, version: str, base_commit: str)
test_patch_files = re.findall(r"--- a/(.*)", test_patch)

# Find all the files which contain tests. Ugly interface is due to swebench
test_files = get_test_directives({"repo": repo, "test_patch": test_patch}) # type: ignore
test_files = get_test_directives({"repo": repo, "test_patch": test_patch})

# Reset test files to the state they should be in before the patch.
newline = "\n"
eval_script = dedent(
f"""#!/bin/bash
set -uo pipefail -x
@@ -196,7 +203,7 @@ def get_eval_script(test_patch: str, repo: str, version: str, base_commit: str)
set -x
#We run all of the repo-specific setup commands (If any exist)
{"\n".join(repo_specific_setup_command)}
{newline.join(repo_specific_setup_command)}
#We make sure we're back in the correct cwd and environment, in case repo setup caused issues.
cd {repo_directory}
@@ -253,8 +260,11 @@ async def scorer(state: TaskState, target: Target) -> Score:
try:
agent_patch = await sandbox().exec(["bash", "-c", GET_AGENT_PATCH])
except UnicodeDecodeError:
agent_patch = (
"Agent patch could not be decoded due to having a binary input."
agent_patch = ExecResult(
True,
0,
"Agent patch could not be decoded due to having a binary input.",
"",
)

# Run the evaluation script
@@ -334,10 +344,10 @@ def swebench_baseline_scorer(path_to_baseline: str, name: str | None = None) ->
def _swebench_baseline_scorer() -> Scorer:
async def scorer(state: TaskState, target: Target) -> Score:
if state.sample_id in results_per_instance_id:
results = results_per_instance_id[state.sample_id]
results = results_per_instance_id[str(state.sample_id)]
return Score(
value=results["resolved"],
explanation=f"Model Patch:\n\n {results["patch"]}",
explanation=f"Model Patch:\n\n {results['patch']}",
)
else:
return Score(
@@ -404,13 +414,13 @@ def get_compose_file(environment_commit_id: Sample, instance_id: str) -> str:
)

# If the image is found, we can now create the compose file.
compose_file_path = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml"
with compose_file_path.open(mode="w+") as f:
image_compose_file = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml"
with image_compose_file.open(mode="w+") as f:
f.write(f"""services:
default:
image: {environment_image_name}
command: "sleep infinity"
working_dir: /testbed
x-local: true""")

return str(compose_file_path)
return str(image_compose_file)
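The two newline = "\n" assignments added above work around the restriction, lifted in Python 3.12, that f-string expressions may not contain backslashes: the join string is hoisted into a variable instead of being written inline. A small self-contained sketch of the same pattern, simplified and using illustrative shell commands rather than the real SWE-bench spec tables:

def render_setup_script(pre_install: list[str]) -> str:
    # Before Python 3.12, backslashes were not allowed inside f-string
    # expressions, so "\n" is hoisted into a variable rather than written
    # directly inside the braces.
    newline = "\n"
    script = f"""#!/bin/bash
set -euo pipefail -x
{newline.join(pre_install)}
"""
    return script


if __name__ == "__main__":
    # Illustrative commands only; the real values come from the SWE-bench
    # MAP_REPO_VERSION_TO_SPECS tables.
    print(render_setup_script(["apt-get update -y", "pip install -e ."]))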
24 changes: 12 additions & 12 deletions evals/swe_bench/test_swe_bench.py
@@ -3,7 +3,7 @@
from uuid import uuid1

from build_images import build_images
from datasets import load_dataset
from datasets import load_dataset # type: ignore
from swe_bench import swe_bench, swebench_baseline_scorer

from inspect_ai import Task, eval
@@ -25,7 +25,7 @@

if not Path(SLOW_TEST_DATASET).exists():
raise FileNotFoundError(
f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / "tests"/"create_test_repos.py").absolute()} to run the tests."
f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / 'tests'/'create_test_repos.py').absolute()} to run the tests."
)

if not Path(SWEAGENT_BASELINE).exists():
@@ -78,7 +78,7 @@ async def _apply_patch_solver(state: TaskState, generate: Generate) -> TaskState
@solver
def delete_readme_solver():
async def _delete_readme_solver(state: TaskState, generate: Generate) -> TaskState:
sandbox().exec(["rm", "/testbed/README.md"])
await sandbox().exec(["rm", "/testbed/README.md"])
return state

return _delete_readme_solver
@@ -95,21 +95,20 @@ def test_correct_patch_succeeds() -> None:
result = eval(test_task, "mockllm/model", max_messages=4, debug_errors=True)[0]

assert (
result.results.scores[0].metrics["mean"].value == 1.0
result.results and result.results.scores[0].metrics["mean"].value == 1.0
), "SWE-bench should mark a correct application successfully."


def test_incorrect_patch_fails() -> None:
dataset = get_dataset_single_instance(GOLDEN_PATCH_TEST_ID)
build_swebench_images(dataset, "train")

test_task = swe_bench(dataset, "train")
test_task.plan = Plan([delete_readme_solver()])
test_task = swe_bench(dataset, "train", solver=delete_readme_solver())

result = eval(test_task, "mockllm/model", max_messages=2, debug_errors=True)[0]

assert (
result.results.scores[0].metrics["mean"].value == 0.0
result.results and result.results.scores[0].metrics["mean"].value == 0.0
), "SWE-bench should mark an incorrect application as a failure."


@@ -147,7 +146,7 @@ def test_same_scores_for_swe_agent() -> None:
"mockllm/model",
max_tasks=MAX_CONCURRENCY,
max_samples=MAX_CONCURRENCY,
max_subprocess=MAX_CONCURRENCY,
max_subprocesses=MAX_CONCURRENCY,
fail_on_error=False,
)

@@ -165,11 +164,12 @@ def test_same_scores_for_swe_agent() -> None:
error_str += f"Error occurred while evaluating task. Error:\n\n {sample.error}"
continue

score = result.samples[0].scores["swebench_scorer"]
swe_agent_score = result.samples[0].scores["sweagent_baseline"]
if result.samples[0].scores:
score = result.samples[0].scores["swebench_scorer"]
swe_agent_score = result.samples[0].scores["sweagent_baseline"]

if score.value != swe_agent_score.value:
error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}"
if score.value != swe_agent_score.value:
error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}"

assert error_str == "", error_str
