swe_bench: misc mypy lint #453

Merged
merged 2 commits on Sep 19, 2024
40 changes: 25 additions & 15 deletions evals/swe_bench/swe_bench.py
@@ -14,17 +14,17 @@
from textwrap import dedent
from typing import Callable

from docker import DockerClient
from swebench.harness.constants import (
from docker import DockerClient # type: ignore
from swebench.harness.constants import ( # type: ignore
APPLY_PATCH_FAIL,
MAP_REPO_TO_INSTALL,
MAP_REPO_VERSION_TO_SPECS,
RESET_FAILED,
TESTS_ERROR,
TESTS_TIMEOUT,
)
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
from swebench.harness.utils import get_test_directives
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER # type: ignore
from swebench.harness.utils import get_test_directives # type: ignore

from inspect_ai import Task, task # noqa: E402
from inspect_ai.dataset import FieldSpec, Sample, hf_dataset
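The `# type: ignore` comments added to the docker and swebench imports silence mypy for packages that ship without type information. A narrower variant, sketched below on the assumption that missing stubs are the only problem and that the mypy in use understands the `import-untyped` error code, keeps other diagnostics on those lines visible:

```python
# Hypothetical narrower suppression: only the missing-stubs diagnostic is ignored,
# so mypy can still report genuine type errors on these lines.
from docker import DockerClient  # type: ignore[import-untyped]
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER  # type: ignore[import-untyped]
```

A project-wide alternative is to set `ignore_missing_imports` for these packages in the mypy configuration.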
@@ -37,6 +37,7 @@
)
from inspect_ai.tool import bash
from inspect_ai.util import sandbox
from inspect_ai.util._subprocess import ExecResult

getLogger().handlers = [] # Swe-bench adds a global logger, which we disable.

@@ -102,17 +103,21 @@ def swe_bench(

for sample in samples:
# Turn the saved strings into list objects
sample.metadata = sample.metadata or {}
sample.metadata["PASS_TO_PASS"] = json.loads(sample.metadata["PASS_TO_PASS"])
sample.metadata["FAIL_TO_PASS"] = json.loads(sample.metadata["FAIL_TO_PASS"])

if filter:
samples = samples.filter(filter)

for sample in samples:
sample.metadata = sample.metadata or {}
sample.input = INPUT_PROMPT.format(issue_text=sample.input)
sample.sandbox = (
"docker",
get_compose_file(sample.metadata["environment_setup_commit"], sample.id),
get_compose_file(
sample.metadata["environment_setup_commit"], str(sample.id)
),
)
sample.setup = get_setup_script(
sample.metadata["repo"],
@@ -131,6 +136,7 @@

def get_setup_script(repo: str, version: str, base_commit: str) -> str:
"""Create a list of bash commands to set up the repository for testing. These are ran at the start of the sample, clone the repository, and do some extra repository-specific installation steps over and above what is in the environment images."""
newline = "\n"
setup_script = dedent(
f"""#!/bin/bash
set -euo pipefail -x
@@ -148,7 +154,7 @@ def get_setup_script(repo: str, version: str, base_commit: str) -> str:

# We then do any repo-specific install scripts
{MAP_REPO_TO_INSTALL.get(repo,"")}
{'\n'.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
{newline.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
{MAP_REPO_VERSION_TO_SPECS[repo][version].get('install','')}
"""
)
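Hoisting the separator into a `newline` variable works around a pre-3.12 f-string restriction: until PEP 701 (Python 3.12), a backslash escape such as `'\n'` cannot appear inside the expression part of an f-string, so `{'\n'.join(...)}` is a syntax error on older interpreters and trips some linters. A small self-contained illustration with made-up pre-install commands:

```python
# Runs on Python < 3.12 because the backslash lives outside the f-string braces.
pre_install = ["apt-get update", "apt-get install -y make"]  # illustrative values

newline = "\n"
script = f"""#!/bin/bash
{newline.join(pre_install)}
"""
print(script)
```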
@@ -181,9 +187,10 @@ def get_eval_script(test_patch: str, repo: str, version: str, base_commit: str)
test_patch_files = re.findall(r"--- a/(.*)", test_patch)

# Find all the files which contain tests. Ugly interface is due to swebench
test_files = get_test_directives({"repo": repo, "test_patch": test_patch}) # type: ignore
test_files = get_test_directives({"repo": repo, "test_patch": test_patch})

# Reset test files to the state they should be in before the patch.
newline = "\n"
eval_script = dedent(
f"""#!/bin/bash
set -uo pipefail -x
@@ -196,7 +203,7 @@
set -x

#We run all of the repo-specific setup commands (If any exist)
{"\n".join(repo_specific_setup_command)}
{newline.join(repo_specific_setup_command)}

#We make sure we're back in the correct cwd and environment, in case repo setup caused issues.
cd {repo_directory}
@@ -253,8 +260,11 @@ async def scorer(state: TaskState, target: Target) -> Score:
try:
agent_patch = await sandbox().exec(["bash", "-c", GET_AGENT_PATCH])
except UnicodeDecodeError:
agent_patch = (
"Agent patch could not be decoded due to having a binary input."
agent_patch = ExecResult(
True,
0,
"Agent patch could not be decoded due to having a binary input.",
"",
)

# Run the evaluation script
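Building an `ExecResult` for the decode-failure fallback keeps `agent_patch` a single type, so the later code that reads the exec result does not need a union of `ExecResult` and `str`. The sketch below assumes the positional field order implied by the call in this diff (success, return code, stdout, stderr) rather than documented API:

```python
# Sketch of the fallback object; the field order (success, returncode, stdout,
# stderr) is inferred from the call above, not taken from documentation.
from inspect_ai.util._subprocess import ExecResult

fallback = ExecResult(
    True,   # success: treat the fallback as a completed exec
    0,      # returncode
    "Agent patch could not be decoded due to having a binary input.",  # stdout
    "",     # stderr
)
print(fallback.stdout)
```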
@@ -334,10 +344,10 @@ def swebench_baseline_scorer(path_to_baseline: str, name: str | None = None) ->
def _swebench_baseline_scorer() -> Scorer:
async def scorer(state: TaskState, target: Target) -> Score:
if state.sample_id in results_per_instance_id:
results = results_per_instance_id[state.sample_id]
results = results_per_instance_id[str(state.sample_id)]
return Score(
value=results["resolved"],
explanation=f"Model Patch:\n\n {results["patch"]}",
explanation=f"Model Patch:\n\n {results['patch']}",
)
else:
return Score(
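The switch from `results["patch"]` to `results['patch']` inside the f-string follows the same pre-3.12 grammar rule as the `newline` change above: reusing the outer quote character inside a replacement field only became legal with PEP 701 in Python 3.12. A tiny illustration with assumed result values:

```python
# Legal on Python < 3.12 because the inner quotes differ from the outer ones.
results = {"resolved": True, "patch": "diff --git a/setup.py b/setup.py"}
explanation = f"Model Patch:\n\n {results['patch']}"
print(explanation)
```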
@@ -404,13 +414,13 @@ def get_compose_file(environment_commit_id: Sample, instance_id: str) -> str:
)

# If the image is found, we can now create the compose file.
compose_file_path = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml"
with compose_file_path.open(mode="w+") as f:
image_compose_file = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml"
with image_compose_file.open(mode="w+") as f:
f.write(f"""services:
default:
image: {environment_image_name}
command: "sleep infinity"
working_dir: /testbed
x-local: true""")

return str(compose_file_path)
return str(image_compose_file)
24 changes: 12 additions & 12 deletions evals/swe_bench/test_swe_bench.py
@@ -3,7 +3,7 @@
from uuid import uuid1

from build_images import build_images
from datasets import load_dataset
from datasets import load_dataset # type: ignore
from swe_bench import swe_bench, swebench_baseline_scorer

from inspect_ai import Task, eval
@@ -25,7 +25,7 @@

if not Path(SLOW_TEST_DATASET).exists():
raise FileNotFoundError(
f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / "tests"/"create_test_repos.py").absolute()} to run the tests."
f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / 'tests'/'create_test_repos.py').absolute()} to run the tests."
)

if not Path(SWEAGENT_BASELINE).exists():
@@ -78,7 +78,7 @@ async def _apply_patch_solver(state: TaskState, generate: Generate) -> TaskState
@solver
def delete_readme_solver():
async def _delete_readme_solver(state: TaskState, generate: Generate) -> TaskState:
sandbox().exec(["rm", "/testbed/README.md"])
await sandbox().exec(["rm", "/testbed/README.md"])
return state

return _delete_readme_solver
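The added `await` in `delete_readme_solver` is more than a lint fix: `sandbox().exec(...)` returns a coroutine, and without `await` the `rm` command never actually runs; Python only emits a "coroutine was never awaited" warning. A minimal stand-in, with a fake `exec_cmd` in place of the real sandbox call:

```python
# Stand-in demo (exec_cmd is a fake replacing the real sandbox call): the command
# only executes once the coroutine is awaited.
import asyncio

async def exec_cmd(cmd: list[str]) -> str:
    await asyncio.sleep(0)  # pretend round trip to the sandbox
    return f"ran: {' '.join(cmd)}"

async def main() -> None:
    print(await exec_cmd(["rm", "/testbed/README.md"]))  # awaited, so it runs

asyncio.run(main())
```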
@@ -95,21 +95,20 @@ def test_correct_patch_succeeds() -> None:
result = eval(test_task, "mockllm/model", max_messages=4, debug_errors=True)[0]

assert (
result.results.scores[0].metrics["mean"].value == 1.0
result.results and result.results.scores[0].metrics["mean"].value == 1.0
), "SWE-bench should mark a correct application successfully."


def test_incorrect_patch_fails() -> None:
dataset = get_dataset_single_instance(GOLDEN_PATCH_TEST_ID)
build_swebench_images(dataset, "train")

test_task = swe_bench(dataset, "train")
test_task.plan = Plan([delete_readme_solver()])
test_task = swe_bench(dataset, "train", solver=delete_readme_solver())

result = eval(test_task, "mockllm/model", max_messages=2, debug_errors=True)[0]

assert (
result.results.scores[0].metrics["mean"].value == 0.0
result.results and result.results.scores[0].metrics["mean"].value == 0.0
), "SWE-bench should mark an incorrect application as a failure."


@@ -147,7 +146,7 @@ def test_same_scores_for_swe_agent() -> None:
"mockllm/model",
max_tasks=MAX_CONCURRENCY,
max_samples=MAX_CONCURRENCY,
max_subprocess=MAX_CONCURRENCY,
max_subprocesses=MAX_CONCURRENCY,
fail_on_error=False,
)

@@ -165,11 +164,12 @@ def test_same_scores_for_swe_agent() -> None:
error_str += f"Error occurred while evaluating task. Error:\n\n {sample.error}"
continue

score = result.samples[0].scores["swebench_scorer"]
swe_agent_score = result.samples[0].scores["sweagent_baseline"]
if result.samples[0].scores:
score = result.samples[0].scores["swebench_scorer"]
swe_agent_score = result.samples[0].scores["sweagent_baseline"]

if score.value != swe_agent_score.value:
error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}"
if score.value != swe_agent_score.value:
error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}"

assert error_str == "", error_str
