diff --git a/benchmarks/swe_bench/.gitignore b/benchmarks/swe_bench/.gitignore
new file mode 100644
index 000000000..6867e3fdc
--- /dev/null
+++ b/benchmarks/swe_bench/.gitignore
@@ -0,0 +1,2 @@
+resources/compose_files/*
+resources/baselines/**/*
\ No newline at end of file
diff --git a/benchmarks/swe_bench/README.md b/benchmarks/swe_bench/README.md
new file mode 100644
index 000000000..f5938bed0
--- /dev/null
+++ b/benchmarks/swe_bench/README.md
@@ -0,0 +1,68 @@
+# SWE-bench
+This is an inspect-native implementation of [the SWE-bench dataset](https://www.swebench.com/).
+
+## Installation
+
+- **Install requirements.** In addition to the requirements provided by inspect, this benchmark has its own dependencies. You can install them in your virtual environment by running ```pip install -r requirements.txt```
+- **Build environment images.** SWE-bench requires a set of images with all the dependencies of each repository. To build them for the SWE-bench-verified split, run ```./build_images.py --dataset_path princeton-nlp/SWE-bench_Verified --split test```. See ```./build_images.py --help``` for all arguments. NOTE: Depending on the repositories contained in your subset of SWE-bench, this can take a while to run (several hours).
+
+## Usage
+
+### Running the benchmark
+The ```swe_bench.py``` file contains the ```swe_bench``` function, which creates an instance of a SWE-bench Task:
+
+```python
+from inspect_ai import eval
+from inspect_ai.solver import basic_agent, system_message
+from inspect_ai.tool import bash, python
+from swe_bench import swe_bench
+
+# Create an agent that only uses bash, and an agent that only uses python.
+agents = [
+    basic_agent(
+        init=system_message("Please solve the coding task below. Once you are done, use your submit tool."),
+        tools=[tool],
+    )
+    for tool in [bash(), python()]
+]
+
+# Create a swe-bench task for each of those agents.
+swebench_tasks = [swe_bench(dataset="princeton-nlp/SWE-bench_Verified", split="test", plan=agent) for agent in agents]
+
+# For the demonstration, we will select only the first 5 samples of each task.
+for swebench_task in swebench_tasks:
+    swebench_task.dataset = swebench_task.dataset[:5]
+
+# Compare how these two agents perform.
+eval(swebench_tasks, model="openai/gpt-4o")
+```
+
+NOTE: SWE-bench will take a while to run and uses a lot of tokens. If things are too slow, you should increase the level of parallelism - see https://inspect.ai-safety-institute.org.uk/parallelism.html. Note that running too many docker containers on your machine can also cause issues, most notably an 'ALL PREDEFINED ADDRESS POOLS HAVE BEEN FULLY SUBNETTED' error - we don't recommend running more than 32 containers at any one time.
+
+### Comparing to baselines
+Submissions to [the official SWE-bench leaderboard](https://www.swebench.com/) often come with a set of trajectories and per-instance results. This means that you can compare the performance of your agent not just on the full dataset, but also on subsets of that dataset. This is often useful when trying to run smaller, cheaper experiments. To make this comparison easy, we provide a scorer that will calculate the average performance of an existing baseline on whichever subset of instances you are evaluating:
+
+```python
+from inspect_ai import eval
+from swe_bench import swe_bench, swebench_scorer, swebench_baseline_scorer
+
+# Select only the subset where the patch is more than 20 lines long. Use the default plan.
+swebench_verified_task_subset = swe_bench(dataset="princeton-nlp/SWE-bench_Verified", split="test", filter=lambda x: len(x["patch"].split("\n")) > 20)
+
+# Report the Claude 3.5 Sonnet + SWE-agent baseline, as well as the performance of the agent. To download the baseline run ./resources/baselines/download_sweagent_baseline.sh
+swebench_verified_task_subset.scorer = [swebench_baseline_scorer("./resources/baselines/20240620_sweagent_claude3.5sonnet", name="sweagent_baseline"), swebench_scorer()]
+
+eval(swebench_verified_task_subset)
+```
+
+This will lead to both numbers being reported in the final output, allowing you to compare baselines:
+
+![SWE-bench baseline comparison](./resources/docs/swebench_comparison.jpeg)
diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
new file mode 100644
index 000000000..237ff9b69
--- /dev/null
+++ b/benchmarks/swe_bench/build_images.py
@@ -0,0 +1,111 @@
+"""This is a utility script which builds all of the environment images for the SWE-bench dataset. These images contain all of the dependencies required to run the tests in the dataset."""
+
+import argparse
+import json
+import os
+
+from datasets import load_dataset
+from docker.client import DockerClient
+from swe_bench import SAMPLE_TO_IMAGE_PATH
+from swebench.harness.docker_build import build_env_images
+from swebench.harness.test_spec import get_test_specs_from_dataset
+from swebench.harness.utils import load_swebench_dataset
+
+
+def build_images(
+    dataset_name: str,
+    split: str | None = None,
+    max_workers: int = 4,
+    force_rebuild: bool = False,
+) -> None:
+    """This function uses the swebench library to build docker images for the environments of the SWE-bench dataset. It also creates a mapping from the information contained in the dataset itself (in particular, the "instance_id" and "environment_setup_commit") to the names of the docker images. This mapping lets us find the images directly from the dataset entries, rather than relying on objects created in the swebench code."""
+    # Code copied from the swebench repository
+    docker_client = DockerClient.from_env()
+
+    swebench_dataset = load_swebench_dataset(dataset_name, split)
+    if not isinstance(swebench_dataset[0], dict):
+        raise ValueError(
+            f"After loading the dataset, the elements should be a dictionary. Got {type(swebench_dataset[0])} instead. Did you pass the correct dataset name and split? \n\nOutput of huggingface load_dataset: {load_dataset(dataset_name, split)}"
+        )
+
+    # We then use a couple of internal functions from swebench to build the environment images
+    test_specs = get_test_specs_from_dataset(swebench_dataset)
+    build_env_images(docker_client, swebench_dataset, force_rebuild, max_workers)
+
+    # We build a mapping from instance_ids and environment_commits to the names of the docker images. This is used to find the images in our main swe_bench code.
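+    # For illustration, the saved JSON has roughly this shape (the instance id, commit
+    # and image name below are hypothetical values):
+    #   {
+    #     "astropy__astropy-12907": {
+    #       "<environment_setup_commit>": {
+    #         "image_key": "sweb.env.x86_64.<hash>:latest",
+    #         "env_setup_script": "..."
+    #       }
+    #     }
+    #   }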
+ environment_name_mapping = ( + {} + if not os.path.exists(SAMPLE_TO_IMAGE_PATH) + else json.load(open(SAMPLE_TO_IMAGE_PATH)) + ) + + for spec, swebench_instance in zip(test_specs, swebench_dataset): + assert spec.env_image_key in [ + image_tag + for image in docker_client.images.list() + for image_tag in image.tags + ], f"Image {spec.env_image_key} not found in docker images" + # Check if entry already exists + if swebench_instance["instance_id"] not in environment_name_mapping: + environment_name_mapping[swebench_instance["instance_id"]] = {} + if ( + swebench_instance["environment_setup_commit"] + in environment_name_mapping[swebench_instance["instance_id"]] + ): + assert ( + environment_name_mapping[swebench_instance["instance_id"]][ + swebench_instance["environment_setup_commit"] + ]["image_key"] + == spec.env_image_key + ), f"Image {spec.env_image_key} already mapped to a different image" + else: + environment_name_mapping[swebench_instance["instance_id"]][ + swebench_instance["environment_setup_commit"] + ] = { + "image_key": spec.env_image_key, + "env_setup_script": "\n".join(spec.env_script_list), + } + + # Add the mappings to a file + json.dump(environment_name_mapping, open(SAMPLE_TO_IMAGE_PATH, "w+"), indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Build environment images for SWE-bench" + ) + + parser.add_argument( + "--dataset_path", + type=str, + help="Name of the dataset to build images for", + required=False, + default="princeton-nlp/SWE-bench_Verified", + ) + + parser.add_argument( + "--split", + type=str, + help="Split of the dataset to build images for", + required=False, + default="test", + ) + + parser.add_argument( + "--max_workers", + type=int, + help="Maximum number of workers to use for building images", + required=False, + default=4, + ) + + parser.add_argument( + "--force_rebuild", + action="store_true", + help="Force rebuild of images", + required=False, + default=False, + ) + + args = parser.parse_args() + build_images(args.dataset_path, args.split, args.max_workers, args.force_rebuild) diff --git a/benchmarks/swe_bench/requirements.txt b/benchmarks/swe_bench/requirements.txt new file mode 100644 index 000000000..a4a0ae18d --- /dev/null +++ b/benchmarks/swe_bench/requirements.txt @@ -0,0 +1,2 @@ +swebench +docker \ No newline at end of file diff --git a/benchmarks/swe_bench/resources/baselines/download_sweagent_baseline.sh b/benchmarks/swe_bench/resources/baselines/download_sweagent_baseline.sh new file mode 100644 index 000000000..6d4c1996b --- /dev/null +++ b/benchmarks/swe_bench/resources/baselines/download_sweagent_baseline.sh @@ -0,0 +1,5 @@ +# Download the swe-bench baselines. NOTE: THEY ARE QUITE LARGE. 
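+# This clones the swe-bench/experiments repository into /tmp and copies the
+# 20240620_sweagent_claude3.5sonnet results into the directory containing this script,
+# so it can be run from any working directory.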
+git clone https://github.com/swe-bench/experiments /tmp/swebench_baselines
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+# Copy to current directory
+cp -r /tmp/swebench_baselines/evaluation/verified/20240620_sweagent_claude3.5sonnet $SCRIPT_DIR
\ No newline at end of file
diff --git a/benchmarks/swe_bench/resources/docs/swebench_comparison.jpeg b/benchmarks/swe_bench/resources/docs/swebench_comparison.jpeg
new file mode 100644
index 000000000..9dcaf4f46
Binary files /dev/null and b/benchmarks/swe_bench/resources/docs/swebench_comparison.jpeg differ
diff --git a/benchmarks/swe_bench/resources/tests/create_test_repos.py b/benchmarks/swe_bench/resources/tests/create_test_repos.py
new file mode 100644
index 000000000..40d7d5428
--- /dev/null
+++ b/benchmarks/swe_bench/resources/tests/create_test_repos.py
@@ -0,0 +1,98 @@
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from datasets import load_dataset
+
+dataset = load_dataset("princeton-nlp/SWE-bench_Verified")["test"]
+
+# We create a subset of SWE-bench-verified which:
+# 1) Contains all of the repositories in SWE-bench verified
+# 2) For each repository, contains an example where swe_agent + Claude 3.5 Sonnet resolved the issue, and an example where it did not
+
+results_per_repo = {}
+baseline_dir = Path(__file__).parent.parent / "baselines"
+logs_dir = baseline_dir / "20240620_sweagent_claude3.5sonnet" / "logs"
+
+if not logs_dir.exists():
+    print(
+        f"Please run the baseline download script at {baseline_dir / 'download_sweagent_baseline.sh'} to create the baselines to compare your agents against."
+    )
+    exit()
+
+
+results = []
+missing_results = []
+
+# Load results from the log directory
+for result in os.listdir(logs_dir):
+    results_path = os.path.join(logs_dir, result, "report.json")
+    if os.path.exists(results_path):
+        with open(results_path, "r") as f:
+            result_dict = json.load(f)
+            result_name, results_value = next(iter(result_dict.items()))
+            output_dict = dict(instance_id=result_name, **results_value)
+            patch_path = os.path.join(logs_dir, result, "patch.diff")
+            with open(patch_path, "r") as f:
+                output_dict["swe_agent_patch"] = f.read()
+            results.append(output_dict)
+
+    else:
+        missing_results.append(result)
+
+# Get repository name from the results
+results = pd.DataFrame.from_records(results)
+results["repo"] = results["instance_id"].apply(lambda x: x.split("__")[0])
+
+# GitHub patches which change binary files cannot actually be applied. We will remove these entries from the dataset
+results = results[~results["swe_agent_patch"].str.contains("Binary files")]
+
+# Group by repository, and success. Then pick one from each group.
+results_per_repo = results.groupby(["repo", "resolved"])
+results_per_repo = results_per_repo.apply(lambda x: x.sample(1)).reset_index(drop=True)
+
+
+# Filter dataset by those instance ids, and add a "resolved_by_swe_agent" column.
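+# (instance_ids and resolved are parallel arrays taken from the sampled results, so the
+# boolean mask `instance_ids == x["instance_id"]` used below looks up the swe-agent
+# outcome for each dataset sample.)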
+instance_ids = results_per_repo["instance_id"].values +resolved = results_per_repo["resolved"].values + +dataset = dataset.filter(lambda x: x["instance_id"] in instance_ids) +dataset = dataset.map( + lambda x: dict( + x, resolved_by_swe_agent=resolved[instance_ids == x["instance_id"]][0] + ), + num_proc=4, +) +# Add swe-agent-patch +dataset = dataset.map( + lambda x: dict( + x, + swe_agent_patch=results_per_repo[ + results_per_repo["instance_id"] == x["instance_id"] + ]["swe_agent_patch"].values[0], + ), + num_proc=4, +) +# Add resolved column +dataset = dataset.map( + lambda x: dict(x, resolved=resolved[instance_ids == x["instance_id"]][0]), + num_proc=4, +) + +# This repo is bugged for testing, as the setup script edits files, breaking the patch from swe-agent. +dataset = dataset.filter(lambda x: "sphinx" not in x["instance_id"]) + +# psf__requests-1921 is flakey at high levels of concurrency, so we remove it as well +dataset = dataset.filter(lambda x: "psf__requests-1921" not in x["instance_id"]) + + +# Calculate the accuracy. Should be 0.42857142857142855 +accuracy = sum(resolved) / len(resolved) + +# Save tbe dataset +dataset_dir = Path(__file__).parent / "all_repos_swe_agent_50_percent.hf" +os.makedirs(str(dataset_dir), exist_ok=True) +dataset.to_parquet(dataset_dir / "dataset.parquet") + +print(f"Saved dataset to {dataset_dir}, accuracy {accuracy}") diff --git a/benchmarks/swe_bench/swe_bench.py b/benchmarks/swe_bench/swe_bench.py new file mode 100644 index 000000000..51fe37793 --- /dev/null +++ b/benchmarks/swe_bench/swe_bench.py @@ -0,0 +1,415 @@ +"""SWE-bench: Can Language Models Resolve Real-World GitHub Issues? + +Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik Narasimhan +https://arxiv.org/abs/2310.06770 +""" + +import json +import logging +import os +import re +import shlex +from logging import getLogger +from pathlib import Path +from textwrap import dedent +from typing import Callable + +from docker import DockerClient +from swebench.harness.constants import ( + APPLY_PATCH_FAIL, + MAP_REPO_TO_INSTALL, + MAP_REPO_VERSION_TO_SPECS, + RESET_FAILED, + TESTS_ERROR, + TESTS_TIMEOUT, +) +from swebench.harness.log_parsers import MAP_REPO_TO_PARSER +from swebench.harness.utils import get_test_directives + +from inspect_ai import Task, task # noqa: E402 +from inspect_ai.dataset import FieldSpec, Sample, hf_dataset +from inspect_ai.scorer import Score, Scorer, Target, mean, scorer, std +from inspect_ai.solver import ( + Plan, + TaskState, + basic_agent, + system_message, +) +from inspect_ai.tool import bash +from inspect_ai.util import sandbox + +getLogger().handlers = [] # Swe-bench adds a global logger, which we disable. + +INPUT_PROMPT = "Please solve the following issue:\n\n{issue_text}" +COMPOSE_FILE_DIR = Path(__file__).parent / "resources/compose_files/" +os.makedirs(COMPOSE_FILE_DIR, exist_ok=True) + +SAMPLE_TO_IMAGE_PATH = COMPOSE_FILE_DIR / "sample_to_image.json" + +DEFAULT_PLAN = basic_agent( + init=system_message( + "Please solve the coding task below. Once you are done, use your submit tool." + ), + tools=[bash(timeout=180)], +) + +DEFAULT_MAX_MESSAGES = 30 + +# get python logger +logger = logging.getLogger(__name__) + + +@task +def swe_bench( + dataset: str = "princeton-nlp/SWE-bench_Verified", + split: str = "test", + plan: Plan = DEFAULT_PLAN, + max_messages: int = DEFAULT_MAX_MESSAGES, + filter: Callable[[Sample], bool] | None = None, +) -> Task: + """Returns a Task, representing an evaluation on SWE-bench. + + Args. 
+    dataset : str
+        The dataset to use. This should either be the name of a dataset in the HF hub, or a path to a dataset on disk.
+    split : str
+        The split of the dataset to load.
+    plan : Plan
+        The plan to use when creating the task. If None, uses the default plan.
+    max_messages : int
+        The maximum number of messages to generate for each sample.
+    filter : Callable[[Sample],bool]
+        A function to filter whether specific SWE-bench samples are included.
+    """
+    samples = hf_dataset(
+        dataset,
+        split=split,
+        sample_fields=FieldSpec(
+            input="problem_statement",
+            id="instance_id",
+            metadata=[
+                "base_commit",
+                "patch",
+                "PASS_TO_PASS",
+                "FAIL_TO_PASS",
+                "test_patch",
+                "version",
+                "repo",
+                "environment_setup_commit",
+            ],
+        ),
+    )
+
+    for sample in samples:
+        # Turn the saved strings into dictionary objects
+        sample.metadata["PASS_TO_PASS"] = json.loads(sample.metadata["PASS_TO_PASS"])
+        sample.metadata["FAIL_TO_PASS"] = json.loads(sample.metadata["FAIL_TO_PASS"])
+
+    if filter:
+        samples = samples.filter(filter)
+
+    for sample in samples:
+        sample.input = INPUT_PROMPT.format(issue_text=sample.input)
+        sample.sandbox = (
+            "docker",
+            get_compose_file(sample.metadata["environment_setup_commit"], sample.id),
+        )
+        sample.setup = get_setup_script(
+            sample.metadata["repo"],
+            sample.metadata["version"],
+            sample.metadata["base_commit"],
+        )
+
+    return Task(
+        name=f"{dataset}_{split}_{plan.name}",
+        dataset=samples,
+        plan=plan,
+        scorer=swebench_scorer(),
+        max_messages=max_messages,
+    )
+
+
+def get_setup_script(repo: str, version: str, base_commit: str) -> str:
+    """Create a list of bash commands to set up the repository for testing. These are run at the start of the sample; they clone the repository and do some extra repository-specific installation steps over and above what is in the environment images."""
+    setup_script = dedent(
+        f"""#!/bin/bash
+        set -euo pipefail -x
+
+        # We clone the repository and set the permissions so the non-root user can run tests
+        rm -rf /testbed/*
+        git clone -o origin https://github.com/{repo} /testbed/
+        chmod -R 777 /testbed/
+        cd /testbed/
+        git reset --hard {base_commit}
+        git remote remove origin
+        source /opt/miniconda3/bin/activate
+        conda activate testbed
+        echo "Current environment: $CONDA_DEFAULT_ENV"
+
+        # We then do any repo-specific install scripts
+        {MAP_REPO_TO_INSTALL.get(repo,"")}
+        {'\n'.join(MAP_REPO_VERSION_TO_SPECS[repo][version].get('pre_install',[]))}
+        {MAP_REPO_VERSION_TO_SPECS[repo][version].get('install','')}
+    """
+    )
+
+    return setup_script
+
+
+def get_eval_script(test_patch: str, repo: str, version: str, base_commit: str) -> str:
+    """Creates a script which runs the tests of all the files in the test_patch."""
+    # First we fetch the repository-specific 'specification' which SWE-bench provides
+    conda_env = "testbed"
+    repo_directory = "/testbed"
+
+    # Fetch the command which runs the test. Often simply the string 'pytest'
+    test_command = MAP_REPO_VERSION_TO_SPECS[repo][version]["test_cmd"]
+
+    # Fetch any repo-specific setup commands, e.g. any environment variables
+    repo_specific_setup_command = MAP_REPO_VERSION_TO_SPECS[repo][version].get(
+        "eval_commands", []
+    )
+
+    repo_specific_install_command = MAP_REPO_VERSION_TO_SPECS[repo][version].get(
+        "install", ""
+    )
+
+    if repo == "scikit-learn/scikit-learn":  # Scikit-learn gets upset with the install
+        repo_specific_install_command = ""
+
+    # Find all the files which have been modified by the test patch
+    test_patch_files = re.findall(r"--- a/(.*)", test_patch)
+
+    # Find all the files which contain tests. Ugly interface is due to swebench
+    test_files = get_test_directives({"repo": repo, "test_patch": test_patch})  # type: ignore
+
+    # Reset test files to the state they should be in before the patch.
+    eval_script = dedent(
+        f"""#!/bin/bash
+        set -uo pipefail -x
+
+        #We switch to the repository directory and activate the environment needed to run the tests
+        cd {repo_directory}
+        set +x
+        source /opt/miniconda3/bin/activate
+        conda activate {conda_env}
+        set -x
+
+        #We run all of the repo-specific setup commands (if any exist)
+        {"\n".join(repo_specific_setup_command)}
+
+        #We make sure we're back in the correct cwd and environment, in case repo setup caused issues.
+        cd {repo_directory}
+        set +x
+        source /opt/miniconda3/bin/activate
+        conda activate {conda_env}
+        set -x
+
+        #We then re-run any repo-specific install commands (these should have happened in environment setup, but we do it again to be sure.)
+        {repo_specific_install_command}
+
+        #First we reset all of the files which our test patch touches
+        git checkout {base_commit} {' '.join(test_patch_files)}
+
+        #Then we apply the test patch given to us by SWE-bench, setting up the test we need to run
+        echo {shlex.quote(test_patch)} > /tmp/test_patch.diff
+        git apply --check /tmp/test_patch.diff
+        git apply /tmp/test_patch.diff
+
+        #Then we run all the tests in the repository.
+        set +x
+        {test_command} {" ".join(test_files)} || true
+
+        #and we reset the tests back to the base commit
+        git checkout {base_commit} {' '.join(test_patch_files)}
+    """
+    )
+
+    return eval_script
+
+
+CREATE_MODEL_PATCH = """cd /testbed
+git add -A
+git diff --cached {base_commit} > model.patch"""
+
+GET_AGENT_PATCH = """cd /testbed/
+cat model.patch"""
+
+
+@scorer(metrics=[mean(), std()])
+def swebench_scorer() -> Scorer:
+    """Scores the changes made by a solver when solving a swe-bench instance, by running the tests which check whether that instance is correct."""
+
+    async def scorer(state: TaskState, target: Target) -> Score:
+        # Get the changes the model made, for logging purposes
+        await sandbox().exec(
+            [
+                "bash",
+                "-c",
+                CREATE_MODEL_PATCH.format(base_commit=state.metadata["base_commit"]),
+            ]
+        )
+        agent_patch = await sandbox().exec(["bash", "-c", GET_AGENT_PATCH])
+
+        # Run the evaluation script
+        eval_script = get_eval_script(
+            test_patch=state.metadata["test_patch"],
+            repo=state.metadata["repo"],
+            version=state.metadata["version"],
+            base_commit=state.metadata["base_commit"],
+        )
+        eval_output = await sandbox().exec(["bash", "-c", eval_script])
+        if not eval_output.success:
+            raise RuntimeError(
+                f"Test run failed. 
\n\nStderr: \n\n{eval_output.stderr}\n\nStdout: \n\n{eval_output.stdout}" + ) + + # Search for the error strings defined by the swe-bench authors + error_string_search = { + x: x in eval_output.stdout + for x in [ + APPLY_PATCH_FAIL, + RESET_FAILED, + TESTS_ERROR, + TESTS_TIMEOUT, + "Failed to reset task environment", + ] + } + + if any(error_string_search.values()): + explanation = f"The tests did not run correctly. Output from searching for error strings:\n\n{error_string_search}\n\nOutput from tests:\n\n{eval_output.stdout}" + value = 0.0 + else: + test_output_parser = MAP_REPO_TO_PARSER[state.metadata["repo"]] + test_outputs = test_output_parser(eval_output.stdout + eval_output.stderr) + + pass_to_pass_results = {k: "FAILED" for k in state.metadata["PASS_TO_PASS"]} + fail_to_pass_results = {v: "PASSED" for v in state.metadata["FAIL_TO_PASS"]} + for k, v in test_outputs.items(): + if k in state.metadata["PASS_TO_PASS"]: + pass_to_pass_results[k] = v + elif k in state.metadata["FAIL_TO_PASS"]: + fail_to_pass_results[k] = v + else: + logger.warning( + f"Test {k} not found in PASS_TO_PASS or FAIL_TO_PASS" + ) + continue + + passed_all_tests = all( + ["PASSED" == v for v in pass_to_pass_results.values()] + ) and all(["PASSED" == v for v in fail_to_pass_results.values()]) + value = 1.0 if passed_all_tests else 0.0 + + # Sort both so the the false values are at the top + pass_to_pass_results, fail_to_pass_results = ( + dict( + sorted(pass_to_pass_results.items(), key=lambda x: x[1] == "PASSED") + ), + dict( + sorted(fail_to_pass_results.items(), key=lambda x: x[1] == "PASSED") + ), + ) + + # Create an explanation of the results + explanation = f"PASS_TO_PASS:\n\n{json.dumps(pass_to_pass_results,indent=2)}\n\nFAIL_TO_PASS:\n\n{json.dumps(fail_to_pass_results,indent=2)}\n\n" + + return Score( + value=value, + explanation=explanation, + metadata={"model_patch": agent_patch.stdout}, + ) + + return scorer + + +def swebench_baseline_scorer(path_to_baseline: str, name: str | None = None) -> Scorer: + """Given a path to a set of SWE-bench trajectories in the official format (see https://github.com/swe-bench/experiments), returns the performance of those trajectories on the subset of SWE-bench you are evaluating on. 
This lets you compare to baselines on arbitrary subsets of SWE-bench.""" + baseline_name = name if name else Path(path_to_baseline).name + + results_per_instance_id = get_baseline_results(path_to_baseline) + + @scorer(metrics=[mean(), std()], name=baseline_name) + def _swebench_baseline_scorer() -> Scorer: + async def scorer(state: TaskState, target: Target) -> Score: + if state.sample_id in results_per_instance_id: + results = results_per_instance_id[state.sample_id] + return Score( + value=results["resolved"], + explanation=f"Model Patch:\n\n {results["patch"]}", + ) + else: + return Score( + value="N", explanation="No baseline found for this instance" + ) + + return scorer + + return _swebench_baseline_scorer() + + +def get_baseline_results(path_to_baseline: str) -> dict[str, dict[str, str]]: + path_to_logs = os.path.join(path_to_baseline, "logs") + + results_per_instance_id = {} + for result in os.listdir(path_to_logs): + results_path = os.path.join(path_to_logs, result, "report.json") + patch_path = os.path.join(path_to_logs, result, "patch.diff") + + if os.path.exists(results_path) and os.path.exists(patch_path): + # Sometimes there is no result saved, at which point we ignore that entry + with open(results_path, "r") as f: + result_dict = json.load(f) + instance_id, raw_results = next(iter(result_dict.items())) + + with open(patch_path, "r") as f: + results_per_instance_id[instance_id] = { + "resolved": raw_results["resolved"], + "patch": f.read(), + } + + return results_per_instance_id + + +def get_compose_file(environment_commit_id: Sample, instance_id: str) -> str: + if not os.path.exists(SAMPLE_TO_IMAGE_PATH): + raise ValueError( + "No sample to image mapping found. Please run 'build_images.py --dataset_location DATASET_LOCATION --split SPLIT' to build the images" + ) + + sample_to_image = json.load(open(SAMPLE_TO_IMAGE_PATH)) + if ( + instance_id in sample_to_image + and environment_commit_id in sample_to_image[instance_id] + ): + environment_image_name = sample_to_image[instance_id][environment_commit_id][ + "image_key" + ] + else: + raise ValueError( + f"No image found for instance_id {instance_id}. Please run 'build_images.py --dataset_location DATASET_LOCATION --split SPLIT' to build the images" + ) + + compose_file_path = f"{COMPOSE_FILE_DIR}/{environment_commit_id}.yaml" + if os.path.exists(compose_file_path): + return compose_file_path + + images = DockerClient.from_env().images.list() + if environment_image_name not in [ + image_name for image in images for image_name in image.tags + ]: + raise ValueError( + f"Image {environment_image_name} not found in docker images. Please run 'build_images.py --dataset_location DATASET_LOCATION --split SPLIT' to build the images" + ) + + # If the image is found, we can now create the compose file. 
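+    # The compose file just runs the prebuilt environment image and keeps the container
+    # alive with `sleep infinity` so that inspect can exec commands inside it; the
+    # `x-local: true` field marks the image as locally built, so it is not pulled.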
+ compose_file_path = COMPOSE_FILE_DIR / f"{environment_image_name}.yaml" + with compose_file_path.open(mode="w+") as f: + f.write(f"""services: + default: + image: {environment_image_name} + command: "sleep infinity" + working_dir: /testbed + x-local: true""") + + return str(compose_file_path) diff --git a/benchmarks/swe_bench/test_swe_bench.py b/benchmarks/swe_bench/test_swe_bench.py new file mode 100644 index 000000000..668a185a8 --- /dev/null +++ b/benchmarks/swe_bench/test_swe_bench.py @@ -0,0 +1,177 @@ +import os +from pathlib import Path +from uuid import uuid1 + +from build_images import build_images +from datasets import load_dataset +from swe_bench import swe_bench, swebench_baseline_scorer + +from inspect_ai import Task, eval +from inspect_ai.log import EvalLog +from inspect_ai.solver import Generate, Plan, TaskState, solver +from inspect_ai.util import sandbox + +RESOURCE_DIR = Path(__file__).parent / "resources" +TIMEOUT_SECONDS = 2 +GOLDEN_PATCH_TEST_ID = "scikit-learn__scikit-learn-15100" +SWEBENCH_BASELINE_NAME = "20240620_sweagent_claude3.5sonnet" +SWEAGENT_BASELINE: str = str(RESOURCE_DIR / "baselines" / SWEBENCH_BASELINE_NAME) + +SLOW_TEST_DATASET: str = str( + RESOURCE_DIR / "tests" / "all_repos_swe_agent_50_percent.hf" +) # A dataset with one instance from each of the repositories scraped in SWE-bench-verified, and also a column indicating whether it was resolved by swe-agent. + +MAX_CONCURRENCY = int(os.getenv("INSPECT_MAX_CONCURRENCY", 4)) + +if not Path(SLOW_TEST_DATASET).exists(): + raise FileNotFoundError( + f"Test datasets have not been created. Please run the script at {(RESOURCE_DIR / "tests"/"create_test_repos.py").absolute()} to run the tests." + ) + +if not Path(SWEAGENT_BASELINE).exists(): + raise FileNotFoundError( + f"Test baseline files have not been created. Please run the script at {(RESOURCE_DIR / 'baselines' / 'download_sweagent_baselines.sh').absolute()} to run the tests." + ) + + +def get_dataset_single_instance( + instance_id: str, + dataset_name: str = "princeton-nlp/SWE-bench_Verified", + split: str = "test", +) -> str: + full_benchmark = load_dataset(dataset_name)[split] + instance = full_benchmark.filter(lambda x: x["instance_id"] == instance_id) + dataset_location = f"/tmp/{uuid1()}/" + os.makedirs(dataset_location, exist_ok=True) + instance.to_parquet(dataset_location + "dataset.parquet") + + return dataset_location + + +def build_swebench_images( + dataset_name: str = "princeton-nlp/SWE-bench_Verified", + split: str | None = "test", + max_workers: int = 4, +) -> None: + build_images(dataset_name=dataset_name, split=split, max_workers=max_workers) + + +@solver +def apply_patch_solver(patch: str): + # Solver to apply a specific patch to a git repository. + + async def _apply_patch_solver(state: TaskState, generate: Generate) -> TaskState: + state.metadata["patch"] = patch + patch_location = "/tmp/patch.diff" + await sandbox().write_file(patch_location, patch) + output = await sandbox().exec(["git", "apply", patch_location]) + + assert ( + output.returncode == 0 + ), f"Failed to write patch to {patch_location}. 
Stdout:\n\n {output.stdout}\n\nStderr:\n\n{output.stderr}"
+
+        return state
+
+    return _apply_patch_solver
+
+
+@solver
+def delete_readme_solver():
+    async def _delete_readme_solver(state: TaskState, generate: Generate) -> TaskState:
+        await sandbox().exec(["rm", "/testbed/README.md"])
+        return state
+
+    return _delete_readme_solver
+
+
+def test_correct_patch_succeeds() -> None:
+    dataset = get_dataset_single_instance(GOLDEN_PATCH_TEST_ID)
+    build_swebench_images(dataset, "train")
+
+    test_task = swe_bench(dataset, "train")
+    golden_patch = test_task.dataset[0].metadata["patch"]
+    test_task.plan = Plan([apply_patch_solver(golden_patch)])
+
+    result = eval(test_task, "mockllm/model", max_messages=4, debug_errors=True)[0]
+
+    assert (
+        result.results.scores[0].metrics["mean"].value == 1.0
+    ), "SWE-bench should mark a correct application successfully."
+
+
+def test_incorrect_patch_fails() -> None:
+    dataset = get_dataset_single_instance(GOLDEN_PATCH_TEST_ID)
+    build_swebench_images(dataset, "train")
+
+    test_task = swe_bench(dataset, "train")
+    test_task.plan = Plan([delete_readme_solver()])
+
+    result = eval(test_task, "mockllm/model", max_messages=2, debug_errors=True)[0]
+
+    assert (
+        result.results.scores[0].metrics["mean"].value == 0.0
+    ), "SWE-bench should mark an incorrect application as a failure."
+
+
+def test_same_scores_for_swe_agent() -> None:
+    # Very slow test
+    # This test checks that we agree with the original swe-bench implementation on the patches output by swe-agent, using one instance from each of the repositories scraped in SWE-bench-verified.
+
+    build_swebench_images(dataset_name=SLOW_TEST_DATASET, split="train")
+
+    # We load the whole dataset
+    test_task = swe_bench(dataset=SLOW_TEST_DATASET, split="train")
+    test_dataset = load_dataset(SLOW_TEST_DATASET, split="train")
+
+    # Add a scorer which compares the value of swe-agent's patch with the ground truth
+    scorers = [
+        test_task.scorer[0],
+        swebench_baseline_scorer(SWEAGENT_BASELINE, name="sweagent_baseline"),
+    ]
+
+    # Make plans which apply the swe-agent's patch for each swebench instance
+    test_tasks = []
+    for test_sample, swe_agent_patch in zip(
+        test_task.dataset, test_dataset["swe_agent_patch"]
+    ):
+        test_tasks.append(
+            Task(
+                dataset=[test_sample],
+                plan=apply_patch_solver(swe_agent_patch),
+                scorer=scorers,
+            )
+        )
+
+    results: list[EvalLog] = eval(
+        test_tasks,
+        "mockllm/model",
+        max_tasks=MAX_CONCURRENCY,
+        max_samples=MAX_CONCURRENCY,
+        max_subprocess=MAX_CONCURRENCY,
+        fail_on_error=False,
+    )
+
+    error_str = ""
+    for result in results:
+        if result.status == "error":
+            error_str += (
+                f"Error occurred while evaluating task. Error:\n\n {result.error}"
+            )
+            continue
+
+        if result.samples:
+            for sample in result.samples:
+                if sample.error:
+                    error_str += f"Error occurred while evaluating task. Error:\n\n {sample.error}"
+                    continue
+
+        score = result.samples[0].scores["swebench_scorer"]
+        swe_agent_score = result.samples[0].scores["sweagent_baseline"]
+
+        if score.value != swe_agent_score.value:
+            error_str += f"Result of evaluating {result.samples[0].id} did not agree with the swe_bench ground truth. Our score: '{score.value}'. 
swe-agent score: '{swe_agent_score.value}' Scorer results: {score.explanation}" + + assert error_str == "", error_str + + +test_same_scores_for_swe_agent() diff --git a/docs/interactivity.qmd b/docs/interactivity.qmd index ff06af63b..614891527 100644 --- a/docs/interactivity.qmd +++ b/docs/interactivity.qmd @@ -109,4 +109,4 @@ with input_screen() as console: table.add_row("bash", "ls /usr/bin") table.add_row("python", "print('foo')") console.print(table) -``` \ No newline at end of file +``` diff --git a/src/inspect_ai/solver/_basic_agent.py b/src/inspect_ai/solver/_basic_agent.py index 77a581a9b..4f1547be8 100644 --- a/src/inspect_ai/solver/_basic_agent.py +++ b/src/inspect_ai/solver/_basic_agent.py @@ -38,6 +38,7 @@ def basic_agent( *, init: Solver | list[Solver] | None = None, tools: list[Tool] = [], + agent_name: str = "basic_agent", max_attempts: int = 1, score_value: ValueToFloat | None = None, incorrect_message: str = DEFAULT_INCORRECT_MESSAGE, @@ -67,6 +68,7 @@ def basic_agent( (defaults to system_message with basic ReAct prompt) tools (list[Tool]): Tools available for the agent. max_attempts (int): Maximum number of submissions to accept before terminating. + agent_name (str): Name passed to the Plan constructor (defaults to 'basic_agent') score_value (ValueToFloat): Function used to extract float from scores (defaults to standard value_to_float()) incorrect_message (str): User message reply for an incorrect submission from @@ -171,5 +173,6 @@ async def solve(state: TaskState, generate: Generate) -> TaskState: + [ use_tools(tools + [submit_tool]), basic_agent_loop(), - ] + ], + name=agent_name, ) diff --git a/src/inspect_ai/util/_sandbox/context.py b/src/inspect_ai/util/_sandbox/context.py index cb2ae8f47..0b375277e 100644 --- a/src/inspect_ai/util/_sandbox/context.py +++ b/src/inspect_ai/util/_sandbox/context.py @@ -123,7 +123,7 @@ async def setup_sandbox_environment( env = default_sandbox_environment(environments) # copy to container - setup_file = uuid() + setup_file = f"/tmp/{uuid()}" await env.write_file(setup_file, setup) # chmod, execute, and remove @@ -135,7 +135,6 @@ async def exec(cmd: list[str]) -> None: f"Failed to execute setup script for sample: {result.stderr}" ) - setup_file = f"./{setup_file}" await exec(["chmod", "+x", setup_file]) await exec([setup_file]) await exec(["rm", setup_file])
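As a usage sketch (assuming the APIs shown above; the agent names `bash_only` and `python_only` are illustrative), the new `agent_name` parameter on `basic_agent` can give each plan a distinguishable name, which `swe_bench` folds into the task name via `plan.name`:

```python
from inspect_ai import eval
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash, python
from swe_bench import swe_bench

# Name each agent explicitly so the resulting SWE-bench tasks are easy to tell apart.
agents = [
    basic_agent(
        init=system_message("Please solve the coding task below. Once you are done, use your submit tool."),
        tools=[tool],
        agent_name=name,
    )
    for name, tool in [("bash_only", bash()), ("python_only", python())]
]

tasks = [swe_bench(dataset="princeton-nlp/SWE-bench_Verified", split="test", plan=agent) for agent in agents]
eval(tasks, model="openai/gpt-4o")
```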