Merge pull request #268 from UKGovernmentBEIS/max/swe_bench
Adding SWE-bench
max-kaufmann authored Sep 12, 2024
2 parents 66f19a6 + d771f45 commit 50b2f36
Showing 12 changed files with 884 additions and 4 deletions.
2 changes: 2 additions & 0 deletions benchmarks/swe_bench/.gitignore
@@ -0,0 +1,2 @@
resources/compose_files/*
resources/baselines/**/*
68 changes: 68 additions & 0 deletions benchmarks/swe_bench/README.md
@@ -0,0 +1,68 @@
# SWE-bench
This is an Inspect-native implementation of [the SWE-bench dataset](https://www.swebench.com/).

## Installation

- **Install requirements.** In addition to the requirements provided by Inspect, this benchmark has its own dependencies. You can install them in your virtual environment by running ```pip install -r requirements.txt```.
- **Build environment images.** SWE-bench requires a set of docker images containing the dependencies of each repository. To build them for the SWE-bench Verified split, run ```./build_images.py --dataset_path princeton-nlp/SWE-bench_Verified --split test```. See ```./build_images.py --help``` for all arguments. NOTE: Depending on the repositories contained in your subset of SWE-bench, this can take a while to run (several hours). A programmatic alternative is sketched below.
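
If you prefer to drive this from Python rather than the command line, the same `build_images` function used by the script can be called directly. This is a minimal sketch, assuming `build_images.py` (from this benchmark directory) is importable from your working directory:

```python
# Build the environment images for SWE-bench Verified programmatically.
# A sketch only: assumes build_images.py from this benchmark is on the import path.
from build_images import build_images

build_images(
    dataset_name="princeton-nlp/SWE-bench_Verified",
    split="test",
    max_workers=4,        # parallel image builds
    force_rebuild=False,  # reuse any environment images that already exist
)
```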

## Usage


### Running the benchmark
The ```swe_bench.py``` file contains the ```swe_bench``` function, which creates an instance of a SWE-bench Task:

```python
from inspect_ai import eval
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash, python

from swe_bench import swe_bench

# Create an agent that only uses bash, and an agent that only uses python.
agents = [
    basic_agent(
        init=system_message("Please solve the coding task below. Once you are done, use your submit tool."),
        tools=[tool],
    )
    for tool in [bash(), python()]
]

# Create a SWE-bench task for each of those agents.
swebench_tasks = [swe_bench(dataset="princeton-nlp/SWE-bench_Verified", split="test", plan=agent) for agent in agents]

# For the demonstration, we will select only the first 5 samples of each task.
for task in swebench_tasks:
    task.dataset = task.dataset[:5]

# Compare how these two agents perform.
eval(swebench_tasks, model="openai/gpt-4o")
```

NOTE: SWE-bench will take a while to run and uses a lot of tokens. If things are too slow, increase the level of parallelism - see https://inspect.ai-safety-institute.org.uk/parallelism.html. Note that running too many docker containers on your machine can also cause issues, most notably an 'ALL PREDEFINED ADDRESS POOLS HAVE BEEN FULLY SUBNETTED' error - we don't recommend running more than 32 containers at any one time.
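
As a rough illustration of bounding concurrency (a sketch only - `max_connections` is a standard Inspect `eval` option, but check the parallelism documentation linked above for the exact options available in your version of Inspect):

```python
from inspect_ai import eval

from swe_bench import swe_bench

# Limit the number of concurrent model API connections; in Inspect this also
# (by default) bounds how many samples - and hence docker sandboxes - run at once.
eval(
    swe_bench(dataset="princeton-nlp/SWE-bench_Verified", split="test"),
    model="openai/gpt-4o",
    max_connections=16,
)
```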

### Comparing to baselines
Submissions to [the official SWE-bench leaderboard](https://www.swebench.com/) often come with a set of trajectories and per-instance results. This means that you can compare the performance of your agent not just on the full dataset, but also on subsets of that dataset, which is often useful when running smaller, cheaper experiments. To make this comparison easy, we provide a scorer that will calculate the average performance of an existing baseline on the instances you have selected:

```python
from inspect_ai import eval

from swe_bench import swe_bench, swebench_baseline_scorer, swebench_scorer

# Select only the subset where the patch is more than 20 lines long. Use the default plan.
swebench_verified_task_subset = swe_bench(
    dataset="princeton-nlp/SWE-bench_Verified",
    split="test",
    filter=lambda x: len(x["patch"].split("\n")) > 20,
)

# Report the Claude 3.5 Sonnet + SWE-agent baseline, as well as the performance of the agent.
# To download the baseline, run ./resources/baselines/download_swebench_baseline.sh
swebench_verified_task_subset.scorer = [
    swebench_baseline_scorer(
        "./resources/baselines/swebench_verified_20240620_sweagent_claude3.5sonnet",
        name="sweagent_baseline",
    ),
    swebench_scorer(),
]

eval(swebench_verified_task_subset)
```

This will lead to both scores being reported in the final output, allowing you to compare your agent against the baseline:

![SWE-bench baseline comparison](./resources/docs/swebench_comparison.jpeg)
111 changes: 111 additions & 0 deletions benchmarks/swe_bench/build_images.py
@@ -0,0 +1,111 @@
"""This is a utility script which installs all of the environment images for the SWE-bench dataset. These images contain all of the dependencies required to run the tests in the dataset."""

import argparse
import json
import os

from datasets import load_dataset
from docker.client import DockerClient
from swe_bench import SAMPLE_TO_IMAGE_PATH
from swebench.harness.docker_build import build_env_images
from swebench.harness.test_spec import get_test_specs_from_dataset
from swebench.harness.utils import load_swebench_dataset


def build_images(
    dataset_name: str,
    split: str | None = None,
    max_workers: int = 4,
    force_rebuild: bool = False,
) -> None:
"""This function uses the swe_bench library to build docker images for the environment of the SWE-bench dataset. It also creates a mapping from the information contained in the dataset itself ( in particular, the "instance_id" and env_image_key) to the name of the docker images. This mapping lets us find the images directly from the dataset entries, rather than relying on objects created in the swe_bench code."""
# Code copied from the swe_bench repository
docker_client = DockerClient.from_env()

swebench_dataset = load_swebench_dataset(dataset_name, split)
if not isinstance(swebench_dataset[0], dict):
raise ValueError(
f"After loading the dataset, the elements should be a dictonary. Got {type(swebench_dataset[0])} instead. Did you pass the correct dataset name and split? \n\nOutput of huggingface load_dataset: {load_dataset(dataset_name, split)}"
)

# We then use a couple of internal functions from swebench to build the environment images
test_specs = get_test_specs_from_dataset(swebench_dataset)
build_env_images(docker_client, swebench_dataset, force_rebuild, max_workers)

# We build a mapping of insance_ids and envirnoment_commits to the name of the docker images. This is used to find the images in our main swe_bench code.
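    # Illustrative shape of the mapping written to SAMPLE_TO_IMAGE_PATH (values
    # below are made up for documentation purposes, not taken from a real build):
    # {
    #     "<instance_id>": {
    #         "<environment_setup_commit>": {
    #             "image_key": "<tag of the environment image built by swebench>",
    #             "env_setup_script": "<concatenated environment setup script>"
    #         }
    #     }
    # }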
    environment_name_mapping = (
        {}
        if not os.path.exists(SAMPLE_TO_IMAGE_PATH)
        else json.load(open(SAMPLE_TO_IMAGE_PATH))
    )

    for spec, swebench_instance in zip(test_specs, swebench_dataset):
        assert spec.env_image_key in [
            image_tag
            for image in docker_client.images.list()
            for image_tag in image.tags
        ], f"Image {spec.env_image_key} not found in docker images"
        # Check if entry already exists
        if swebench_instance["instance_id"] not in environment_name_mapping:
            environment_name_mapping[swebench_instance["instance_id"]] = {}
        if (
            swebench_instance["environment_setup_commit"]
            in environment_name_mapping[swebench_instance["instance_id"]]
        ):
            assert (
                environment_name_mapping[swebench_instance["instance_id"]][
                    swebench_instance["environment_setup_commit"]
                ]["image_key"]
                == spec.env_image_key
            ), f"Image {spec.env_image_key} already mapped to a different image"
        else:
            environment_name_mapping[swebench_instance["instance_id"]][
                swebench_instance["environment_setup_commit"]
            ] = {
                "image_key": spec.env_image_key,
                "env_setup_script": "\n".join(spec.env_script_list),
            }

    # Add the mappings to a file
    json.dump(environment_name_mapping, open(SAMPLE_TO_IMAGE_PATH, "w+"), indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Build environment images for SWE-bench"
    )

    parser.add_argument(
        "--dataset_path",
        type=str,
        help="Name of the dataset to build images for",
        required=False,
        default="princeton-nlp/SWE-bench_Verified",
    )

    parser.add_argument(
        "--split",
        type=str,
        help="Split of the dataset to build images for",
        required=False,
        default="test",
    )

    parser.add_argument(
        "--max_workers",
        type=int,
        help="Maximum number of workers to use for building images",
        required=False,
        default=4,
    )

    parser.add_argument(
        "--force_rebuild",
        action="store_true",
        help="Force rebuild of images",
        required=False,
        default=False,
    )

    args = parser.parse_args()
    build_images(args.dataset_path, args.split, args.max_workers, args.force_rebuild)
2 changes: 2 additions & 0 deletions benchmarks/swe_bench/requirements.txt
@@ -0,0 +1,2 @@
swebench
docker
@@ -0,0 +1,5 @@
# Download the swe-bench baselines. NOTE: THEY ARE QUITE LARGE.
git clone https://github.com/swe-bench/experiments /tmp/swebench_baselines
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Copy to current directory
cp -r /tmp/swebench_baselines/evaluation/verified/20240620_sweagent_claude3.5sonnet "$SCRIPT_DIR"
98 changes: 98 additions & 0 deletions benchmarks/swe_bench/resources/tests/create_test_repos.py
@@ -0,0 +1,98 @@
import json
import os
from pathlib import Path

import pandas as pd
from datasets import load_dataset

dataset = load_dataset("princeton-nlp/SWE-bench_Verified")["test"]

# We create a subset of SWE-bench Verified which:
# 1) Contains all of the repositories in SWE-bench Verified
# 2) For each repository, contains an example where SWE-agent + Claude 3.5 Sonnet resolved the issue, and an example where it did not

results_per_repo = {}
baseline_dir = Path(__file__).parent.parent / "baselines"
logs_dir = baseline_dir / "20240620_sweagent_claude3.5sonnet" / "logs"

if not logs_dir.exists():
    print(
        f"Please run the baseline creation script at {baseline_dir / 'download_sweagent_baselines.sh'} to download the baselines to compare your agents against."
    )
    exit()


results = []
missing_results = []

# Load results from the log directory
for result in os.listdir(logs_dir):
    results_path = os.path.join(logs_dir, result, "report.json")
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            result_dict = json.load(f)
            result_name, results_value = next(iter(result_dict.items()))
            output_dict = dict(instance_id=result_name, **results_value)
            patch_path = os.path.join(logs_dir, result, "patch.diff")
            with open(patch_path, "r") as f:
                output_dict["swe_agent_patch"] = f.read()
            results.append(output_dict)

    else:
        missing_results.append(result)

# Get repository name from the results
results = pd.DataFrame.from_records(results)
results["repo"] = results["instance_id"].apply(lambda x: x.split("__")[0])

# GitHub patches which change binary files cannot actually be applied. We will remove these entries from the dataset
results = results[~results["swe_agent_patch"].str.contains("Binary files")]

# Group by repository, and success. Then pick one from each group.
results_per_repo = results.groupby(["repo", "resolved"])
results_per_repo = results_per_repo.apply(lambda x: x.sample(1)).reset_index(drop=True)


# Filter dataset by those instance ids, and add a "resolved_by_swe_agent" column.
instance_ids = results_per_repo["instance_id"].values
resolved = results_per_repo["resolved"].values

dataset = dataset.filter(lambda x: x["instance_id"] in instance_ids)
dataset = dataset.map(
    lambda x: dict(
        x, resolved_by_swe_agent=resolved[instance_ids == x["instance_id"]][0]
    ),
    num_proc=4,
)
# Add swe-agent-patch
dataset = dataset.map(
    lambda x: dict(
        x,
        swe_agent_patch=results_per_repo[
            results_per_repo["instance_id"] == x["instance_id"]
        ]["swe_agent_patch"].values[0],
    ),
    num_proc=4,
)
# Add resolved column
dataset = dataset.map(
    lambda x: dict(x, resolved=resolved[instance_ids == x["instance_id"]][0]),
    num_proc=4,
)

# The sphinx repos are excluded from testing, as their setup scripts edit files, breaking the patch from SWE-agent.
dataset = dataset.filter(lambda x: "sphinx" not in x["instance_id"])

# psf__requests-1921 is flaky at high levels of concurrency, so we remove it as well
dataset = dataset.filter(lambda x: "psf__requests-1921" not in x["instance_id"])


# Calculate the accuracy. Should be 0.42857142857142855
accuracy = sum(resolved) / len(resolved)

# Save the dataset
dataset_dir = Path(__file__).parent / "all_repos_swe_agent_50_percent.hf"
os.makedirs(str(dataset_dir), exist_ok=True)
dataset.to_parquet(dataset_dir / "dataset.parquet")

print(f"Saved dataset to {dataset_dir}, accuracy {accuracy}")