HITL agent #107

Status: Open. Wants to merge 38 commits into base `main` from `hitl_agent`.

Commits (38)
286453b  human in the loop (chisingh, Feb 28, 2025)
8abe599  test agent (chisingh, Mar 2, 2025)
3c689d2  replay history (chisingh, Mar 2, 2025)
1235dd5  mock human input (chisingh, Mar 2, 2025)
5588b95  unit test (chisingh, Mar 3, 2025)
0bf872a  test clone (chisingh, Mar 3, 2025)
31492e0  CodeQL fix (chisingh, Mar 17, 2025)
155daad  init terminal (chisingh, Mar 18, 2025)
3227253  history steps (chisingh, Mar 20, 2025)
c3c8141  Merge remote-tracking branch 'origin' into hitl_agent (chisingh, Mar 28, 2025)
2eab98e  unit tests (chisingh, Mar 28, 2025)
f00000c  Merge remote-tracking branch 'origin' into hitl_agent (chisingh, May 21, 2025)
f955752  Merge remote-tracking branch 'origin' into hitl_agent (chisingh, May 21, 2025)
0e4b25e  updated tests (chisingh, May 21, 2025)
34f06c1  Merge branch 'main' into hitl_agent (chisingh, May 21, 2025)
640ea36  Merge branch 'main' into hitl_agent (chisingh, May 22, 2025)
eb8241b  isort, black (chisingh, May 22, 2025)
af0e4c7  fix test (chisingh, May 22, 2025)
a894c33  format fix (chisingh, May 22, 2025)
1d3763d  Merge remote-tracking branch 'origin' into hitl_agent (chisingh, May 22, 2025)
30361cb  lint fix (chisingh, May 22, 2025)
581904f  Merge branch 'main' into hitl_agent (MarcCote, May 23, 2025)
114634e  clone tools (chisingh, May 28, 2025)
cd951bd  improve error message when llm config doesn't exist (MarcCote, May 28, 2025)
0909ce7  Merge branch 'main' into hitl_agent (MarcCote, May 28, 2025)
97b233c  Fix README and improve output of init_llm_config (MarcCote, May 28, 2025)
e6ce726  install pandas (chisingh, May 28, 2025)
86daf77  black, isort (chisingh, May 28, 2025)
49d69f2  fix unit test (chisingh, May 28, 2025)
2f9059b  human first (chisingh, May 29, 2025)
e933a51  Merge branch 'main' of https://github.com/microsoft/debug-gym into hi… (chisingh, May 29, 2025)
3fb8e78  fix tests (chisingh, May 29, 2025)
5d573d3  Merge branch 'main' into hitl_agent (chisingh, May 30, 2025)
e10c31b  Mention to install mini_nightmare deps (MarcCote, May 28, 2025)
0ed066f  Support multiple -p args with run.py (MarcCote, May 28, 2025)
5981150  Revert mini_nightmare deps, should be installed in the docker container. (MarcCote, May 28, 2025)
f27caf1  Add option to list available agents and problems (MarcCote, May 28, 2025)
b59b20d  Typo in config file (MarcCote, May 30, 2025)
Files changed
2 changes: 2 additions & 0 deletions .gitignore

@@ -180,6 +180,8 @@ logs/

/data

+.vscode/
+
vscode/out
vscode/node_modules
vscode/package-lock.json
12 changes: 6 additions & 6 deletions README.md

@@ -30,12 +30,12 @@ To install development dependencies, run:

**Set your API information in llm.yaml**

-First, create an LLM config template by running the `debug-gym-init-llm-config` entrypoint:
+First, create an LLM config template by running `python -m debug_gym.init_llm_config`:

-    python -m debug_gym.init_llm_config ~/.config/debug_gym
+    python -m debug_gym.init_llm_config $HOME/.config/debug_gym

> [!TIP]
-> Run `debug-gym-init-llm-config --help` for more options. By default, the template is created at `~/.config/debug_gym/llm.yaml`, but you can specify any directory.
+> Run `python -m debug_gym.init_llm_config --help` for more options. By default, the template is created at `$HOME/.config/debug_gym/llm.yaml`, but you can specify any directory.

Then, edit this file with your endpoint and credentials. You can choose one of these authentication methods:
- For authenticating with an API key, provide `api_key`.
@@ -44,7 +44,7 @@
> [!WARNING]
> When using open-sourced LLMs, e.g., via vLLM, you need to correctly setup `HF_TOKEN` required by the tokenizer.

-By default, `debug-gym` looks for the LLM config file at `~/.config/debug_gym/llm.yaml`. You can change this behavior by exporting the environment variable `LLM_CONFIG_FILE_PATH` or by setting `llm_config_file_path` in your script config file (see [Running Baselines](#3-running-baselines)).
+By default, `debug-gym` looks for the LLM config file at `$HOME/.config/debug_gym/llm.yaml`. You can change this behavior by exporting the environment variable `LLM_CONFIG_FILE_PATH` or by setting `llm_config_file_path` in your script config file (see [Running Baselines](#3-running-baselines)).

---

@@ -143,8 +143,8 @@ As an example, we provide a buggy pytorch code repository in `data/pytorch`.
#### 3.5. Analysis and Visualization

We provide a set of scripts to help analyze the log files (e.g., the `.jsonl` files) generated by the agent.
-- In the `analysis` folder, we provide scripts that used to generate the corresponding figures in our technical report.
-- In the `analysis/json_log_viewer` folder, we provide a Flask app to view a `.jsonl` log file in the browser.
+- In the `analysis` folder, we provide scripts that used to generate the corresponding figures in our technical report.
+- In the `analysis/json_log_viewer` folder, we provide a Flask app to view a `.jsonl` log file in the browser.

## Citation
```
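Aside, as a reading aid rather than part of the diff: the README paragraph above describes a three-way lookup for `llm.yaml`. Below is a minimal sketch of that lookup; the precedence between the script-config setting and the environment variable is an assumption here, and the authoritative logic lives in debug-gym itself.

```python
import os
from pathlib import Path


def resolve_llm_config_path(script_config_path: str | None = None) -> Path:
    """Hedged sketch of the llm.yaml lookup described in the README."""
    if script_config_path:  # `llm_config_file_path` from the script config file
        return Path(script_config_path)
    env_path = os.environ.get("LLM_CONFIG_FILE_PATH")  # exported env variable
    if env_path:
        return Path(env_path)
    # default location, i.e. $HOME/.config/debug_gym/llm.yaml
    return Path.home() / ".config" / "debug_gym" / "llm.yaml"
```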
77 changes: 77 additions & 0 deletions debug_gym/agents/debug_agent.py

@@ -1,4 +1,5 @@
from debug_gym.agents.base_agent import BaseAgent, register_agent
+from debug_gym.agents.llm_api import LLM


@register_agent
@@ -62,3 +63,79 @@ def run(self, task_name=None, debug=False):
                break

        return info.done
+
+
+@register_agent
+class DebugHumanInTheLoop(DebugAgent):
+    name: str = "debug_human"
+
+    def run(self, task_name=None, debug=False):
+        # instantiate the human in the loop
+        self.human = LLM.instantiate(
+            llm_name="human",
+            llm_config_file_path=self.config.get("llm_config_file_path"),
+            logger=self.logger,
+        )
+
+        self.history.reset()
+        info = self.env.reset(options={"task_name": task_name})
+        # initial state does not have prompt and response
+        self.history.step(info, None)
+
+        if info.done is True:
+            # msg = "Environment started with entrypoint passing without errors."
+            return True
+
+        highscore = info.score
+
+        for step in self.logger.tqdm(range(self.config["max_steps"])):
+            highscore = max(highscore, info.score)
+            self.logger.info(
+                f"Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
+            )
+
+            prompt = self.build_prompt(info)
+
+            human_response = self.human(prompt, info.tools)
+
+            if debug:
+                breakpoint()
+
+            # make a copy of the env for the llm
+            self.cloned_env = self.env.clone()
+            # remove the pdb tool from the cloned env
+            if self.cloned_env.has_tool("pdb"):
+                self.cloned_env.remove_tool("pdb")
+            llm_info = self.cloned_env.reset(options={"task_name": task_name})
+            # replay the history up to the current step
+            for step in self.history.get_all():
+                if step.done:
+                    break
+                llm_info = self.cloned_env.step(step.action)
+
+            # step the environment with the human response
+            info = self.env.step(human_response.response)
+            # log the human response
+            self.history.step(info, human_response)
+
+            if info.done or info.rewrite_counter >= self.config["max_rewrite_steps"]:
+                self.logger.info(
+                    f"Score (human): {info.score}/{info.max_score} ({info.score/info.max_score:.1%})"
+                )
+                break
+
+            # call the llm with the cloned environment
+            prompt = self.build_prompt(llm_info)
+            llm_response = self.llm(prompt, llm_info.tools)
+            llm_info = self.cloned_env.step(llm_response.response)
+
+            if (
+                llm_info.done
+                or llm_info.rewrite_counter >= self.config["max_rewrite_steps"]
+            ):
+                self.logger.info(
+                    f"Score (llm): {llm_info.score}/{llm_info.max_score} ({llm_info.score/llm_info.max_score:.1%})"
+                )
+                break
+
+        return info.done
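The core of `DebugHumanInTheLoop.run` is a clone-and-replay pattern: the human acts on the real environment, while the LLM works against a clone that is reset and fast-forwarded through the recorded history, so both observe the same state without sharing mutations. A self-contained toy sketch of that pattern (`ToyEnv` is hypothetical, for illustration only):

```python
class ToyEnv:
    """Stand-in for RepoEnv: just enough state to show clone-and-replay."""

    def __init__(self):
        self.state = []

    def reset(self):
        self.state = []

    def step(self, action):
        self.state.append(action)

    def clone(self):
        # Like RepoEnv.clone(): same configuration, fresh runtime state.
        return ToyEnv()


human_env = ToyEnv()
human_env.reset()
history = ["view main.py", "pdb b 12", "rewrite"]
for action in history:
    human_env.step(action)

# Clone, reset, then replay the recorded actions so the second policy
# observes the same state as the first without mutating its environment.
llm_env = human_env.clone()
llm_env.reset()
for action in history:
    llm_env.step(action)

assert llm_env.state == human_env.state
llm_env.step("pdb c")  # diverges safely; the human's env is untouched
assert llm_env.state != human_env.state
```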
6 changes: 5 additions & 1 deletion debug_gym/agents/llm_api.py

@@ -225,7 +225,11 @@ def from_file(cls, config_file_path: str | None = None) -> "LLMConfigRegistry":
            raw_llm_config = yaml.safe_load(f)
            return cls.register_all(raw_llm_config)
        except FileNotFoundError:
-            raise FileNotFoundError(f"Cannot find llm config file: {config_file_path}")
+            msg = (
+                f"Cannot find llm config file: {config_file_path}. "
+                "Use `debug-gym-init-llm-config` to create one and edit it."
+            )
+            raise FileNotFoundError(msg)

    def __getitem__(self, model_name: str) -> LLMConfig:
        """Allow dictionary-like access to configurations"""
6 changes: 6 additions & 0 deletions debug_gym/agents/utils.py

@@ -136,6 +136,11 @@ def load_config():
    parser.add_argument(
        "--agent",
    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List available agents and problems.",
+    )
    parser.add_argument(
        "--debug",
        action="store_true",
@@ -186,6 +191,7 @@ def load_config():
        "--params",
        nargs="+",
        metavar="my.setting=value",
+        action='extend',
        default=[],
        help="override params of the config file,"
        " e.g. -p 'rewrite_only.random_seed=123'",
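`--params` already had `nargs="+"`, so the new `action='extend'` is what makes repeated `-p` flags accumulate instead of the last occurrence overwriting the earlier ones. A minimal, self-contained demonstration of that argparse behavior (the key names are illustrative, borrowed from the help text above):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--params", nargs="+", action="extend", default=[])

# Two separate -p flags now accumulate into one flat list.
args = parser.parse_args(
    ["-p", "rewrite_only.random_seed=123", "-p", "rewrite_only.max_steps=50"]
)
print(args.params)
# ['rewrite_only.random_seed=123', 'rewrite_only.max_steps=50']
```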
21 changes: 21 additions & 0 deletions debug_gym/gym/envs/env.py

@@ -249,6 +249,10 @@ def set_entrypoints(self, entrypoint, debug_entrypoint):
    @staticmethod
    def _prepare_entrypoint(entrypoint):
+        # Remove PYTHONPATH prefix if present
+        if entrypoint.startswith("PYTHONPATH=$PYTHONPATH:$PWD "):
+            entrypoint = entrypoint[len("PYTHONPATH=$PYTHONPATH:$PWD ") :]
+
        entrypoint_list = entrypoint.split()

        if entrypoint_list[0] != "python":
@@ -484,6 +488,23 @@ def step(self, action: ToolCall) -> EnvInfo:

        return self.infos

+    def clone(self):
+        # Create a new instance of RepoEnv
+        new_env = RepoEnv(
+            path=self.path,
+            entrypoint=self.entrypoint,
+            debug_entrypoint=self.debug_entrypoint,
+            max_score=self.max_score,
+            readonly_patterns=None,
+            run_timeout=self.run_timeout,
+            dir_tree_depth=self.dir_tree_depth,
+            terminal=Terminal(),
+            logger=self.logger,
+        )
+        for tool in self.tools:
+            new_env.add_tool(tool)
+        return new_env
+
    def post_process_event(self, event: Event, source, kwargs, observations):
        """Post-process the event after it has been handled by the tools."""
        if event in (Event.REWRITE_SUCCESS, Event.REWRITE_FAIL):
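To make the new `_prepare_entrypoint` guard concrete, here is a standalone restatement of the prefix stripping (same strings as the lines added above):

```python
prefix = "PYTHONPATH=$PYTHONPATH:$PWD "
entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD python -m pytest tests/"

# Remove the PYTHONPATH prefix if present, leaving a plain python command.
if entrypoint.startswith(prefix):
    entrypoint = entrypoint[len(prefix):]

assert entrypoint == "python -m pytest tests/"
```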
10 changes: 9 additions & 1 deletion debug_gym/init_llm_config.py

@@ -2,6 +2,8 @@
import os
from pathlib import Path

+from termcolor import colored
+
from debug_gym.agents.llm_api import LLM_CONFIG_TEMPLATE


@@ -47,7 +49,13 @@ def init_llm_config(dest_dir: str = None):
    else:
        print(f"LLM config template already exists at `{destination}`.")

-    print("Please edit the file to configure your LLM settings.")
+    print(
+        colored(
+            f"Please edit `{destination}` to configure your LLM settings.",
+            "green",
+            attrs=["bold"],
+        )
+    )


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion pyproject.toml

@@ -30,4 +30,4 @@ dev = [
    "pytest-xdist",
    "pytest-timeout",
    "pytest-env",
-]
+]

(The two bracket lines are textually identical; the change is most likely a trailing-newline fix at end of file.)
5 changes: 4 additions & 1 deletion scripts/config_mini_nightmare.yaml

@@ -18,7 +18,7 @@ base:
  # session_commands define commands that are always executed before starting a shell session or running a single command in the terminal.
  # session_commands:["conda activate aider"],
  # setup_commands define commands that are executed only once when the terminal is created. This is only supported for Docker terminal.
-  setup_commands: ["pip install pytest"],
+  setup_commands: ["pip install pytest pandas"],
  }

# LLM configs
@@ -42,3 +42,6 @@ debug_agent:
debug_5_agent:
  n_rewrites_before_pdb: 5
  tools: ["pdb", "view", "rewrite", "eval"]
+
+debug_human:
+  tools: ["pdb", "view", "rewrite", "eval"]
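With the `debug_human` section in place, the human-in-the-loop agent is selectable by name. An illustrative invocation, assuming the config file is passed positionally (exact CLI shape may differ; `--agent` and `-p` are the flags defined in `load_config` above, and the `-p` key is a made-up example):

    python scripts/run.py scripts/config_mini_nightmare.yaml --agent debug_human -p 'debug_human.max_steps=50'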
13 changes: 12 additions & 1 deletion scripts/run.py

@@ -7,7 +7,7 @@
from termcolor import colored
from tqdm import tqdm

-from debug_gym.agents.base_agent import create_agent
+from debug_gym.agents.base_agent import AGENT_REGISTRY, create_agent
from debug_gym.agents.utils import load_config
from debug_gym.gym.envs import select_env
from debug_gym.gym.terminal import select_terminal
@@ -107,6 +107,17 @@ def main():
    env = create_env(config, logger=logger)
    problems = list(env.dataset.keys())  # all tasks

+    if args.list:
+        print(f"\n-= Available problems in {config['benchmark']}=-")
+        for problem in problems:
+            print(f" - {problem}")
+
+        # list agents
+        print("\n-= Available agents =-")
+        for agent in AGENT_REGISTRY:
+            print(f" - {agent}")
+        return
+
    num_workers = int(os.environ.get("DEBUG_GYM_WORKERS", 1))
    logger.warning(f"Running with {num_workers} workers")
    if args.debug:
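For reference, the `--list` branch above prints the benchmark's problems followed by the registered agent names, along the lines of the following (problem names are placeholders, and the agent list depends on what is registered; the missing space before `=-` mirrors the f-string as written):

    -= Available problems in mini_nightmare=-
     - <problem_1>
     - <problem_2>

    -= Available agents =-
     - debug_agent
     - debug_5_agent
     - debug_human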
tests/agents/test_debug_agent.py

@@ -1,7 +1,7 @@
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, call, patch

-from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
-from debug_gym.agents.llm_api import LLMResponse, TokenUsage
+from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent, DebugHumanInTheLoop
+from debug_gym.agents.llm_api import Human, LLMResponse, TokenUsage
from debug_gym.agents.rewrite_agent import RewriteAgent


@@ -95,3 +95,80 @@ def test_run_debug_5_agent(agent_setup, build_env_info):
    env.tools = {"pdb": MagicMock()}
    result = agent.run(task_name="test_task", debug=False)
    assert result
+
+
+@patch.object(
+    Human,
+    "__call__",
+    return_value=LLMResponse(
+        "Prompt",
+        '{"id": "pdb-267437", "name": "pdb", "arguments": {"command": "c"}}',
+        TokenUsage(2, 4),
+    ),
+)
+def test_human_in_the_loop(human, agent_setup, build_env_info):
+    agent, env, llm = next(agent_setup(DebugHumanInTheLoop))
+    env.reset.return_value = build_env_info(
+        done=False,
+        score=0,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    env.step.return_value = build_env_info(
+        done=False,
+        score=10,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+
+    env.clone.return_value = MagicMock()
+    llm.return_value = LLMResponse("Prompt", "Expected answer", TokenUsage(2, 4))
+    env.tools = {"pdb": MagicMock()}
+
+    env.clone().step.return_value = build_env_info(
+        done=True,
+        score=10,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    result = agent.run(task_name="test_task", debug=False)
+
+    assert result is False
+    # test that the human action was executed on the real env
+    assert env.step.called
+    env.step.assert_called_with(human().response)
+    assert env.step().done is False
+
+    # test that the human response was logged
+    _history, _prompt_response_pairs = agent.history.get()
+    assert [[], [human()]] == _prompt_response_pairs
+
+    # test that env was cloned
+    assert env.clone.called
+    assert env.clone().reset.called
+
+    # assert that cloned env was called with history steps
+    env.clone().step.assert_has_calls(
+        [
+            call(agent.history.get_all()[0].action),
+        ]
+    )
+
+    # test that the llm action was executed on the cloned env
+    assert env.clone().step.called
+    env.clone().step.assert_called_with(llm().response)
+
+    # ensure that the llm step was not recorded in history
+    assert env.clone().step() not in agent.history.get_all()
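To run just the new test locally (the dev extras in pyproject.toml provide pytest; the file path is the one shown above):

    pytest tests/agents/test_debug_agent.py -k test_human_in_the_loop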
19 changes: 19 additions & 0 deletions tests/gym/envs/test_env.py

@@ -538,3 +538,22 @@ def test_queue_and_process_events():
        call(environment=env, event=Event.ENV_RESET, source="source2", arg2="val2"),
    ]
    mock.assert_has_calls(expected_calls)
+
+
+def test_clone(env):
+    cloned_env = env.clone()
+
+    # Check that the cloned environment is a different instance
+    assert id(env) != id(cloned_env)
+
+    # Check that the properties are the same
+    assert env.path == cloned_env.path
+    assert env.entrypoint == cloned_env.entrypoint
+    assert env.debug_entrypoint == cloned_env.debug_entrypoint
+    assert env.max_score == cloned_env.max_score
+    assert env.run_timeout == cloned_env.run_timeout
+    assert env.dir_tree_depth == cloned_env.dir_tree_depth
+    assert env.logger == cloned_env.logger
+
+    # Check that the terminal is not the same instance
+    assert id(env.terminal) != id(cloned_env.terminal)