From 9866a2fd70daba8b659393d38b7442d38ef00bb6 Mon Sep 17 00:00:00 2001 From: "J.J. Allaire" Date: Sat, 21 Sep 2024 11:07:53 +0100 Subject: [PATCH 1/3] plan -> solver in docs, examples, and tests --- docs/_tools-scaffold.md | 37 +++--- docs/agents-api.qmd | 12 +- docs/agents.qmd | 25 ++-- docs/caching.qmd | 2 +- docs/datasets.qmd | 2 +- docs/errors-and-limits.qmd | 4 +- docs/examples/index.qmd | 1 + docs/index.qmd | 125 +++++++++++++++--- docs/log-viewer.qmd | 4 +- docs/models.qmd | 4 +- docs/parallelism.qmd | 2 +- docs/scorers.qmd | 10 +- docs/solvers.qmd | 50 ++++--- docs/tools.qmd | 10 +- docs/tutorial.qmd | 22 +-- docs/workflow.qmd | 94 ++++++++++--- evals/agieval/agieval.py | 37 +++--- evals/arc/arc.py | 2 +- evals/boolq/boolq.py | 2 +- evals/commonsense_qa/commonsense_qa.py | 2 +- evals/drop/drop.py | 16 +-- evals/gaia/gaia.py | 12 +- evals/gdm_capabilities/in_house_ctf/task.py | 6 +- evals/gdm_capabilities/intercode_ctf/task.py | 2 +- evals/gpqa/gpqa.py | 2 +- evals/gsm8k/gsm8k.py | 8 +- evals/hellaswag/hellaswag.py | 2 +- evals/humaneval/humaneval.py | 2 +- evals/ifeval/ifeval.py | 2 +- evals/mathematics/mathematics.py | 22 +-- evals/mathvista/mathvista.py | 2 +- evals/mbpp/mbpp.py | 2 +- evals/mmlu/mmlu.py | 6 +- evals/mmlu_pro/mmlu_pro.py | 12 +- evals/mmmu/mmmu.py | 4 +- evals/piqa/piqa.py | 2 +- evals/pubmedqa/pubmedqa.py | 2 +- evals/race-h/race-h.py | 2 +- evals/squad/squad.py | 2 +- evals/swe_bench/swe_bench.py | 6 +- evals/swe_bench/test_swe_bench.py | 4 +- evals/truthfulqa/truthfulqa.py | 2 +- evals/winogrande/winogrande.py | 13 +- evals/xstest/xstest.py | 2 +- examples/biology_qa.py | 2 +- examples/cache.py | 10 +- examples/hello_world.py | 2 +- examples/images/images.py | 2 +- examples/langchain/wikipedia.py | 2 +- examples/popularity.py | 2 +- examples/security_guide.py | 2 +- examples/theory_of_mind.py | 6 +- examples/tool_use.py | 10 +- src/inspect_ai/_cli/eval.py | 2 +- src/inspect_ai/_eval/loader.py | 2 +- src/inspect_ai/solver/_plan.py | 5 +- src/inspect_ai/solver/_solver.py | 2 +- tests/model/test_mock_model_llm.py | 2 +- tests/solver/test_basic_agent.py | 4 +- tests/solver/test_fork.py | 2 +- tests/solver/test_prompt.py | 2 +- tests/solver/test_store.py | 4 +- tests/solver/test_subtask.py | 2 +- tests/solver/test_transcript.py | 2 +- tests/test_eval_set.py | 6 +- tests/test_extensions.py | 2 +- tests/test_fail_on_error.py | 4 +- tests/test_helpers/utils.py | 2 +- .../inspect_package/inspect_extensions.py | 2 +- .../inspect_package/{plans => solvers}/cot.py | 0 tests/test_run_dir/task1/task1.py | 2 +- tests/test_run_dir/task2/task2.py | 2 +- tests/test_task_state.py | 2 +- tests/tools/test_sandbox_tool_eval.py | 8 +- tests/tools/test_tool_types.py | 6 +- tests/tools/test_tools.py | 14 +- .../test_docker_compose_multiple_services.py | 2 +- tests/util/test_images.py | 2 +- .../vscode/assets/templates/task.py.template | 2 +- 79 files changed, 444 insertions(+), 261 deletions(-) rename tests/test_package/inspect_package/{plans => solvers}/cot.py (100%) diff --git a/docs/_tools-scaffold.md b/docs/_tools-scaffold.md index c0794ddac..1506c6f6d 100644 --- a/docs/_tools-scaffold.md +++ b/docs/_tools-scaffold.md @@ -20,23 +20,30 @@ state.messages.append(output.message) state.messages.extend(call_tools(output.message, state.tools)) ``` -This does everything that default `generate()` does, save for an outer loop to continue calling the mode as long as it continues calling tools. 
You could implement the outer loop as follows: +This does everything that default `generate()` does, save for an outer loop to continue calling the mode as long as it continues calling tools. This is a complete solver agent that implements the outer loop: ``` python -model = get_model() -while True: - # call model - output = await model.generate(state.messages, state.tools) - - # update state - state.output = output - state.messages.append(output.message) - - # make tool calls or terminate if there are none - if output.message.tool_calls: - state.messages.extend(call_tools(output.message, state.tools)) - else: - break +@solver +def agent_loop(): + async def solve(state: TaskState, generate: Generate): + model = get_model() + while True: + # call model + output = await model.generate(state.messages, state.tools) + + # update state + state.output = output + state.messages.append(output.message) + + # make tool calls or terminate if there are none + if output.message.tool_calls: + state.messages.extend(call_tools(output.message, state.tools)) + else: + break + + return state + + return solve ``` You can imagine several ways you might want to customise this loop: diff --git a/docs/agents-api.qmd b/docs/agents-api.qmd index 502f59fbe..fce126ff3 100644 --- a/docs/agents-api.qmd +++ b/docs/agents-api.qmd @@ -11,13 +11,13 @@ This article describes advanced Inspect APIs available for creating evaluations 5. Delegating work to sub-tasks 6. Sandboxing arbitrary code execution -We'll assume that you already understand Inspect [Solvers](solvers.qmd) and [Tools](tools.qmd) (please review those articles as required before proceeding). +We'll assume that you have already covered the basics of [Solvers](solvers.qmd), [Tools](tools.qmd), and [Agents](agents.qmd) (please review those articles as required before proceeding). ## Use of `metadata` Before proceeding, it's important to point that some of the features described below were previously approximated by using the `metadata` field of `TaskState`, specifically `metadata` was often used as a catch-all storage location for: -- Carrying state between solvers and sometimes tools. +- Sharing state between solvers. - Providing a place to log additional structured data. - Recording calls to "helper" models used for elicitation or scoring. @@ -138,7 +138,7 @@ from inspect_ai.log import transcript transcript().info("here is some custom info") ``` -You can pass arbitrary JSON serialisable objects to `info()`. +Strings passed to `info()` will be rendered as markdown. In addition to strings you can also pass arbitrary JSON serialisable objects to `info()`. ### Grouping with Steps @@ -216,11 +216,11 @@ Note that we don't `await` the subtasks when building up our list of `searches`. ### Forking {#sec-forking} -Inspect's `fork()` function provids a convenient wrapper around a very common use of subtasks: running a `TaskState` against a set of solvers in parallel to explore different trajectories. +Inspect's `fork()` function provids a convenient wrapper around a very common use of subtasks: running a `TaskState` against a set of solvers in parallel to explore different trajectories. For example, let's say you have a solver named `explore()` that takes `temperature` as a parameter. You might want to try the solver out with multiple temperature values and then continue on with the best result: -```python +``` python from inspect_ai.solver import fork results = await fork(state, [ @@ -241,7 +241,7 @@ Many agents provide models with the ability to execute arbitrary code. 
It's impo def file_probe() return Task( dataset=dataset, - plan=[ + solver=[ use_tools([list_files()]), generate() ], diff --git a/docs/agents.qmd b/docs/agents.qmd index b92347ded..c6b7c5817 100644 --- a/docs/agents.qmd +++ b/docs/agents.qmd @@ -4,6 +4,8 @@ Agents combine planning, memory, and tool usage to pursue more complex, longer horizon tasks (e.g. a [Capture the Flag](https://en.wikipedia.org/wiki/Capture_the_flag_(cybersecurity)) challenge). Agents are an area of active research, and many schemes for implementing them have been developed, including [AutoGPT](https://arxiv.org/abs/2306.02224), [ReAct](https://arxiv.org/pdf/2303.11366.pdf), and [Reflexion](https://arxiv.org/pdf/2303.11366.pdf). +An agent isn't a special construct within Inspect, it's merely a solver that includes tool use and calls `generate()` internally to interact with the model. + Inspect supports a variety of approaches to agent evaluations, including: 1. Using Inspect's built-in `basic_agent()`. @@ -12,8 +14,6 @@ Inspect supports a variety of approaches to agent evaluations, including: 3. Adapting an agent provided by a research paper or open source library (for example, using a 3rd party agent library like [LangChain](https://python.langchain.com/docs/modules/agents/) or [Langroid](https://langroid.github.io/langroid/)). -We'll cover the basics of all of these approaches below. - An important additional consideration for agent evaluations is sandboxing (providing a secure environment for models to execute code within). The [Sandbox Environments](#sec-sandbox-environments) section goes into more depth on this. ## Basic Agent {#sec-basic-agent} @@ -22,17 +22,17 @@ The `basic_agent()`provides a ReAct tool loop with support for retries and encou 1. When developing tasks and datasets it's convenient to have a ready made agent that you know that will competently navigate your task. -2. When developing custom agents, it's a good idea to start out with an idea of how the model performs using its native planning and eliciatation capabilities. The basic agent is a good way to establish this baseline. +2. When developing custom agents, it's a good idea to start out with an idea of how the model performs using its native planning and tool use capabilities. The basic agent is a good way to establish this baseline. 3. It provides a sound basis for comparison of the native agentic capabilities of models both over time and across providers. -The basic agent incorporates best practices for giving models some additional resilience and persistence, both through the optional `max_attempts` parameter, as well as by continuing the task even when the model stops making tool calls. The basic agent can frequently match or exeed custom scaffolds, so you should always try it as a baseline for your tasks! +The basic agent incorporates best practices for giving models some additional resilience and persistence, both through the optional `max_attempts` parameter, as well as by continuing the task even when the model stops making tool calls. The basic agent can frequently match or exceed custom scaffolds, so you should always try it as a baseline for your tasks! Note that when using the basic agent you should *always* set a `max_messages` so that there is some termination point if the model gets off track or stuck in a loop. 
### Example -Here is an example use of `basic_agent()` as the `plan` for a CTF evaluation: +Here is an example use of `basic_agent()` as the `solver` for a CTF evaluation: ``` python from inspect_ai import Task, task @@ -54,7 +54,7 @@ you are going to use and how they fit into your plan. # <1> def ctf(): return Task( dataset=json_dataset("ctf.json"), - plan=basic_agent( + solver=basic_agent( init=system_message(SYSTEM_MESSAGE), tools=[bash(timeout=180), python(timeout=180)], # <2> max_attempts=3, # <3> @@ -92,10 +92,9 @@ There are several options available for customising the behaviour of the basic a For multiple attempts, submissions are evaluated using the task's main scorer, with value of 1.0 indicating a correct answer. Scorer values are converted to float (e.g. "C" becomes 1.0) using the standard `value_to_float()` function. Provide an alternate conversion scheme as required via `score_value`. - ## Custom Scaffold {#sec-custom-scaffolding} -The basic agent demonstrated above will work well for some tasks, but in other cases you may need to provide more custom logic. For example, you might want to: +The basic agent demonstrated above will work well for some tasks, but in other cases you may want to provide more custom logic. For example, you might want to: {{< include _tools-scaffold.md >}} @@ -103,11 +102,11 @@ The basic agent demonstrated above will work well for some tasks, but in other c While its possible to make tools globally available to the model via `use_tools()`, you may also want to filter the available tools either based on task stages or dynamically based on some other criteria. -Here's an example of a `Solver` that filters the available tools between calls to `generate()`: +Here's an example of a solver agent that filters the available tools between calls to `generate()`: ``` python @solver -def generate_ctf(): +def ctf_agent(): async def solve(state: TaskState, generate: Generate): # first pass w/ core tools @@ -128,8 +127,6 @@ def generate_ctf(): return solve ``` -In this example we rely on the default `generate()` tool calling behaviour (`"loop"`). However, you can also imaging combining tool filtering with the more tailored tool calling logic described in [Tool Calls](#sec-tool-calls). - ### Agents API For more sophisticated agents, Inspect offers several additional advanced APIs for state management, sub-agents, and fine grained logging. See the [Agents API](agents-api.qmd) article for additional details. 
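
For comparison with the `basic_agent()` example above, a custom solver agent plugs into a task the same way. Below is a minimal sketch, assuming the `ctf_agent()` solver defined above; the `ctf_custom` task name, `ctf.json` dataset, `includes()` scorer, and `max_messages` value are illustrative only and mirror the earlier example:

``` python
from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import includes

@task
def ctf_custom():
    return Task(
        dataset=json_dataset("ctf.json"),
        # the custom agent is passed via `solver`, just like basic_agent()
        solver=ctf_agent(),
        scorer=includes(),
        # always bound open-ended agent conversations
        max_messages=30,
    )
```

The list form (`solver=[use_tools(...), generate()]`) also remains available for simple chained solvers, as the other files touched by this change show.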
@@ -259,7 +256,7 @@ Finally, here's a task that uses the `wikipedia_search()` solver: def wikipedia() -> Task: return Task( dataset=json_dataset("wikipedia.jsonl"), - plan=wikipedia_search(), + solver=wikipedia_search(), scorer=model_graded_fact(), ) ``` @@ -327,7 +324,7 @@ dataset = [ def file_probe() return Task( dataset=dataset, - plan=[ + solver=[ use_tools([list_files()]), generate() ], diff --git a/docs/caching.qmd b/docs/caching.qmd index 63698f510..5747316e6 100644 --- a/docs/caching.qmd +++ b/docs/caching.qmd @@ -25,7 +25,7 @@ For example, here we are iterating on our self critique template, so we cache th def theory_of_mind(): return Task( dataset=example_dataset("theory_of_mind"), - plan=[ + solver=[ chain_of_thought(), generate(cache = True), self_critique(CRITIQUE_TEMPLATE) diff --git a/docs/datasets.qmd b/docs/datasets.qmd index 50c5a777e..2dfed0881 100644 --- a/docs/datasets.qmd +++ b/docs/datasets.qmd @@ -243,7 +243,7 @@ dataset=MemoryDataset([ def security_guide(): return Task( dataset=dataset, - plan=[system_message(SYSTEM_MESSAGE), generate()], + solver=[system_message(SYSTEM_MESSAGE), generate()], scorer=model_graded_fact(), ) ``` diff --git a/docs/errors-and-limits.qmd b/docs/errors-and-limits.qmd index 001d4f7c1..4859da67a 100644 --- a/docs/errors-and-limits.qmd +++ b/docs/errors-and-limits.qmd @@ -24,7 +24,7 @@ In some cases you might wish to tolerate some number of errors without failing t def intercode_ctf(): return Task( dataset=read_dataset(), - plan=[ + solver=[ system_message("system.txt"), use_tools([bash(timeout=120)]), generate(), @@ -65,7 +65,7 @@ In open-ended model conversations (for example, an agent evalution with tool usa def intercode_ctf(): return Task( dataset=read_dataset(), - plan=[ + solver=[ system_message("system.txt"), use_tools([bash(timeout=120)]), generate(), diff --git a/docs/examples/index.qmd b/docs/examples/index.qmd index 3232179df..a06a32961 100644 --- a/docs/examples/index.qmd +++ b/docs/examples/index.qmd @@ -14,6 +14,7 @@ aliases: