diff --git a/docs/_tools-scaffold.md b/docs/_tools-scaffold.md
index c0794ddac..1506c6f6d 100644
--- a/docs/_tools-scaffold.md
+++ b/docs/_tools-scaffold.md
@@ -20,23 +20,30 @@ state.messages.append(output.message)
 state.messages.extend(call_tools(output.message, state.tools))
 ```

-This does everything that default `generate()` does, save for an outer loop to continue calling the mode as long as it continues calling tools. You could implement the outer loop as follows:
+This does everything that the default `generate()` does, save for an outer loop to continue calling the model as long as it continues calling tools. This is a complete solver agent that implements the outer loop:

 ``` python
-model = get_model()
-while True:
-    # call model
-    output = await model.generate(state.messages, state.tools)
-
-    # update state
-    state.output = output
-    state.messages.append(output.message)
-
-    # make tool calls or terminate if there are none
-    if output.message.tool_calls:
-        state.messages.extend(call_tools(output.message, state.tools))
-    else:
-        break
+@solver
+def agent_loop():
+    async def solve(state: TaskState, generate: Generate):
+        model = get_model()
+        while True:
+            # call model
+            output = await model.generate(state.messages, state.tools)
+
+            # update state
+            state.output = output
+            state.messages.append(output.message)
+
+            # make tool calls or terminate if there are none
+            if output.message.tool_calls:
+                state.messages.extend(call_tools(output.message, state.tools))
+            else:
+                break
+
+        return state
+
+    return solve
 ```

 You can imagine several ways you might want to customise this loop:
diff --git a/docs/agents-api.qmd b/docs/agents-api.qmd
index 502f59fbe..fce126ff3 100644
--- a/docs/agents-api.qmd
+++ b/docs/agents-api.qmd
@@ -11,13 +11,13 @@ This article describes advanced Inspect APIs available for creating evaluations
 5. Delegating work to sub-tasks
 6. Sandboxing arbitrary code execution

-We'll assume that you already understand Inspect [Solvers](solvers.qmd) and [Tools](tools.qmd) (please review those articles as required before proceeding).
+We'll assume that you have already covered the basics of [Solvers](solvers.qmd), [Tools](tools.qmd), and [Agents](agents.qmd) (please review those articles as required before proceeding).

 ## Use of `metadata`

 Before proceeding, it's important to point that some of the features described below were previously approximated by using the `metadata` field of `TaskState`, specifically `metadata` was often used as a catch-all storage location for:

-- Carrying state between solvers and sometimes tools.
+- Sharing state between solvers.
 - Providing a place to log additional structured data.
 - Recording calls to "helper" models used for elicitation or scoring.

@@ -138,7 +138,7 @@ from inspect_ai.log import transcript
 transcript().info("here is some custom info")
 ```

-You can pass arbitrary JSON serialisable objects to `info()`.
+Strings passed to `info()` will be rendered as markdown. In addition to strings, you can also pass arbitrary JSON serialisable objects to `info()`.

 ### Grouping with Steps

@@ -216,11 +216,11 @@ Note that we don't `await` the subtasks when building up our list of `searches`.

 ### Forking {#sec-forking}

-Inspect's `fork()` function provids a convenient wrapper around a very common use of subtasks: running a `TaskState` against a set of solvers in parallel to explore different trajectories.
+Inspect's `fork()` function provides a convenient wrapper around a very common use of subtasks: running a `TaskState` against a set of solvers in parallel to explore different trajectories.

 For example, let's say you have a solver named `explore()` that takes `temperature` as a parameter. You might want to try the solver out with multiple temperature values and then continue on with the best result:

-```python
+``` python
 from inspect_ai.solver import fork

 results = await fork(state, [
@@ -241,7 +241,7 @@ Many agents provide models with the ability to execute arbitrary code. It's impo
 def file_probe()
     return Task(
         dataset=dataset,
-        plan=[
+        solver=[
             use_tools([list_files()]),
             generate()
         ],
diff --git a/docs/agents.qmd b/docs/agents.qmd
index b92347ded..c6b7c5817 100644
--- a/docs/agents.qmd
+++ b/docs/agents.qmd
@@ -4,6 +4,8 @@

 Agents combine planning, memory, and tool usage to pursue more complex, longer horizon tasks (e.g. a [Capture the Flag](https://en.wikipedia.org/wiki/Capture_the_flag_(cybersecurity)) challenge). Agents are an area of active research, and many schemes for implementing them have been developed, including [AutoGPT](https://arxiv.org/abs/2306.02224), [ReAct](https://arxiv.org/pdf/2303.11366.pdf), and [Reflexion](https://arxiv.org/pdf/2303.11366.pdf).

+An agent isn't a special construct within Inspect; it's merely a solver that includes tool use and calls `generate()` internally to interact with the model.
+
 Inspect supports a variety of approaches to agent evaluations, including:

 1. Using Inspect's built-in `basic_agent()`.
@@ -12,8 +14,6 @@ Inspect supports a variety of approaches to agent evaluations, including:
 3. Adapting an agent provided by a research paper or open source library (for example, using a 3rd party agent library like [LangChain](https://python.langchain.com/docs/modules/agents/) or [Langroid](https://langroid.github.io/langroid/)).

-We'll cover the basics of all of these approaches below.
-
 An important additional consideration for agent evaluations is sandboxing (providing a secure environment for models to execute code within). The [Sandbox Environments](#sec-sandbox-environments) section goes into more depth on this.

 ## Basic Agent {#sec-basic-agent}

@@ -22,17 +22,17 @@ The `basic_agent()`provides a ReAct tool loop with support for retries and encou

 1. When developing tasks and datasets it's convenient to have a ready made agent that you know that will competently navigate your task.

-2. When developing custom agents, it's a good idea to start out with an idea of how the model performs using its native planning and eliciatation capabilities. The basic agent is a good way to establish this baseline.
+2. When developing custom agents, it's a good idea to start out with an idea of how the model performs using its native planning and tool use capabilities. The basic agent is a good way to establish this baseline.

 3. It provides a sound basis for comparison of the native agentic capabilities of models both over time and across providers.

-The basic agent incorporates best practices for giving models some additional resilience and persistence, both through the optional `max_attempts` parameter, as well as by continuing the task even when the model stops making tool calls. The basic agent can frequently match or exeed custom scaffolds, so you should always try it as a baseline for your tasks!
+The basic agent incorporates best practices for giving models some additional resilience and persistence, both through the optional `max_attempts` parameter and by continuing the task even when the model stops making tool calls. The basic agent can frequently match or exceed custom scaffolds, so you should always try it as a baseline for your tasks!

 Note that when using the basic agent you should *always* set a `max_messages` so that there is some termination point if the model gets off track or stuck in a loop.

 ### Example

-Here is an example use of `basic_agent()` as the `plan` for a CTF evaluation:
+Here is an example use of `basic_agent()` as the `solver` for a CTF evaluation:

 ``` python
 from inspect_ai import Task, task
@@ -54,7 +54,7 @@ you are going to use and how they fit into your plan. # <1>
 def ctf():
     return Task(
         dataset=json_dataset("ctf.json"),
-        plan=basic_agent(
+        solver=basic_agent(
             init=system_message(SYSTEM_MESSAGE),
             tools=[bash(timeout=180), python(timeout=180)], # <2>
             max_attempts=3, # <3>
@@ -92,10 +92,9 @@ There are several options available for customising the behaviour of the basic a

 For multiple attempts, submissions are evaluated using the task's main scorer, with value of 1.0 indicating a correct answer. Scorer values are converted to float (e.g. "C" becomes 1.0) using the standard `value_to_float()` function. Provide an alternate conversion scheme as required via `score_value`.

-
 ## Custom Scaffold {#sec-custom-scaffolding}

-The basic agent demonstrated above will work well for some tasks, but in other cases you may need to provide more custom logic. For example, you might want to:
+The basic agent demonstrated above will work well for some tasks, but in other cases you may want to provide more custom logic. For example, you might want to:

 {{< include _tools-scaffold.md >}}

@@ -103,11 +102,11 @@
 ### Tool Filtering

 While its possible to make tools globally available to the model via `use_tools()`, you may also want to filter the available tools either based on task stages or dynamically based on some other criteria.

-Here's an example of a `Solver` that filters the available tools between calls to `generate()`:
+Here's an example of a solver agent that filters the available tools between calls to `generate()`:

 ``` python
 @solver
-def generate_ctf():
+def ctf_agent():
     async def solve(state: TaskState, generate: Generate):

         # first pass w/ core tools
@@ -128,8 +127,6 @@ def generate_ctf():
     return solve
 ```

-In this example we rely on the default `generate()` tool calling behaviour (`"loop"`). However, you can also imaging combining tool filtering with the more tailored tool calling logic described in [Tool Calls](#sec-tool-calls).
-
 ### Agents API

 For more sophisticated agents, Inspect offers several additional advanced APIs for state management, sub-agents, and fine grained logging. See the [Agents API](agents-api.qmd) article for additional details.
@@ -259,7 +256,7 @@ Finally, here's a task that uses the `wikipedia_search()` solver:
 def wikipedia() -> Task:
     return Task(
         dataset=json_dataset("wikipedia.jsonl"),
-        plan=wikipedia_search(),
+        solver=wikipedia_search(),
         scorer=model_graded_fact(),
     )
 ```
@@ -327,7 +324,7 @@ dataset = [
 def file_probe()
     return Task(
         dataset=dataset,
-        plan=[
+        solver=[
             use_tools([list_files()]),
             generate()
         ],
diff --git a/docs/caching.qmd b/docs/caching.qmd
index 63698f510..5747316e6 100644
--- a/docs/caching.qmd
+++ b/docs/caching.qmd
@@ -25,7 +25,7 @@ For example, here we are iterating on our self critique template, so we cache th
 def theory_of_mind():
     return Task(
         dataset=example_dataset("theory_of_mind"),
-        plan=[
+        solver=[
             chain_of_thought(),
             generate(cache = True),
             self_critique(CRITIQUE_TEMPLATE)
diff --git a/docs/datasets.qmd b/docs/datasets.qmd
index 50c5a777e..2dfed0881 100644
--- a/docs/datasets.qmd
+++ b/docs/datasets.qmd
@@ -243,7 +243,7 @@ dataset=MemoryDataset([
 def security_guide():
     return Task(
         dataset=dataset,
-        plan=[system_message(SYSTEM_MESSAGE), generate()],
+        solver=[system_message(SYSTEM_MESSAGE), generate()],
         scorer=model_graded_fact(),
     )
 ```
diff --git a/docs/errors-and-limits.qmd b/docs/errors-and-limits.qmd
index 001d4f7c1..4859da67a 100644
--- a/docs/errors-and-limits.qmd
+++ b/docs/errors-and-limits.qmd
@@ -24,7 +24,7 @@ In some cases you might wish to tolerate some number of errors without failing t
 def intercode_ctf():
     return Task(
         dataset=read_dataset(),
-        plan=[
+        solver=[
             system_message("system.txt"),
             use_tools([bash(timeout=120)]),
             generate(),
@@ -65,7 +65,7 @@ In open-ended model conversations (for example, an agent evalution with tool usa
 def intercode_ctf():
     return Task(
         dataset=read_dataset(),
-        plan=[
+        solver=[
             system_message("system.txt"),
             use_tools([bash(timeout=120)]),
             generate(),
diff --git a/docs/examples/index.qmd b/docs/examples/index.qmd
index 3232179df..a06a32961 100644
--- a/docs/examples/index.qmd
+++ b/docs/examples/index.qmd
@@ -14,6 +14,7 @@ aliases: