Agents
Overview
Agents combine planning, memory, and tool usage to pursue more complex, longer horizon tasks (e.g. a Capture the Flag challenge). Agents are an area of active research, and many schemes for implementing them have been developed, including AutoGPT, ReAct, and Reflexion.
Inspect supports a variety of approaches to agent evaluations, including:

- Using Inspect's built-in tool-use loop along with a ReAct prompt that encourages the model to explicitly reason about each tool usage. When you call generate() and the model responds with a tool call, Inspect will automatically re-prompt the model for another generation.
- Implementing your own outer scaffolding loop on top of the default generate() behavior. This will involve repeated calls to generate() with various tools being made available in the TaskState for each call. It may also involve using the model to help determine what actions to take next.
- Adapting another scaffolding scheme provided by a research paper or open source library (for example, using a 3rd party agent library like LangChain or Langroid).

We'll cover the basics of all of these approaches below.
An important additional consideration for agent evaluations is sandboxing (providing a secure environment for models to execute code within). The Tool Environments section goes into more depth on this.
The features described in this section are not yet available in the version of Inspect published to PyPI (rather, they are only available in the development version of Inspect). To install the development version:
$ pip install git+https://github.com/UKGovernmentBEIS/inspect_ai.git
If you are building agent evaluations based on the documentation here, you should install the development version before proceeding.
Tool Use Loop
A basic agent can be implemented by providing tools to the model with use_tools() and then calling generate(). Every time the model calls a tool, the appropriate Python function is called and then the model is re-prompted to generate based on the output of the function. This is typically combined with a ReAct prompt that urges the model to reason about each action it takes. For example:
= """
+ prompt_template(template Each message may perform one function call. You will
+ see the result of the function right after sending
+ the message. If you need to perform multiple actions,
+ you can always send more messages with subsequent
+ function calls. Do some reasoning before your actions,
+ describing what function calls you are going to use
+ and how they fit into your plan.
+
+ {prompt}
+""")
Note that this is merely an example! A production ReAct prompt would typically be longer and more detailed, and would usually include some few-shot examples from the dataset domain. See the Prompt Engineering Guide: ReAct for additional details.
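For reference, here is a minimal sketch (the dataset name, timeout, and scorer are illustrative) of how a ReAct-style template like the one above might be combined with the built-in tool use loop in a task plan:

from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import includes
from inspect_ai.solver import bash, generate, prompt_template, use_tools

# ReAct-style template as shown above (assumed assigned to a variable)
REACT_TEMPLATE = """
Each message may perform one function call. ...

{prompt}
"""

@task
def react_agent():
    return Task(
        dataset=json_dataset("challenges.jsonl"),   # illustrative dataset
        plan=[
            prompt_template(REACT_TEMPLATE),  # prompt the model to reason first
            use_tools([bash(timeout=180)]),   # tools drive the built-in loop
            generate()
        ],
        scorer=includes(),
        tool_environment="docker"  # required because bash() uses a tool environment
    )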
Example: InterCode CTF
This example implements the CTF Benchmark from the InterCode paper (the numbered notes following the code provide additional explanation):
from dataset import read_dataset

from inspect_ai import Task, task
from inspect_ai.scorer import includes
from inspect_ai.solver import (
    Generate, TaskState, bash, generate, python, solver,
    system_message, tool_environment, use_tools
)

CMD_TIMEOUT = 180  # max seconds to run bash/python cmds
MAX_MESSAGES = 30  # max chat messages before giving up

@task
def intercode_ctf(shuffle=False):
    return Task(
        dataset=read_dataset(shuffle),
        plan=[
            system_message("system.txt"),
            use_tools([
                bash(timeout=CMD_TIMEOUT),
                python(timeout=CMD_TIMEOUT)
            ]),
            sample_init(),
            generate(),
        ],
        scorer=includes(),
        max_messages=MAX_MESSAGES,
        tool_environment="docker",
    )

@solver
def sample_init():
    async def solve(state: TaskState, generate: Generate):

        # run setup code if provided
        if state.metadata.get("setup") is not None:
            await tool_environment().exec(
                ["bash", "-c", state.metadata["setup"]]
            )

        # add available files (if any) to prompt
        files = state.metadata["files"]
        if len(files) > 0:
            state.user_prompt.text = (
                "The following files are located in the "
                + f"working directory: {' '.join(files)}"
                + f"\n\n{state.user_prompt.text}"
            )

        # return state
        return state

    return solve
Code notes:

1. The read_dataset() function (imported from dataset.py) downloads the data from the InterCode GitHub repo and converts it into a native Inspect Dataset.
2. The system prompt (system.txt) describes the CTF challenge, provides a ReAct prompt, and includes several few-shot examples.
3. Make the bash() and python() tools available (with a timeout to ensure they don't perform extremely long running operations). Note that using these tools requires a tool environment, which is provided below.
4. For each sample we run some initialization code that executes a custom setup command (if provided) and adds a list of files included with the sample to the prompt.
5. Specify that Docker should be used as the tool environment (the container is built from the provided Dockerfile).
6. InterCode samples can include a "setup" field with a command to run before executing the sample.
7. Amend the prompt with a list of files copied to the working directory for the sample.
The full source code for this example can be found in the Inspect GitHub repo at examples/agents/intercode-ctf.
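Assuming the task file is available locally, you could then run it with eval() (the model name and sample limit below are purely illustrative):

from inspect_ai import eval

# run the first few samples as a smoke test against an illustrative model
eval(intercode_ctf(), model="openai/gpt-4", limit=5)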
Custom Scaffolding
The default tool use loop above will work fine for some evaluations, but in other cases you may need to provide more custom logic. For example, you might want to:
- Urge the model to continue (or take a different path) if it gives up; or
- Have multiple generate() passes, each with a distinct set of tools.
Here's a solver that prompts the model to keep going after a failure to come up with a valid submission. Note that you might implement this with a limited number of "retries" (a variant is sketched after the code notes below) or, as illustrated here, you might rely on max_messages to terminate the evaluation:
@task
def ctf():
    return Task(
        dataset=csv_dataset("data"),
        plan=[
            use_tools(bash(180), python(180)),
            generate_until_submission()
        ],
        max_messages=30,
        tool_environment="docker"
    )

@solver
def generate_until_submission():
    async def solve(state: TaskState, generate: Generate):

        while not state.completed:

            state = await generate(state)
            if has_submission(state.output.completion):
                break

            state.messages.append(ChatMessageUser(
                content="Keep going, you can do it!"
            ))

        return state

    return solve
Code notes:

1. Set a timeout of 3 minutes for tool execution.
2. Custom solver that re-prompts the model after it gives up.
3. The loop in generate_until_submission() will not always terminate unless a max_messages is supplied!
4. When max_messages is exceeded, state.completed will be set to True (which terminates the loop).
5. Run the standard generate tool use loop and check to see if the model came up with a submission (or alternatively gave up).
6. Provide a user message that urges the model to continue.
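As noted above, an alternative to relying solely on max_messages is to cap the number of re-prompts directly. Here's a sketch of that variant (has_submission() is the same helper referenced in the code above, which you would implement to detect a flag submission in the model's output):

@solver
def generate_until_submission(max_retries: int = 5):
    async def solve(state: TaskState, generate: Generate):

        retries = 0
        while not state.completed and retries <= max_retries:

            state = await generate(state)
            if has_submission(state.output.completion):
                break

            # no submission yet: count the retry and urge the model on
            retries += 1
            state.messages.append(ChatMessageUser(
                content="Keep going, you can do it!"
            ))

        return state

    return solve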
Here's an example of a Solver that filters the available tools between calls to generate():
@solver
def generate_ctf():
    async def solve(state: TaskState, generate: Generate):

        # first pass w/ tools
        state.tools = [decompile(), disassemble(), bash()]
        state = await generate(state)

        # second pass w/ prompt and different tools
        state.tools = [python()]
        state.messages.append(ChatMessageUser(
            content="Use Python to extract the flag."
        ))
        state = await generate(state)

        # clear tools and return
        state.tools = []
        return state

    return solve
You can imagine many other variations on the examples above. The key thing to take from these examples is that you can use custom solvers to wrap code around the default generate() tool use loop.
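For instance, here is a hypothetical sketch of how generate_ctf() might be plugged into a task (the dataset, system prompt, and scorer are illustrative). Note that use_tools() isn't needed in the plan because the solver assigns state.tools directly:

@task
def reversing_ctf():
    return Task(
        dataset=json_dataset("challenges.jsonl"),   # illustrative
        plan=[
            system_message("system.txt"),           # illustrative system prompt
            generate_ctf()
        ],
        scorer=includes(),
        max_messages=50,
        tool_environment="docker"
    )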
Agent Libraries
You can also adapt code from a research paper or 3rd party agent library to run within an Inspect solver. Below we'll provide an example of doing this for a LangChain Agent.
When adapting 3rd party agent code, it's important that the agent scaffolding use Inspect's model API rather than whatever interface is built into the existing code or library (otherwise you might be evaluating the wrong model!). If the agent is executing arbitrary code, it's also beneficial to use Inspect Tool Environments for sandboxing.
Example: LangChain
This example demonstrates how to integrate a LangChain Agent with Inspect. The agent uses Wikipedia via the Tavily Search API to perform question answering tasks. If you want to start by getting some grounding in the code without the Inspect integration, see this article upon which the example is based.
The main things that an integration with an agent framework needs to account for are:

- Bridging Inspect's model API into the API of the agent framework. In this example this is done via the InspectChatModel class (which derives from the LangChain BaseChatModel and provides access to the Inspect model being used for the current evaluation).
- Bridging from the Inspect solver interface to the standard input and output types of the agent library. In this example this is provided by the langchain_solver() function, which takes a LangChain agent function and converts it to an Inspect solver.
Here's the implementation of langchain_solver() (imports excluded for brevity):
# Interface for LangChain agent function
class LangChainAgent(Protocol):
    async def __call__(self, llm: BaseChatModel, input: dict[str, Any]): ...

# Convert a LangChain agent function into a Solver
def langchain_solver(agent: LangChainAgent) -> Solver:

    async def solve(state: TaskState, generate: Generate) -> TaskState:

        # create the inspect model api bridge
        llm = InspectChatModel()

        # call the agent
        output = await agent(
            llm=llm,
            input=dict(
                input=state.user_prompt.text,
                chat_history=as_langchain_chat_history(
                    state.messages[1:]
                ),
            )
        )

        # collect output from llm interface
        state.messages = llm.messages
        state.output = llm.output
        state.output.completion = output

        # return state
        return state

    return solve

# LangChain BaseChatModel for Inspect Model API
class InspectChatModel(BaseChatModel):
    async def _agenerate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: AsyncCallbackManagerForLLMRun | None = None,
        **kwargs: dict[str, Any],
    ) -> ChatResult:
        ...
Note that the inspect_langchain module imported here is not a built-in feature of Inspect. Rather, you can find its source code as part of the example. You can use this to create your own LangChain agents or as the basis for creating similar integrations with other agent frameworks.
Now here's the wikipedia_search() solver (imports again excluded for brevity):
@solver
def wikipedia_search(
    max_iterations: int | None = 15,
    max_execution_time: float | None = None
) -> Solver:
    # standard prompt for tools agent
    prompt = hub.pull("hwchase17/openai-tools-agent")

    # tavily and wikipedia tools
    tavily_api = TavilySearchAPIWrapper()  # type: ignore
    tools = (
        [TavilySearchResults(api_wrapper=tavily_api)] +
        load_tools(["wikipedia"])
    )

    # agent function
    async def agent(
        llm: BaseChatModel,
        input: dict[str, Any]
    ) -> str | list[str | dict[str, Any]]:
        # create agent
        tools_agent = create_openai_tools_agent(
            llm, tools, prompt
        )
        executor = AgentExecutor.from_agent_and_tools(
            agent=cast(BaseMultiActionAgent, tools_agent),
            tools=tools,
            name="wikipedia_search",
            max_iterations=max_iterations,
            max_execution_time=max_execution_time
        )

        # execute the agent and return output
        result = await executor.ainvoke(input)
        return result["output"]

    # return agent function as inspect solver
    return langchain_solver(agent)
Code notes:

1. Note that we register native LangChain tools. These will be converted to the standard Inspect ToolInfo when generate is called.
2. This is the standard interface to LangChain agents. We take this function and automatically create a standard Inspect solver from it when we pass it to langchain_solver() below.
3. Invoke the agent using the chat history passed in input. We call the async executor API to play well with Inspect's concurrency.
4. The langchain_solver() function maps the simpler agent function semantics into the standard Inspect solver API.
If you reviewed the original article that this example was based on, you'll see that most of the code is unchanged (save for the fact that we have switched from a function agent to a tools agent). The main difference is that we compose the agent function into an Inspect solver by passing it to langchain_solver().
Finally, here's a task that uses the wikipedia_search() solver:
@task
def wikipedia() -> Task:
    return Task(
        dataset=json_dataset("wikipedia.jsonl"),
        plan=wikipedia_search(),
        scorer=model_graded_fact(),
    )
The full source code for this example can be found in the Inspect GitHub repo at examples/agents/langchain.
Tool Environments
The examples shown above execute tool code within the main process running the evaluation task. In some cases however, you may require the provisioning of dedicated environments for running tool code. This might be the case if:

- You are creating tools that enable execution of arbitrary code (e.g. a tool that executes shell commands or Python code).
- You need to provision per-sample file system resources.
- You want to provide access to a more sophisticated evaluation environment (e.g. creating network hosts for a cybersecurity eval).
Tool environments are not yet available in the version of Inspect published to PyPI (rather, they are only available in the development version of Inspect). To install the development version:
$ pip install git+https://github.com/UKGovernmentBEIS/inspect_ai.git
Example: File Listing
Let's take a look at a simple example to illustrate. First, we'll define a list_files() tool. This tool needs to access the ls command—it does so by calling the tool_environment() function to get access to the ToolEnvironment instance for the currently executing Sample:
from inspect_ai.solver import tool, tool_environment

@tool(prompt="Use the list_files function to enumerate files.")
def list_files():
    async def execute(dir: str):
        """List the files in a directory.

        Args:
            dir (str): Directory

        Returns:
            File listing of the directory
        """
        result = await tool_environment().exec(["ls", dir])
        if result.success:
            return result.stdout
        else:
            return f"Error: {result.stderr}"

    return execute
The exec() function is used to list the directory contents. Note that it's not immediately clear where or how exec() is implemented (that will be described shortly!).
Here’s an evaluation that makes use of this tool:
from inspect_ai import task, Task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import includes
from inspect_ai.solver import generate, use_tools

dataset = [
    Sample(
        input='Is there a file named "bar.txt" '
              + 'in the current directory?',
        target="Yes",
        files={"bar.txt": "hello"},
    )
]

@task
def file_probe():
    return Task(
        dataset=dataset,
        plan=[
            use_tools([list_files()]),
            generate()
        ],
        tool_environment="docker",
        scorer=includes(),
    )
We've included tool_environment="docker" to indicate that tool environment operations should be executed in a Docker container. Specifying a tool environment (either at the task or evaluation level) is required if your tools call the tool_environment() function.
Note that files are specified as part of the Sample. Files can be specified inline using plain text (as depicted above), inline using a base64-encoded data URI, or as a path to a file or remote resource (e.g. S3 bucket). Relative file paths are resolved according to the location of the underlying dataset file.
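For example, here's a hypothetical sample that provides a file from disk rather than inline (the path and expected answer are purely illustrative):

Sample(
    input='Print the first line of "notes.txt" in the working directory.',
    target="hello world",                   # illustrative expected answer
    files={"notes.txt": "files/notes.txt"}  # path resolved relative to the dataset file
)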
Environment Interface
The following methods are available for all tool environments:
class ToolEnvironment:

    async def exec(
        self,
        cmd: list[str],
        input: str | bytes | None = None,
        env: dict[str, str] = {},
        timeout: int | None = None,
    ) -> ExecResult[str]:
        ...

    async def write_file(
        self, file: str, contents: str | bytes
    ) -> None:
        ...

    async def read_file(
        self, file: str, text: bool = True
    ) -> Union[str | bytes]:
        ...
Environment Binding
There are two tool environments built into Inspect:
| Environment Type | Description |
|---|---|
| local | Run tool_environment() methods in the same address space and file system as the running evaluation. The local environment should only be used if you are already running your evaluation in another sandbox. |
| docker | Run tool_environment() methods within a Docker container (see the Docker Configuration section below for additional details). |
Tool environments can be bound at the Task level or at the eval() level (where eval() takes precedence). To bind a tool environment to a Task, use the tool_environment option:
Task(
    dataset=dataset,
    plan=[
        use_tools([read_file(), list_files()]),
        generate()
    ],
    scorer=match(),
    tool_environment="docker"
)
For this example, if there is a compose.yaml file in the task directory it will be used to provision Docker services (if there is no compose.yaml then the default Python 3.12 image will be used). You can specify an alternate config file using a tuple:
=("docker", "my-compose.yaml") tool_environment
Similar conventions exist for eval() and the CLI:
eval(task, tool_environment="docker")
eval(task, tool_environment=("docker", "my-compose.yaml"))
$ inspect eval --tool-environment docker
$ inspect eval --tool-environment docker:my-compose.yaml
Docker Configuration
While --tool-environment can be a default un-configured environment (e.g. "docker"), more commonly you'll provide explicit configuration in either a Dockerfile or a Docker Compose configuration file (compose.yaml).
Here is how Docker tool environments are created based on the presence of a Dockerfile and/or compose.yaml in the task directory:
| Config Files | Behavior |
|---|---|
| None | Creates a tool environment based on the official python:3.12-bookworm image. |
| Dockerfile | Creates a tool environment by building the image. |
| compose.yaml | Creates tool environment(s) based on compose.yaml. |
Here is what a simple compose.yaml would look like for a single tool environment that uses the ctf-agent-environment Docker image:
compose.yaml
services:
  default:
    image: ctf-agent-environment
    cpus: 1.0
    mem_limit: 0.5gb
Note that we’ve also chosen to limit the CPU and memory usage of the container (see the Docker Compose documentation for information on these and other container options).
Multiple Environments
In some cases you may want to create multiple tool environments (e.g. if one environment has complex dependencies that conflict with the dependencies of other environments). To do this specify multiple named services:
compose.yaml
services:
  default:
    image: ctf-agent-environment
    cpus: 1.0
    mem_limit: 0.5gb
  ghidra:
    image: ctf-ghidra-environment
    cpus: 1.0
    mem_limit: 1gb
The first environment listed is the "default" environment, and can be accessed from within a tool with a normal call to tool_environment(). Other environments would be accessed by name, for example:
tool_environment()          # default tool environment
tool_environment("ghidra")  # named tool environment
If you define multiple tool environments you are required to name one of them "default" so that Inspect knows which environment to copy sample files to and which to resolve for calls to tool_environment() without an argument.
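For example, here is a hypothetical tool that executes a command in the named "ghidra" environment rather than the default one (the command itself is purely illustrative):

@tool(prompt="Use the decompile function to decompile a binary.")
def decompile():
    async def execute(binary: str):
        """Decompile a binary.

        Args:
            binary (str): Path to the binary to decompile

        Returns:
            Decompiler output (or an error message)
        """
        # run inside the named "ghidra" environment rather than the default
        result = await tool_environment("ghidra").exec(
            ["decompile", binary]  # illustrative command available in that image
        )
        return result.stdout if result.success else f"Error: {result.stderr}"

    return execute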
Infrastructure
Note that in many cases you'll want to provision additional infrastructure (e.g. other hosts or volumes). For example, here we define an additional container ("writer") as well as a volume shared between the default container and the writer container:
services:
  default:
    image: ctf-agent-environment
    volumes:
      - ctf-challenge-volume:/shared-data

  writer:
    image: ctf-challenge-writer
    volumes:
      - ctf-challenge-volume:/shared-data

volumes:
  ctf-challenge-volume:
See the documentation on Docker Compose files for information on their full schema and feature set.
Resource Management
Creating and executing code within Docker containers can be expensive both in terms of memory and CPU utilization. Inspect provides some automatic resource management to keep usage reasonable in the default case. This section describes that behavior as well as how you can tune it for your use-cases.
Running Containers
As described above, each Sample is provisioned its own container. The number of running containers for an evaluation is therefore determined by the max_samples option (which is by default set to max_connections, typically 10 unless overridden).
Use max_samples to dial up or down the number of containers running at any given time. Note that a running container does not necessarily use CPU resources unless it has active background processes.
Concurrent Execution
The ToolEnvironment.exec() method runs a command within a tool environment, typically consuming CPU resources. To protect against overwhelming the system's CPUs, the implementation of exec() uses Inspect's subprocess() function, which automatically limits concurrent child processes to the number of CPUs on your system (os.cpu_count()).
You can change the number of permitted concurrent subprocess executions using the max_subprocesses option. You might do this for example if you know that your exec() commands tend to use multiple CPU cores and thus should be executed with less concurrency.
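For example, a sketch using eval() (the task file, model, and values are illustrative):

# run at most 5 samples (and therefore containers) concurrently, and
# permit at most 2 concurrent tool subprocess executions
eval("ctf.py", model="openai/gpt-4", max_samples=5, max_subprocesses=2)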
Troubleshooting
You can view more detailed logging around the creation and use of tool environments by using the tools log level. For example:
$ inspect eval ctf.py --log-level tools
The tools log level is just above warning (so it will not show http or debug level messages).