Skip to content

Commit

Permalink
Built site for gh-pages
Browse files Browse the repository at this point in the history
  • Loading branch information
jjallaire committed Sep 13, 2024
1 parent 2b8c74c commit 87c8f1d
Show file tree
Hide file tree
Showing 8 changed files with 21 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .nojekyll
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ef1b3979
e426d6d0
2 changes: 1 addition & 1 deletion eval-logs.html
Original file line number Diff line number Diff line change
Expand Up @@ -1102,7 +1102,7 @@ <h3 class="anchored" data-anchor-id="reading-logs">Reading Logs</h3>
</div>
</div>
</footer>
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","openEffect":"zoom","loop":false,"descPosition":"bottom","closeEffect":"zoom"});
<script>var lightboxQuarto = GLightbox({"loop":false,"descPosition":"bottom","closeEffect":"zoom","openEffect":"zoom","selector":".lightbox"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
2 changes: 1 addition & 1 deletion index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1026,7 +1026,7 @@ <h2 class="anchored" data-anchor-id="learning-more">Learning More</h2>
</div>
</div>
</footer>
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","closeEffect":"zoom","loop":false,"descPosition":"bottom","openEffect":"zoom"});
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","closeEffect":"zoom","openEffect":"zoom","loop":false,"descPosition":"bottom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
4 changes: 2 additions & 2 deletions log-viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ <h2 class="anchored" data-anchor-id="publishing">Publishing</h2>
<h4 class="anchored" data-anchor-id="other-notes">Other Notes</h4>
<ul>
<li><p>You may provide a default output directory for bundling the viewer in your <code>.env</code> file by setting the <code>INSPECT_VIEW_BUNDLE_OUTPUT_DIR</code> variable.</p></li>
<li><p>You may specify an S3 url as the target for bundled views. See the <a href="evallogs.qmd#sec-amazon-s3">Amazon S3</a> section for additional information on configuring S3.</p></li>
<li><p>You may specify an S3 url as the target for bundled views. See the <a href="./eval-logs.html#sec-amazon-s3">Amazon S3</a> section for additional information on configuring S3.</p></li>
<li><p>You can use the <code>bundle_log_dir()</code> function in Python directly to bundle the viewer and logs into an output directory.</p></li>
<li><p>The bundled viewer will show the first log file by default. You may link to the viewer to show a specific log file by including the url parameter <code>log_file=&lt;log_file&gt;</code>.</p></li>
<li><p>The bundled output directory includes a <code>robots.txt</code> file to prevent indexing of the folder. If you deploy this folder outside of the root of your website, we recommend that you update your root <code>robots.txt</code> file to exclude this folder from being indexed.</p></li>
Expand Down Expand Up @@ -1035,7 +1035,7 @@ <h4 class="anchored" data-anchor-id="other-notes">Other Notes</h4>
</div>
</div>
</footer>
<script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","loop":false,"descPosition":"bottom","openEffect":"zoom"});
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","descPosition":"bottom","loop":false,"openEffect":"zoom","closeEffect":"zoom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
2 changes: 1 addition & 1 deletion sitemap.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
</url>
<url>
<loc>https://inspect.ai-safety-institute.org.uk/log-viewer.html</loc>
<lastmod>2024-09-13T18:29:10.933Z</lastmod>
<lastmod>2024-09-13T18:30:32.644Z</lastmod>
</url>
<url>
<loc>https://inspect.ai-safety-institute.org.uk/vscode.html</loc>
Expand Down
26 changes: 13 additions & 13 deletions tutorial.html
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ <h2 class="anchored" data-anchor-id="sec-security-guide">Security Guide</h2>
<section id="setup" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect and defining a system message that orients the model to its role as a computer security expert.</p>
<div id="cfbeb9f2" class="cell">
<div id="a2029948" class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> csv_dataset</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> model_graded_fact</span>
Expand All @@ -459,7 +459,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup">Setup</h3>
<section id="eval" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval">Eval</h3>
<p>Discerning whether the correct security guidance was provided by the model might prove difficult using only text matching algorithms. Here we use a model to read the response and assess the quality of the answer.</p>
<div id="91fde1d5" class="cell">
<div id="0f668171" class="cell">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> security_guide():</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
Expand Down Expand Up @@ -489,7 +489,7 @@ <h2 class="anchored" data-anchor-id="sec-hellaswag">HellaSwag</h2>
<section id="setup-1" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup-1">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect, defining a system message, and writing a function to convert dataset records to samples (we need to do this to convert the index-based label in the dataset to a letter).</p>
<div id="324c60cb" class="cell">
<div id="785b2dae" class="cell">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample, hf_dataset</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> choice</span>
Expand All @@ -514,7 +514,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-1">Setup</h3>
<section id="eval-1" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-1">Eval</h3>
<p>We’ll load the dataset from <a href="https://huggingface.co/datasets/Rowan/hellaswag">HuggingFace</a> using the <code>hf_dataset()</code> function. We’ll draw data from the validation split, and use the <code>record_to_sample()</code> function to parse the records (we’ll also pass <code>trust=True</code> to indicate that we are okay with Hugging Face executing the dataset loading code provided by hellaswag):</p>
<div id="f9266cd4" class="cell">
<div id="0946a81b" class="cell">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> hellaswag():</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> </span>
Expand Down Expand Up @@ -574,7 +574,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-2">Setup</h3>
<li><code>record_to_sample()</code> to convert raw records to samples. Note that we need a function rather than just mapping field names with a <code>FieldSpec</code> because the <strong>answer</strong> field in the dataset needs to be divided into reasoning and the actual answer (which appears at the very end after <code>####</code>).</li>
<li><code>sample_to_fewshot()</code> to generate fewshot examples from samples.</li>
</ol>
<div id="0172f83b" class="cell">
<div id="28838989" class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, task</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample, hf_dataset</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> match</span>
Expand Down Expand Up @@ -621,7 +621,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-2">Setup</h3>
<section id="eval-2" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-2">Eval</h3>
<p>We’ll load the dataset from <a href="https://huggingface.co/datasets/gsm8k">HuggingFace</a> using the <code>hf_dataset()</code> function. By default we use 10 fewshot examples, but the <code>fewshot</code> task arg can be used to turn this up, down, or off. The <code>fewshot_seed</code> is provided for stability of fewshot examples across runs.</p>
<div id="210a9741" class="cell">
<div id="65369197" class="cell">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gsm8k(fewshot<span class="op">=</span><span class="dv">10</span>, fewshot_seed<span class="op">=</span><span class="dv">42</span>):</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="co"># build plan dynamically (may or may not be doing fewshot)</span></span>
Expand Down Expand Up @@ -688,7 +688,7 @@ <h2 class="anchored" data-anchor-id="sec-mathematics">Mathematics</h2>
<section id="setup-3" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup-3">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect and defining a prompt that asks the model to reason step by step and respond with its answer on a line at the end. It also nudges the model not to enclose its answer in <code>\boxed</code>, a LaTeX command for displaying equations that models often use in math output.</p>
<div id="b0973afe" class="cell">
<div id="77187fd3" class="cell">
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> re</span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, task</span>
Expand Down Expand Up @@ -726,7 +726,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-3">Setup</h3>
<section id="eval-3" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-3">Eval</h3>
<p>Here is the basic setup for our eval. We <code>shuffle</code> the dataset so that when we use <code>--limit</code> to develop on smaller slices we get some variety of inputs and results:</p>
<div id="4d3e56e1" class="cell">
<div id="8075ad01" class="cell">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> math(shuffle<span class="op">=</span><span class="va">True</span>):</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
Expand All @@ -749,7 +749,7 @@ <h3 class="unlisted anchored" data-anchor-id="eval-3">Eval</h3>
<span id="cb12-20"><a href="#cb12-20" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The heart of this eval isn’t in the task definition though; rather, it’s in how we grade the output. Math expressions can be logically equivalent but not literally the same. Consequently, we’ll use a model to assess whether the output and the target are logically equivalent. The <code>expression_equivalence()</code> custom scorer implements this:</p>
<div id="1a7acc60" class="cell">
<div id="f102a039" class="cell">
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="at">@scorer</span>(metrics<span class="op">=</span>[accuracy(), stderr()])</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> expression_equivalence():</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">async</span> <span class="kw">def</span> score(state: TaskState, target: Target):</span>
Expand Down Expand Up @@ -830,7 +830,7 @@ <h2 class="anchored" data-anchor-id="sec-tool-use">Tool Use</h2>
<section id="addition" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="addition">Addition</h3>
<p>We’ll demonstrate with a simple tool that adds two numbers, using the <code>@tool</code> decorator to register it with the system:</p>
<div id="674c50b4" class="cell">
<div id="78308f9f" class="cell">
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample</span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> includes, match</span>
Expand Down Expand Up @@ -865,7 +865,7 @@ <h3 class="unlisted anchored" data-anchor-id="addition">Addition</h3>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> y: Second number to add.</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Type annotations and descriptions are <em>required</em> for tool declarations so that the model can be informed which types to pass back to the tool function and what the purpose of each parameter is.</p>
<p>Now that we’ve defined the tool, we can use it in an evaluation by passing it to the <code>use_tools()</code> function.</p>
<div id="cb07da18" class="cell">
<div id="c3d039aa" class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> addition_problem():</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
Expand Down Expand Up @@ -894,7 +894,7 @@ <h3 class="unlisted anchored" data-anchor-id="task">Task</h3>
<ol start="2" type="1">
<li><code>ctf_agent()</code>, which defines the agent’s plan. The plan consists principally of using <code>bash()</code> and <code>python()</code> tools in a loop until the flag is discovered. We’ll describe this function in more detail below.</li>
</ol>
<div id="42750f7d" class="cell">
<div id="5ad6b4f0" class="cell">
<div class="sourceCode cell-code" id="cb22"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> textwrap <span class="im">import</span> dedent</span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> dataset <span class="im">import</span> read_dataset</span>
Expand All @@ -920,7 +920,7 @@ <h3 class="unlisted anchored" data-anchor-id="task">Task</h3>
</div>
<p>Note that we specify <code>sandbox="docker"</code> to ensure that code generated from the model is run in a secure <a href="agents.html#sec-sandbox-environments">sandbox environment</a>.</p>
<p>Here is the definition of the agent:</p>
<div id="34b38dc9" class="cell">
<div id="5a5d9307" class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="at">@plan</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> ctf_agent(max_attempts<span class="op">=</span><span class="dv">3</span>):</span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a> SYSTEM_MESSAGE <span class="op">=</span> dedent(<span class="st">"""</span></span>
Expand Down
2 changes: 1 addition & 1 deletion vscode.html
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@ <h2 class="anchored" data-anchor-id="troubleshooting">Troubleshooting</h2>
</div>
</div>
</footer>
<script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","descPosition":"bottom","loop":false,"selector":".lightbox"});
<script>var lightboxQuarto = GLightbox({"descPosition":"bottom","selector":".lightbox","closeEffect":"zoom","openEffect":"zoom","loop":false});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
2 changes: 1 addition & 1 deletion workflow.html
Original file line number Diff line number Diff line change
Expand Up @@ -1135,7 +1135,7 @@ <h2 class="anchored" data-anchor-id="eval-suites">Eval Suites</h2>
</div>
</div>
</footer>
<script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","loop":false,"descPosition":"bottom","openEffect":"zoom"});
<script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","loop":false,"openEffect":"zoom","selector":".lightbox","descPosition":"bottom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down

0 comments on commit 87c8f1d

Please sign in to comment.