diff --git a/.github/workflows/log_viewer.yml b/.github/workflows/log_viewer.yml index 9d3950b18..08ea304f2 100644
--- a/.github/workflows/log_viewer.yml
+++ b/.github/workflows/log_viewer.yml
@@ -43,28 +43,31 @@ jobs:
       - name: Run eslint
         run: yarn eslint

-  build:
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        working-directory: src/inspect_ai/_view/www
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22.x"
-      - name: Install dependencies
-        run: yarn install
+  # TODO: This is failing even with a freshly generated build.js file
+  # Need to debug or better understand the cause
+  # build:
+  #   runs-on: ubuntu-latest
+  #   defaults:
+  #     run:
+  #       working-directory: src/inspect_ai/_view/www
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Set up Node.js
+  #       uses: actions/setup-node@v4
+  #       with:
+  #         node-version: "22.x"
+  #     - name: Install dependencies
+  #       run: yarn install

-      - name: Build log viewer
-        run: yarn build
+  #     - name: Build log viewer
+  #       run: yarn build

-      - name: Ensure dist changes are checked in
-        run: |
-          if [[ $(git status --porcelain) != "" ]]
-          then
-            echo "Log viewer dist files have not been updated, please run yarn build and check in the changes."
-            git status
-            exit 1
-          fi
+  #     - name: Ensure dist changes are checked in
+  #       run: |
+  #         if [[ $(git status --porcelain) != "" ]]
+  #         then
+  #           echo "Log viewer dist files have not been updated, please run yarn build and check in the changes."
+  #           git status
+  #           git diff dist/assets/index.js
+  #           exit 1
+  #         fi
diff --git a/benchmarks/README.md b/benchmarks/README.md index e7558b54e..cf3fa0395 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,6 +1,6 @@
 ## Benchmarks

-This directory contains evals for several benchmarks. Datasets for evals are not embedded in the repository but are rather ether downloaded either directly from their source URL or via Hugging Face datasets. To use Hugging Face datasets please install the datasets package with `pip install datasets`.
+This directory contains evals for several benchmarks. Datasets for evals are not embedded in the repository but are rather downloaded either directly from their source URL or via Hugging Face datasets. To use Hugging Face datasets please install the datasets package with `pip install datasets`.

 | Benchmark | Reference | Code | Dataset |
 |-------------------------|----------------|---------------:|----------------|
diff --git a/docs/_quarto.yml b/docs/_quarto.yml index a75d511a7..aa62749f7 100644
--- a/docs/_quarto.yml
+++ b/docs/_quarto.yml
@@ -71,10 +71,12 @@ book:
       chapters:
         - caching.qmd
         - parallelism.qmd
+        - agents-api.qmd
         - eval-logs.qmd
         - eval-suites.qmd
         - extensions.qmd

+  toc-depth: 2
   number-sections: true
   number-depth: 2
diff --git a/docs/agents-api.qmd b/docs/agents-api.qmd new file mode 100644 index 000000000..eec4e70ff
--- /dev/null
+++ b/docs/agents-api.qmd
@@ -0,0 +1,307 @@
+---
+title: "Agents API"
+format: html
+---
+
+::: callout-note
+The Agents API described in this article is currently available only in the development version of Inspect. You can install the development version with:
+
+``` {.bash .code-overflow-wrap}
+pip install git+https://github.com/ukgovernmentbeis/inspect_ai
+```
+:::
+
+## Overview
+
+This article describes advanced Inspect APIs available for creating evaluations with agents.
+You can also build agent evals using Inspect's default ReAct tool use loop or by bridging to an external agent library (see the main [Agents](agents.qmd) article for further details). Topics covered in this article include:
+
+1. Sharing state across solvers and tools
+2. Creating a custom tool use loop
+3. Dynamically customising tool descriptions
+4. Observability with sample transcripts
+5. Delegating work to sub-tasks
+6. Sandboxing arbitrary code execution
+
+We'll assume that you already understand Inspect [Solvers](solvers.qmd) and [Tools](tools.qmd) (please review those articles as required before proceeding).
+
+## Use of `metadata`
+
+Before proceeding, it's important to point out that some of the features described below were previously approximated by using the `metadata` field of `TaskState`. Specifically, `metadata` was often used as a catch-all storage location for:
+
+- Carrying state between solvers and sometimes tools.
+- Providing a place to log additional structured data.
+- Recording calls to "helper" models used for elicitation or scoring.
+
+The `metadata` field no longer needs to be used for these scenarios (and in fact should now be treated as a read-only part of the `TaskState`). Below we'll describe how the `Store` can be used for state, how structured data can be logged to the sample `Transcript`, and how all model calls are now automatically recorded and included in the transcript.
+
+## Sharing State
+
+Sequences of solvers often need to store and manipulate shared state. Further, tools may often want their own persistent state (or groups of tools may want to share state). This can be accomplished in Inspect using the `Store`, which provides a scoped scratchpad for arbitrary values.
+
+The core of the `Store` interface is:
+
+``` python
+from inspect_ai.solver import Store
+
+class Store:
+    def get(self, key: str, default: VT) -> VT
+    def set(self, key: str, value: Any) -> None
+    def delete(self, key: str) -> None
+```
+
+Basic views on the store's collection (e.g. `items()`, `keys()`, `values()`) are also provided. Note that the `get()` method will automatically add the `default` to the store if it doesn't exist.
+
+The `Store` can be accessed via `TaskState` as follows:
+
+``` python
+history = state.store.get("history", [])
+```
+
+It is also possible to access the `Store` *for the current sample* using the `store()` function. This is the mechanism for tools to read and write the `Store`. For example:
+
+``` python
+from inspect_ai.solver import store
+from inspect_ai.tool import tool
+
+@tool
+def web_browser_back():
+    async def execute() -> str:
+        history = store().get("web_browser:history", [])
+        return history.pop()
+```
+
+While there is no formal namespacing mechanism for the `Store`, this can be informally achieved using key prefixes as demonstrated above.
+
+You should generally try to use JSON serialisable Python types in the `Store` (e.g. objects should be dataclasses or Pydantic BaseModel) so that they can be recorded in the [Transcript](#sec-transcripts).
+
+While the default `Store` for a sample is shared globally between solvers and tools, a more narrowly scoped `Store` is created automatically for [Subtasks](#sec-subtasks).
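+
+To make this concrete, here is a minimal sketch of a solver that uses the `Store` to track how many elicitation attempts have been made (the `attempts` key and the solver itself are illustrative, not part of Inspect):
+
+``` python
+from inspect_ai.solver import Generate, TaskState, solver
+
+@solver
+def count_attempts():
+    async def solve(state: TaskState, generate: Generate) -> TaskState:
+        # read the current count (get() adds the default if not present)
+        attempts = state.store.get("attempts", 0)
+        # ...elicit another attempt from the model here...
+        state.store.set("attempts", attempts + 1)
+        return state
+
+    return solve
+```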
+
+## Tool Use
+
+### Custom Loop
+
+The higher level `generate()` function passed to solvers includes a built-in tool use loop—when the model calls a tool, Inspect calls the underlying Python function and reports the result to the model, proceeding until the model stops calling tools. However, for more advanced agents you may want to intervene in the tool use loop in a variety of ways:
+
+1. Urge the model to continue (or take a different path) if it gives up.
+2. Exercise more fine-grained control over which, when, and how many tool calls are made.
+3. Redirect the model to another trajectory if it's not on a productive course.
+4. Have multiple `generate()` passes each with a distinct set of tools.
+
+To do this, create a solver that emulates the default tool use loop and provides additional customisation as required. Here is the code at the core of Inspect tool use in `generate()`:
+
+``` python
+model = get_model()
+state.output = await model.generate(
+    state.messages, state.tools
+)
+state.messages.append(state.output.message)
+state.messages.extend(
+    call_tools(state.output.message, state.tools)
+)
+```
+
+This does everything that default `generate()` does, save for an outer loop to continue calling the model as long as it continues calling tools. You could implement the outer loop as follows:
+
+``` python
+model = get_model()
+while True:
+    state.output = await model.generate(
+        state.messages, state.tools
+    )
+    state.messages.append(state.output.message)
+    if state.output.message.tool_calls:
+        state.messages.extend(
+            call_tools(state.output.message, state.tools)
+        )
+    else:
+        break
+```
+
+Note that you don't necessarily even need to structure the agent using a loop. For example, you might have an inner function implementing the loop, while an outer function dynamically swaps out what tools are available. Imagine the above was implemented in a function named `tool_use_loop()`; you might then have an outer function like this:
+
+``` python
+# first pass w/ core tools
+state.tools = [decompile(), disassemble(), bash()]
+state = await tool_use_loop(state)
+
+# second pass w/ python tool only
+state.tools = [python()]
+state = await tool_use_loop(state)
+```
+
+Taken together these APIs enable you to build a custom version of `generate()` with whatever structure and logic you need.
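+
+For example, here is a sketch of intervention (1) above—a loop that nudges the model to keep going the first time it stops calling tools. The nudge message and single-retry policy are illustrative assumptions rather than Inspect behaviour:
+
+``` python
+from inspect_ai.model import ChatMessageUser, get_model
+
+model = get_model()
+nudged = False
+while True:
+    state.output = await model.generate(state.messages, state.tools)
+    state.messages.append(state.output.message)
+    if state.output.message.tool_calls:
+        state.messages.extend(call_tools(state.output.message, state.tools))
+        nudged = False
+    elif not nudged:
+        # model stopped calling tools: urge it to continue (once)
+        state.messages.append(
+            ChatMessageUser(content="Please continue working on the task.")
+        )
+        nudged = True
+    else:
+        break
+```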
+
+### Tool Descriptions
+
+In some cases you may want to change the default descriptions created by a tool author—for example you might want to provide better disambiguation between multiple similar tools that are used together. You might also need to do this during development of tools (to explore what descriptions are most useful to models).
+
+The `tool_with()` function enables you to take any tool and adapt its name and/or descriptions. For example:
+
+``` python
+from inspect_ai.tool import tool_with
+
+my_add = tool_with(
+    tool=add(),
+    name="my_add",
+    description="a tool to add numbers",
+    parameters={
+        "x": "the x argument",
+        "y": "the y argument"
+    })
+```
+
+You need not provide all of the parameters shown above; for example, here we modify just the main tool description, or only a single parameter:
+
+``` python
+my_add = tool_with(add(), description="a tool to add numbers")
+my_add = tool_with(add(), parameters={"x": "the x argument"})
+```
+
+Note that the `tool_with()` function returns a copy of the passed tool with modified descriptions (the passed tool retains its original descriptions).
+
+## Transcripts {#sec-transcripts}
+
+Transcripts provide a rich per-sample sequential view of everything that occurs during plan execution and scoring, including:
+
+- Model interactions (including the raw API call made to the provider).
+- Tool calls (including a sub-transcript of activity within the tool).
+- Changes (in [JSON Patch](https://jsonpatch.com/) format) to the `TaskState` for the `Sample`.
+- Scoring (including a sub-transcript of interactions within the scorer).
+- Custom `info()` messages inserted explicitly into the transcript.
+- Python logger calls (`info` level or designated custom `log-level`).
+
+This information is provided within the Inspect log viewer in the **Transcript** tab (which sits alongside the Messages, Scoring, and Metadata tabs in the per-sample display).
+
+### Custom Info
+
+You can insert custom entries into the transcript via the Transcript `info()` method (which creates an `InfoEvent`). Access the transcript for the current sample using the `transcript()` function, for example:
+
+``` python
+from inspect_ai.solver import transcript
+
+transcript().info("here is some custom info")
+```
+
+You can pass arbitrary JSON serialisable objects to `info()`.
+
+### Grouping with Steps
+
+You can create arbitrary groupings of transcript activity using the Transcript `step()` context manager. For example:
+
+``` python
+with transcript().step("reasoning"):
+    ...
+    state.store.set("next-action", next_action)
+```
+
+There are two reasons that you might want to create steps:
+
+1. Any changes to the store which occur during a step will be collected into a `StoreEvent` that records the changes (in [JSON Patch](https://jsonpatch.com/) format) that occurred.
+2. The Inspect log viewer will create a visual delineation for the step, which will make it easier to see the flow of activity within the transcript.
+
+## Subtasks {#sec-subtasks}
+
+Subtasks provide a mechanism for creating isolated, re-usable units of execution. You might implement a complex tool using a subtask or might use them in a multi-agent evaluation. The main characteristics of sub-tasks are:
+
+1. They run in their own async coroutine.
+2. They have their own isolated `Store` (no access to the sample `Store`).
+3. They have their own isolated `Transcript`.
+
+To create a subtask, declare an async function with the `@subtask` decorator. The function can take any arguments and return a value of any type. For example:
+
+``` python
+from inspect_ai.solver import store, subtask
+
+@subtask
+async def web_search(keywords: str) -> str:
+    # get links for these keywords
+    links = await search_links(keywords)
+
+    # add links to the store so they end up in the transcript
+    store().set("links", links)
+
+    # summarise the links
+    return await fetch_and_summarise(links)
+```
+
+Note that we add `links` to the `store` not because we strictly need to for our implementation, but because we want the links to be recorded as part of the transcript.
+
+Call the subtask as you would any async function:
+
+``` python
+summary = await web_search(keywords="solar power")
+```
+
+A few things will occur automatically when you run a subtask:
+
+- New isolated `Store` and `Transcript` objects will be created for the subtask (accessible via the `store()` and `transcript()` functions). Changes to the `Store` that occur during execution will be recorded in a `StoreEvent`.
+
+- A `SubtaskEvent` will be added to the current transcript. The event will include the name of the subtask, its input and results, and a transcript of all events that occur within the subtask.
+
+You can also include one or more steps within a subtask.
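+
+For example, here is a variation of `web_search()` that wraps its two phases in steps (the step names are arbitrary, and `search_links()`/`fetch_and_summarise()` are the same hypothetical helpers as above):
+
+``` python
+from inspect_ai.solver import store, subtask, transcript
+
+@subtask
+async def web_search(keywords: str) -> str:
+    with transcript().step("search"):
+        # store changes within the step are collected into a StoreEvent
+        links = await search_links(keywords)
+        store().set("links", links)
+    with transcript().step("summarise"):
+        return await fetch_and_summarise(links)
+```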
+
+### Parallel Execution
+
+You can execute subtasks in parallel using `asyncio.gather()`. For example, to run 3 `web_search()` subtasks in parallel:
+
+``` python
+import asyncio
+
+searches = [
+  web_search(keywords="solar power"),
+  web_search(keywords="wind power"),
+  web_search(keywords="hydro power"),
+]
+
+results = await asyncio.gather(*searches)
+```
+
+Note that we don't `await` the subtasks when building up our list of `searches`. Rather, we let `asyncio.gather()` await all of them, returning only when all of the results are available.
+
+## Sandboxing
+
+Many agents provide models with the ability to execute arbitrary code. It's important that this code be sandboxed so that it executes in an isolated context. Inspect supports this through the `SandboxEnvironment` (which in turn may be implemented using Docker or various other schemes). Enable sandboxing for a task with the `sandbox` parameter. For example:
+
+``` python
+@task
+def file_probe():
+    return Task(
+        dataset=dataset,
+        plan=[
+            use_tools([list_files()]),
+            generate()
+        ],
+        sandbox="docker",
+        scorer=includes(),
+    )
+```
+
+Use the `SandboxEnvironment` within a tool via the `sandbox()` function. For example, here's an implementation of the `list_files()` tool referenced above:
+
+``` python
+from inspect_ai.tool import ToolError, tool
+from inspect_ai.util import sandbox
+
+@tool
+def list_files():
+    async def execute(dir: str):
+        """List the files in a directory.
+
+        Args:
+            dir (str): Directory
+
+        Returns:
+            File listing of the directory
+        """
+        result = await sandbox().exec(["ls", dir])
+        if result.success:
+            return result.stdout
+        else:
+            raise ToolError(result.stderr)
+
+    return execute
+```
+
+See the section on [Sandbox Environments](agents.qmd#sec-sandbox-environments) for further details on using sandboxes with Inspect.
\ No newline at end of file
diff --git a/docs/agents.qmd b/docs/agents.qmd index c95003707..90fcc0414 100644
--- a/docs/agents.qmd
+++ b/docs/agents.qmd
@@ -16,7 +16,7 @@ We'll cover the basics of all of these approaches below.

 An important additional consideration for agent evaluations is sandboxing (providing a secure environment for models to execute code within). The [Sandbox Environments](#sec-sandbox-environments) section goes into more depth on this.

-## Tool Use Loop
+## Tool Use Loop {#sec-tool-use-loop}

 A basic agent can be implemented by providing tools to the model with `use_tools()` and then calling `generate()`. Every time the model calls a tool, the appropriate Python function is called and then the model is re-prompted to generate based on the output of the function. This is typically combined with a ReAct prompt that urges the model to reason about each action it takes. For example:

@@ -165,7 +165,11 @@ def generate_ctf():

 In this example we rely on the default `generate()` tool calling behaviour (`"loop"`). However, you can also imagine combining tool filtering with the more tailored tool calling logic described in [Tool Calls](#sec-tool-calls).

-## Agent Libraries
+### Agents API
+
+For more sophisticated agents, Inspect offers several additional advanced APIs for state management, sub-agents, and fine-grained logging. See the [Agents API](agents-api.qmd) article for additional details.
+
+## Agent Libraries {#sec-agent-libraries}

 You can also adapt code from a research paper or 3rd party agent library to run within an Inspect solver. Below we'll provide an example of doing this for a [LangChain Agent](https://python.langchain.com/v0.2/docs/tutorials/agents/).
diff --git a/docs/extensions.qmd b/docs/extensions.qmd index 9f6c4d34e..a301b9a73 100644
--- a/docs/extensions.qmd
+++ b/docs/extensions.qmd
@@ -16,7 +16,7 @@ For each of these, you can create an extension within a Python package, and then

 You can add a model provider by deriving a new class from `ModelAPI` and then creating a function decorated by `@modelapi` that returns the class. These are typically implemented in separate files (for reasons described below):

-```{.python filename="custom.py"}
+``` {.python filename="custom.py"}
 class CustomModelAPI(ModelAPI):
     def __init__(
         self,
@@ -39,7 +39,7 @@ class CustomModelAPI(ModelAPI):
         ...
 ```

-```{.python filename="providers.py"}
+``` {.python filename="providers.py"}
 @modelapi(name="custom")
 def custom():
     from .custom import CustomModelAPI
@@ -47,16 +47,17 @@ def custom():
     return CustomModelAPI
 ```

-
 The layer of indirection (creating a function that returns a ModelAPI class) is done so that you can separate the registration of models from the importing of libraries they require (important for limiting dependencies). You can see this used within Inspect to make all model package dependencies optional [here](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/model/_providers/providers.py). With this scheme, packages required to interface with models (e.g. `openai`, `anthropic`, `vllm`, etc.) are only imported when their model API type is actually used.

 The `__init__()` method *must* call the `super().__init__()` method, and typically instantiates the model client library.

-The `__init__()` method receive a `**model_args` parameter that will carry any custom `model_args` (or `-M` arguments from the CLI) specified by the user. You can then pass these on to the approriate place in your model initialisation code (for example, here is what many of the built-in providers do with `model_args` passed to them: ).
+The `__init__()` method receives a `**model_args` parameter that will carry any custom `model_args` (or `-M` arguments from the CLI) specified by the user. You can then pass these on to the appropriate place in your model initialisation code (for example, here is what many of the built-in providers do with `model_args` passed to them: ).
+
+The `generate()` method handles interacting with the model, converting inspect messages, tools, and config into model native data structures. Note that the generate method may optionally return a `tuple[ModelOutput,ModelCall]` in order to record the raw request and response to the model within the sample transcript.

-The `generate()` method handles interacting with the model, converting inspect messages, tools, and config into model native data structures. In addition, there are some optional properties you can override to specify various behaviours and constraints (default max tokens and connections, identifying rate limit errors, whether to collapse consecutive user and/or assistant messages, etc.).
+In addition, there are some optional properties you can override to specify various behaviours and constraints (default max tokens and connections, identifying rate limit errors, whether to collapse consecutive user and/or assistant messages, etc.). See the [ModelAPI](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/model/_model.py) source code for further documentation on these properties.

-See the [ModelAPI](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/model/_model.py) source code for further documentation on these properties. See the implementation of the [built-in model providers](https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_ai/model/_providers) for additional insight on building a custom provider.
+See the implementation of the [built-in model providers](https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_ai/model/_providers) for additional insight on building a custom provider.
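+
+To make this concrete, here is a rough sketch of a minimal `generate()` for the `CustomModelAPI` shown earlier. The `self.client` object and its `complete()` method are placeholder assumptions standing in for a real client library, and the exact `generate()` signature should be checked against the `ModelAPI` source:
+
+``` python
+from inspect_ai.model import ModelAPI, ModelOutput
+
+class CustomModelAPI(ModelAPI):
+    # sketch only: see the ModelAPI source for the exact signature and types
+    async def generate(self, input, tools, tool_choice, config):
+        # convert Inspect chat messages into the client's native format
+        native = [{"role": message.role, "content": message.text} for message in input]
+        # call the model via the (hypothetical) client created in __init__()
+        response = await self.client.complete(native, max_tokens=config.max_tokens)
+        # wrap the native response in an Inspect ModelOutput
+        return ModelOutput.from_content(self.model_name, response.text)
+```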

 ### Model Registration

@@ -104,12 +105,12 @@ eval(math, model = "custom/my-model")

 [Sandbox Environments](#sec-sandbox-environments) provide a mechanism for sandboxing execution of tool code as well as providing more sophisticated infrastructure (e.g. creating network hosts for a cybersecurity eval). Inspect comes with two sandbox environments built in:

-| Environment Type | Description |
+| Environment Type | Description |
 |----------------------------|--------------------------------------------|
-| `local` | Run `sandbox()` methods in the same file system as the running evaluation (should *only be used* if you are already running your evaluation in another sandbox). |
-| `docker` | Run `sandbox()` methods within a Docker container |
+| `local` | Run `sandbox()` methods in the same file system as the running evaluation (should *only be used* if you are already running your evaluation in another sandbox). |
+| `docker` | Run `sandbox()` methods within a Docker container |

-To create a custom sandbox environment, derive a class from `SandboxEnvironment`, implement the required static and instance methods, and add the `@sandboxenv` decorator to it.
+To create a custom sandbox environment, derive a class from `SandboxEnvironment`, implement the required static and instance methods, and add the `@sandboxenv` decorator to it.

 The static class methods control the lifecycle of containers and other computing resources associated with the `SandboxEnvironment`:

@@ -157,13 +158,13 @@ class PodmanSandboxEnvironment(SandboxEnvironment):

 The class methods take care of various stages of initialisation, setup, and teardown:

-| Method | Lifecycle | Purpose |
+| Method | Lifecycle | Purpose |
 |-------------------|-------------------|----------------------------------|
-| `task_init()` | Called once for each unique sandbox environment config before executing the tasks in an `eval()` run. | Expensive initialisation operations (e.g. pulling or building images) |
-| `sample_init()` | Called at the beginning of each `Sample`. | Create `SandboxEnvironment` instances for the sample. |
-| `sample_cleanup()` | Called at the end of each `Sample` | Cleanup `SandboxEnvironment` instances for the sample. |
-| `task_cleanup()` | Called once for each unique sandbox environment config after executing the tasks in an `eval()` run. | Last chance handler for any resources not yet cleaned up (see also discussion below). |
-| `cli_cleanup()` | Called via `inspect sandbox cleanup` | CLI invoked manual cleanup of resources created by this `SandboxEnvironment`. |
+| `task_init()` | Called once for each unique sandbox environment config before executing the tasks in an `eval()` run.
| Last chance handler for any resources not yet cleaned up (see also discussion below). | +| `cli_cleanup()` | Called via `inspect sandbox cleanup` | CLI invoked manual cleanup of resources created by this `SandboxEnvironment`. | In the case of parallel execution of a group of tasks within the same working directory, the `task_init()` and `task_cleanup()` functions will be called once for each unique sandbox environment configuration (e.g. Docker Compose file). This is a performance optimisation derived from the fact that initialisation and cleanup are shared for tasks with identical configurations. @@ -187,7 +188,6 @@ The `SandboxEnvironment` instance methods provide access to process execution an {{< include _sandboxenv-interface.md >}} - The best way to learn about writing sandbox environments is to look at the source code for the built in environments, [LocalSandboxEnvironment](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/util/_sandbox/local.py) and [DockerSandboxEnvironment](https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/util/_sandbox/docker/docker.py). ### Environment Registration diff --git a/docs/theme.scss b/docs/theme.scss index 808a32239..e5bac34ed 100644 --- a/docs/theme.scss +++ b/docs/theme.scss @@ -16,7 +16,7 @@ padding-top: 5px !important; } -.sidebar-menu-container > ul > li:first-of-type { +.sidebar-menu-container>ul>li:first-of-type { margin-bottom: 0.7em !important; } @@ -68,4 +68,4 @@ .blockquote { color: #505a62; -} +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 40a82fe28..66261f950 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -133,6 +133,7 @@ dev = [ "types-beautifulsoup4", "types-boto3", "types-botocore", + "types-jsonpatch", "types-jsonschema", "types-protobuf", "types-psutil", diff --git a/requirements.txt b/requirements.txt index 072fe30dc..52fc79383 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,9 @@ fsspec httpx json-stream jsonlines +jsonpatch jsonschema +mmh3 nest_asyncio numpy platformdirs diff --git a/src/inspect_ai/_eval/context.py b/src/inspect_ai/_eval/context.py index 5938486ba..ba9d3c51c 100644 --- a/src/inspect_ai/_eval/context.py +++ b/src/inspect_ai/_eval/context.py @@ -1,6 +1,6 @@ from inspect_ai._util.hooks import init_hooks -from inspect_ai._util.logger import init_http_rate_limit_count, init_logger_records -from inspect_ai.model import Model +from inspect_ai._util.logger import init_http_rate_limit_count +from inspect_ai.model import GenerateConfig, Model from inspect_ai.model._model import init_active_model, init_model_usage from inspect_ai.util._concurrency import init_concurrency from inspect_ai.util._subprocess import init_max_subprocesses @@ -13,7 +13,6 @@ def init_eval_context(max_subprocesses: int | None = None) -> None: init_hooks() -def init_task_context(model: Model) -> None: - init_active_model(model) +def init_task_context(model: Model, config: GenerateConfig = GenerateConfig()) -> None: + init_active_model(model, config) init_model_usage() - init_logger_records() diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py index 43b64e642..4f6dba044 100644 --- a/src/inspect_ai/_eval/eval.py +++ b/src/inspect_ai/_eval/eval.py @@ -219,9 +219,8 @@ async def eval_async( init_eval_context(max_subprocesses) # resolve models - models = resolve_models( - model, model_base_url, model_args, GenerateConfig(**kwargs) - ) + generate_config = GenerateConfig(**kwargs) + models = resolve_models(model, model_base_url, 
model_args, generate_config) # resolve epochs if isinstance(epochs, int): @@ -231,7 +230,7 @@ async def eval_async( # 'default' model in tools, solvers, and scorers) resolved_tasks: list[ResolvedTask] = [] for m in models: - init_active_model(m) + init_active_model(m, generate_config) resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox)) # warn and return empty string if we resolved no tasks diff --git a/src/inspect_ai/_eval/task/log.py b/src/inspect_ai/_eval/task/log.py index 74a566f63..6231efedb 100644 --- a/src/inspect_ai/_eval/task/log.py +++ b/src/inspect_ai/_eval/task/log.py @@ -1,5 +1,4 @@ from importlib import metadata as importlib_metadata -from logging import LogRecord from typing import Any, cast from shortuuid import uuid @@ -7,7 +6,6 @@ from inspect_ai._util.constants import PKG_NAME from inspect_ai._util.datetime import iso_now from inspect_ai._util.git import git_context -from inspect_ai._util.logger import logger_records from inspect_ai._util.path import cwd_relative_path from inspect_ai._util.registry import ( registry_log_name, @@ -26,9 +24,8 @@ EvalSample, EvalSpec, EvalStats, - LoggingMessage, ) -from inspect_ai.log._log import LogEvent, Recorder +from inspect_ai.log._log import LogType, Recorder from inspect_ai.model import ( GenerateConfig, Model, @@ -38,6 +35,7 @@ from inspect_ai.scorer import Score from inspect_ai.scorer._metric import SampleScore from inspect_ai.solver import Plan, Solver, TaskState +from inspect_ai.solver._subtask.transcript import eval_events, transcript class TaskLogger: @@ -106,13 +104,13 @@ def location(self) -> str: def samples_logged(self) -> int: return self._samples_logged - def log_event( + def log( self, - type: LogEvent, - data: EvalSample | EvalPlan | EvalResults | LoggingMessage, + type: LogType, + data: EvalSample | EvalPlan | EvalResults, flush: bool = False, ) -> None: - self.recorder.log_event(self.eval, type, data, flush) + self.recorder.log(self.eval, type, data, flush) # track samples logged if type == "sample": @@ -127,7 +125,7 @@ def log_sample( flush: bool = False, ) -> None: # log - self.log_event( + self.log( "sample", EvalSample( id=sample.id if isinstance(sample.id, int) else str(sample.id), @@ -139,15 +137,17 @@ def log_sample( messages=state.messages, output=state.output, scores=cast(dict[str, Score], scores), + store=dict(state.store.items()), + transcript=eval_events(transcript().events), ), flush, ) def log_plan(self, plan: EvalPlan) -> None: - self.log_event("plan", plan) + self.log("plan", plan) def log_results(self, results: EvalResults) -> None: - self.log_event("results", results) + self.log("results", results) def log_cancelled(self, stats: EvalStats) -> EvalLog: return self.recorder.log_cancelled(self.eval, stats) @@ -178,18 +178,10 @@ def eval_plan_step(solver: Solver) -> EvalPlanStep: if plan.finish: eval_plan.steps.append(eval_plan_step(plan.finish)) - logger.log_event("plan", eval_plan) + logger.log("plan", eval_plan) def collect_eval_data(stats: EvalStats, logger: TaskLogger) -> None: # collect stats stats.completed_at = iso_now() stats.model_usage = model_usage() - - # collect log output - log_logger_records(logger, logger_records()) - - -def log_logger_records(logger: TaskLogger, records: list[LogRecord]) -> None: - for record in records: - logger.log_event("logging", LoggingMessage.from_log_record(record)) diff --git a/src/inspect_ai/_eval/task/run.py b/src/inspect_ai/_eval/task/run.py index 6299eacae..272193766 100644 --- a/src/inspect_ai/_eval/task/run.py +++ 
b/src/inspect_ai/_eval/task/run.py @@ -18,7 +18,11 @@ TaskSuccess, ) from inspect_ai._eval.task.util import sample_messages -from inspect_ai._util.constants import DEFAULT_EPOCHS, DEFAULT_MAX_CONNECTIONS +from inspect_ai._util.constants import ( + DEFAULT_EPOCHS, + DEFAULT_MAX_CONNECTIONS, + SAMPLE_SUBTASK, +) from inspect_ai._util.datetime import iso_now from inspect_ai._util.error import exception_message from inspect_ai._util.file import file, filesystem @@ -52,11 +56,18 @@ from inspect_ai.scorer._metric import SampleScore from inspect_ai.scorer._scorer import unique_scorer_name from inspect_ai.solver import Generate, Plan, Solver, TaskState -from inspect_ai.util import SandboxEnvironment +from inspect_ai.solver._subtask.subtask import init_subtask +from inspect_ai.solver._subtask.transcript import ( + SampleInitEvent, + ScoreEvent, + transcript, +) +from inspect_ai.solver._task_state import state_jsonable from inspect_ai.util._sandbox.context import ( cleanup_sandbox_environments_sample, init_sandbox_environments_sample, ) +from inspect_ai.util._sandbox.environment import SandboxEnvironment from ..context import init_task_context from ..task import Task @@ -64,6 +75,7 @@ from .images import samples_with_base64_images, states_with_base64_images from .log import TaskLogger, collect_eval_data, log_plan from .results import eval_results +from .transcript import solver_transcript py_logger = getLogger(__name__) @@ -100,8 +112,11 @@ async def task_run(options: TaskRunOptions) -> EvalLog: sample_semaphore = options.sample_semaphore kwargs = options.kwargs + # resolve default generate_config for task + generate_config = task.config.merge(GenerateConfigArgs(**kwargs)) + # init task context - init_task_context(model) + init_task_context(model, generate_config) # track stats and error stats = EvalStats(started_at=iso_now()) @@ -114,7 +129,6 @@ async def task_run(options: TaskRunOptions) -> EvalLog: sandbox_cleanup = config.sandbox_cleanup is not False log_images = config.log_images is True log_samples = config.log_samples is not False - generate_config = task.config.merge(GenerateConfigArgs(**kwargs)) # resolve dataset _, samples, states = await resolve_dataset( @@ -209,7 +223,6 @@ async def generate( sandbox=sandbox, sandbox_cleanup=sandbox_cleanup, plan=plan, - max_messages=config.max_messages, scorers=scorers, generate=generate, progress=progress, @@ -299,7 +312,6 @@ async def task_run_sample( sandbox: tuple[str, str | None] | None, sandbox_cleanup: bool, plan: Plan, - max_messages: int | None, scorers: list[Scorer] | None, generate: Generate, progress: Callable[..., None], @@ -317,7 +329,7 @@ async def task_run_sample( progress() # log if requested if logger: - logger.log_event("sample", previous_sample, False) + logger.log("sample", previous_sample, False) # return score if previous_sample.scores: @@ -336,7 +348,10 @@ async def task_run_sample( semaphore if semaphore else contextlib.nullcontext() ) - # use sandboxenv if provided + # initialise subtask + init_subtask(SAMPLE_SUBTASK, state.store) + + # use toolenv if provided sandboxenv_cm = ( sandboxenv_context(task_name, sandbox, sandbox_cleanup, sample) if sandbox @@ -346,10 +361,17 @@ async def task_run_sample( # solver loop async with semaphore_cm, sandboxenv_cm: try: + # sample init event + transcript()._event( + SampleInitEvent(sample=sample, state=state_jsonable(state)) + ) + # run plan steps (checking for early termination) for index, solver in enumerate(plan.steps): # run the solver - state = await solver(state, generate) + with 
solver_transcript(solver, state) as st: + state = await solver(state, generate) + st.complete(state) progress() # check for early termination (tick remaining progress) @@ -360,7 +382,9 @@ async def task_run_sample( # run finishing step them mark completed if plan.finish: - state = await plan.finish(state, generate) + with solver_transcript(plan.finish, state) as st: + state = await plan.finish(state, generate) + st.complete(state) progress() state.completed = True @@ -380,18 +404,20 @@ async def task_run_sample( if scorers: for scorer in scorers: scorer_name = unique_scorer_name(scorer, list(results.keys())) - score_result = ( - await scorer(state, Target(sample.target)) if scorer else None - ) - if score_result is not None: - sample_score = SampleScore( - value=score_result.value, - answer=score_result.answer, - explanation=score_result.explanation, - metadata=score_result.metadata, - sample_id=sample.id, + with transcript().step(name=scorer_name, type="scorer"): + score_result = ( + await scorer(state, Target(sample.target)) if scorer else None ) - results[scorer_name] = sample_score + if score_result is not None: + sample_score = SampleScore( + value=score_result.value, + answer=score_result.answer, + explanation=score_result.explanation, + metadata=score_result.metadata, + sample_id=sample.id, + ) + transcript()._event(ScoreEvent(score=score_result)) + results[scorer_name] = sample_score progress() # log it diff --git a/src/inspect_ai/_eval/task/transcript.py b/src/inspect_ai/_eval/task/transcript.py new file mode 100644 index 000000000..d599c3e24 --- /dev/null +++ b/src/inspect_ai/_eval/task/transcript.py @@ -0,0 +1,28 @@ +import contextlib +from typing import Iterator + +from inspect_ai._util.json import json_changes +from inspect_ai._util.registry import ( + registry_log_name, +) +from inspect_ai.solver import Solver, TaskState, transcript +from inspect_ai.solver._subtask.transcript import StateEvent +from inspect_ai.solver._task_state import set_sample_state, state_jsonable + + +class SolverTranscript: + def __init__(self, before_state: TaskState) -> None: + self.before = state_jsonable(before_state) + + def complete(self, after_state: TaskState) -> None: + after = state_jsonable(after_state) + changes = json_changes(self.before, after) + if changes: + transcript()._event(StateEvent(changes=changes)) + + +@contextlib.contextmanager +def solver_transcript(solver: Solver, state: TaskState) -> Iterator[SolverTranscript]: + set_sample_state(state) + with transcript().step(name=registry_log_name(solver), type="solver"): + yield SolverTranscript(state) diff --git a/src/inspect_ai/_util/constants.py b/src/inspect_ai/_util/constants.py index 967b233c7..93c2dc879 100644 --- a/src/inspect_ai/_util/constants.py +++ b/src/inspect_ai/_util/constants.py @@ -27,3 +27,4 @@ DEFAULT_LOG_BUFFER_LOCAL = 10 DEFAULT_LOG_BUFFER_REMOTE = 100 SCORED_SUFFIX = "-scored" +SAMPLE_SUBTASK = "sample" diff --git a/src/inspect_ai/_util/file.py b/src/inspect_ai/_util/file.py index 9d7c66df5..995359980 100644 --- a/src/inspect_ai/_util/file.py +++ b/src/inspect_ai/_util/file.py @@ -131,6 +131,9 @@ def ls( def is_local(self) -> bool: return isinstance(self.fs, fsspec.implementations.local.LocalFileSystem) + def put_file(self, lpath: str, rpath: str) -> None: + self.fs.put_file(lpath, rpath) + def _file_info(self, info: dict[str, Any]) -> FileInfo: # name needs the protocol prepended file = info.copy() diff --git a/src/inspect_ai/_util/json.py b/src/inspect_ai/_util/json.py new file mode 100644 index 
000000000..02c9859d7 --- /dev/null +++ b/src/inspect_ai/_util/json.py @@ -0,0 +1,98 @@ +from typing import Any, Literal, cast + +import jsonpatch +from pydantic import BaseModel, Field, JsonValue +from pydantic_core import to_jsonable_python + +JSONType = Literal["string", "integer", "number", "boolean", "array", "object", "null"] + +PythonType = Literal["str", "int", "float", "bool", "list", "dict", "None"] + + +def jsonable_python(x: Any) -> Any: + return to_jsonable_python(x, exclude_none=True, fallback=lambda _x: None) + + +def jsonable_dict(x: Any) -> dict[str, JsonValue]: + x = to_jsonable_python(x, exclude_none=True, fallback=lambda _x: None) + if isinstance(x, dict): + return x + else: + raise TypeError( + f"jsonable_dict must be passed an object with fields (type passed was {type(x)})" + ) + + +def python_type_to_json_type(python_type: str | None) -> JSONType: + match python_type: + case "str": + return "string" + case "int": + return "integer" + case "float": + return "number" + case "bool": + return "boolean" + case "list": + return "array" + case "dict": + return "object" + case "None": + return "null" + # treat 'unknown' as string as anything can be converted to string + case None: + return "string" + case _: + raise ValueError( + f"Unsupported type: {python_type} for Python to JSON conversion." + ) + + +def json_type_to_python_type(json_type: str) -> PythonType: + match json_type: + case "string": + return "str" + case "integer": + return "int" + case "number": + return "float" + case "boolean": + return "bool" + case "array": + return "list" + case "object": + return "dict" + case "null": + return "None" + case _: + raise ValueError( + f"Unsupported type: {json_type} for JSON to Python conversion." + ) + + +class JsonChange(BaseModel): + """Describes a change to data using JSON Patch format.""" + + op: Literal["remove", "add", "replace", "move", "test", "copy"] + """Change operation.""" + + path: str + """Path within object that was changed (uses / to delimit levels).""" + + from_: str | None = Field(default=None, alias="from") + """Location from which data was moved or copied.""" + + value: JsonValue = Field(default=None) + """Changed value.""" + + model_config = {"populate_by_name": True} + + +def json_changes( + before: dict[str, Any], after: dict[str, Any] +) -> list[JsonChange] | None: + patch = jsonpatch.make_patch(before, after) + if patch: + return [JsonChange(**change) for change in cast(list[Any], patch)] + else: + return None diff --git a/src/inspect_ai/_util/logger.py b/src/inspect_ai/_util/logger.py index 1348094af..93ae18698 100644 --- a/src/inspect_ai/_util/logger.py +++ b/src/inspect_ai/_util/logger.py @@ -1,24 +1,17 @@ from contextvars import ContextVar from logging import INFO, Logger, LogRecord -_logger_records_context_var = ContextVar[list[LogRecord]]("logger_records", default=[]) - - -def init_logger_records() -> None: - _logger_records_context_var.set([]) +from inspect_ai.log._message import LoggingMessage +from inspect_ai.solver._subtask.transcript import LoggerEvent, transcript def notify_logger_record(record: LogRecord, write: bool) -> None: if write: - _logger_records_context_var.get().append(record) + transcript()._event(LoggerEvent(message=LoggingMessage.from_log_record(record))) if record.levelno <= INFO and "429" in record.getMessage(): _rate_limit_count_context_var.set(_rate_limit_count_context_var.get() + 1) -def logger_records() -> list[LogRecord]: - return _logger_records_context_var.get() - - _rate_limit_count_context_var = 
ContextVar[int]("rate_limit_count", default=0) diff --git a/src/inspect_ai/_view/schema.py b/src/inspect_ai/_view/schema.py index ef9477e05..9a0107e76 100644 --- a/src/inspect_ai/_view/schema.py +++ b/src/inspect_ai/_view/schema.py @@ -17,7 +17,7 @@ def sync_view_schema() -> None: """ # export schema file schema_path = Path(WWW_DIR, "log-schema.json") - types_path = Path(WWW_DIR, "log.d.ts") + types_path = Path(WWW_DIR, "src", "types", "log.d.ts") with open(schema_path, "w", encoding="utf-8") as f: # make everything required schema = EvalLog.model_json_schema() @@ -29,6 +29,7 @@ def sync_view_schema() -> None: # generate types w/ json-schema-to-typescript subprocess.run( [ + "yarn", "json2ts", "--input", schema_path, @@ -39,7 +40,7 @@ def sync_view_schema() -> None: ], cwd=WWW_DIR, ) - subprocess.run(["yarn", "prettier:write"], cwd=WWW_DIR) + subprocess.run(["yarn", "prettier:write"], cwd=types_path.parent) def schema_to_strict(schema: dict[str, Any]) -> dict[str, Any]: diff --git a/src/inspect_ai/_view/www/.gitignore b/src/inspect_ai/_view/www/.gitignore index 40b878db5..42679e49b 100644 --- a/src/inspect_ai/_view/www/.gitignore +++ b/src/inspect_ai/_view/www/.gitignore @@ -1 +1,4 @@ -node_modules/ \ No newline at end of file +node_modules/ +.env +__pycache__/ +dist/assets/*.js.map \ No newline at end of file diff --git a/src/inspect_ai/_view/www/App.css b/src/inspect_ai/_view/www/App.css index c9e561cc6..6cd4faa3f 100644 --- a/src/inspect_ai/_view/www/App.css +++ b/src/inspect_ai/_view/www/App.css @@ -9,6 +9,12 @@ --inspect-input-border: var(--bs-light-border-subtle); } +body:not([class^="vscode-"]) button { + --bs-nav-pills-link-active-bg: #e3eaf1; + --bs-nav-pills-link-active-color: black; + --bs-nav-link-color: black; +} + #app { height: 100vh; overflow-y: hidden; @@ -18,7 +24,7 @@ display: grid; height: 100vh; overflow-y: hidden; - grid-template-rows: minmax(65px, max-content) max-content 1fr; + grid-template-rows: max-content max-content 1fr; } .modal { @@ -30,7 +36,7 @@ } body[class^="vscode-"] .app-main-grid { - grid-template-rows: minmax(55px, max-content) max-content 1fr; + grid-template-rows: max-content max-content 1fr; } body[class^="vscode-"] { @@ -61,6 +67,10 @@ body[class^="vscode-"] { --inspect-input-border: var(--vscode-input-border); } +html.vscode { + font-size: 13px; +} + body[class^="vscode-"] .modal-backdrop { --bs-backdrop-opacity: 0.15; --bs-backdrop-bg: var(--vscode-editor-foreground); @@ -92,15 +102,19 @@ body[class^="vscode-"] { } body[class^="vscode-"] .navbar-brand { - font-size: 1em; + font-size: 1.1em; +} +body[class^="vscode-"] .navbar-brand > div { + margin-top: -0.2rem !important; } -body[class^="vscode-"] .navbar-brand .navbar-secondary-text { - font-size: 0.8em; +body[class^="vscode-"] .task-title { + margin-top: 0.4em; } -body[class^="vscode-"] .navbar #sidebarToggle > i.bi { - font-size: 1.1em; +body[class^="vscode-"] .task-model { + margin-top: 0.2rem; + font-size: 0.9rem; } body[class^="vscode-"] .accordion-button::after { @@ -211,7 +225,7 @@ body[class^="vscode-"] .sidebar .list-group { :root { --bs-navbar-padding-y: 0; --bs-navbar-brand-padding-y: 0; - --sidebar-width: 500px; + --sidebar-width: 550px; } body { @@ -429,13 +443,6 @@ pre[class*="language-"] { font-weight: 500; } -.card-subheading { - margin-top: 1em; - font-size: 0.8rem; - font-weight: 700; - padding-bottom: 0.2em; -} - .btn .btn-link { cursor: pointer; } @@ -689,3 +696,166 @@ table.table.table-sm td { .vscode-dark .tool-output { background-color: #333333; } + + +/* jsondiffpatch */ 
+ +.jsondiffpatch-delta { + padding: 1em; + background: var(--bs-light); + font-family: var(--bs-font-monospace); + font-size: 0.9em; +} +.jsondiffpatch-delta pre { + white-space: pre-wrap; + word-wrap: break-word; + word-break: break-all; + margin-bottom: 0; +} +ul.jsondiffpatch-delta { + list-style-type: none; + padding: 0 0 0 1.5em; + margin: 0; +} +.jsondiffpatch-delta ul { + list-style-type: none; + padding: 0 0 0 1.5em; + margin: 0; +} +.jsondiffpatch-added, +.jsondiffpatch-modified .jsondiffpatch-right-value pre, +.jsondiffpatch-textdiff-added { + background: #dafbe1; +} + +.jsondiffpatch-deleted .jsondiffpatch-property-name, +.jsondiffpatch-deleted pre, +.jsondiffpatch-modified .jsondiffpatch-left-value pre, +.jsondiffpatch-textdiff-deleted { + background: #ffebe9; + text-decoration: line-through; +} +.jsondiffpatch-unchanged, +.jsondiffpatch-movedestination { + color: gray; +} +.jsondiffpatch-unchanged, +.jsondiffpatch-movedestination > .jsondiffpatch-value { + transition: all 0.5s; + -webkit-transition: all 0.5s; + overflow-y: hidden; +} +.jsondiffpatch-unchanged-showing .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-showing + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 100px; +} +.jsondiffpatch-unchanged-hidden .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-hidden + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 0; +} +.jsondiffpatch-unchanged-hiding + .jsondiffpatch-movedestination + > .jsondiffpatch-value, +.jsondiffpatch-unchanged-hidden + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + display: block; +} +.jsondiffpatch-unchanged-visible .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-visible + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 100px; +} +.jsondiffpatch-unchanged-hiding .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-hiding + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 0; +} +.jsondiffpatch-unchanged-showing .jsondiffpatch-arrow, +.jsondiffpatch-unchanged-hiding .jsondiffpatch-arrow { + display: none; +} +.jsondiffpatch-value { + display: inline-block; +} +.jsondiffpatch-property-name { + display: inline-block; + padding-right: 5px; + vertical-align: top; +} +.jsondiffpatch-property-name:after { + content: ': '; +} +.jsondiffpatch-child-node-type-array > .jsondiffpatch-property-name:after { + content: ': ['; +} +.jsondiffpatch-child-node-type-array:after { + content: '],'; +} +div.jsondiffpatch-child-node-type-array:before { + content: '['; +} +div.jsondiffpatch-child-node-type-array:after { + content: ']'; +} +.jsondiffpatch-child-node-type-object > .jsondiffpatch-property-name:after { + content: ': {'; +} +.jsondiffpatch-child-node-type-object:after { + content: '},'; +} +div.jsondiffpatch-child-node-type-object:before { + content: '{'; +} +div.jsondiffpatch-child-node-type-object:after { + content: '}'; +} +.jsondiffpatch-value pre:after { + content: ','; +} +li:last-child > .jsondiffpatch-value pre:after, +.jsondiffpatch-modified > .jsondiffpatch-left-value pre:after { + content: ''; +} +.jsondiffpatch-modified .jsondiffpatch-value { + display: inline-block; +} +.jsondiffpatch-modified .jsondiffpatch-right-value { + margin-left: 0; +} +.jsondiffpatch-moved .jsondiffpatch-value { + display: none; +} +.jsondiffpatch-moved .jsondiffpatch-moved-destination { + display: inline-block; + background: #ffffbb; + color: #888; +} +.jsondiffpatch-moved .jsondiffpatch-moved-destination:before { + content: ' => '; +} 
+ul.jsondiffpatch-textdiff { + padding: 0; +} +.jsondiffpatch-textdiff-location { + color: #bbb; + display: inline-block; + min-width: 60px; +} +.jsondiffpatch-textdiff-line { + display: inline-block; +} +.jsondiffpatch-textdiff-line-number:after { + content: ','; +} +.jsondiffpatch-error { + background: red; + color: white; + font-weight: bold; +} diff --git a/src/inspect_ai/_view/www/dist/assets/index.css b/src/inspect_ai/_view/www/dist/assets/index.css index e3f248f29..6a8c4f222 100644 --- a/src/inspect_ai/_view/www/dist/assets/index.css +++ b/src/inspect_ai/_view/www/dist/assets/index.css @@ -131,6 +131,12 @@ code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shad --inspect-input-border: var(--bs-light-border-subtle); } +body:not([class^="vscode-"]) button { + --bs-nav-pills-link-active-bg: #e3eaf1; + --bs-nav-pills-link-active-color: black; + --bs-nav-link-color: black; +} + #app { height: 100vh; overflow-y: hidden; @@ -140,7 +146,7 @@ code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shad display: grid; height: 100vh; overflow-y: hidden; - grid-template-rows: minmax(65px, max-content) max-content 1fr; + grid-template-rows: max-content max-content 1fr; } .modal { @@ -152,7 +158,7 @@ code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shad } body[class^="vscode-"] .app-main-grid { - grid-template-rows: minmax(55px, max-content) max-content 1fr; + grid-template-rows: max-content max-content 1fr; } body[class^="vscode-"] { @@ -183,6 +189,10 @@ body[class^="vscode-"] { --inspect-input-border: var(--vscode-input-border); } +html.vscode { + font-size: 13px; +} + body[class^="vscode-"] .modal-backdrop { --bs-backdrop-opacity: 0.15; --bs-backdrop-bg: var(--vscode-editor-foreground); @@ -214,15 +224,19 @@ body[class^="vscode-"] { } body[class^="vscode-"] .navbar-brand { - font-size: 1em; + font-size: 1.1em; +} +body[class^="vscode-"] .navbar-brand > div { + margin-top: -0.2rem !important; } -body[class^="vscode-"] .navbar-brand .navbar-secondary-text { - font-size: 0.8em; +body[class^="vscode-"] .task-title { + margin-top: 0.4em; } -body[class^="vscode-"] .navbar #sidebarToggle > i.bi { - font-size: 1.1em; +body[class^="vscode-"] .task-model { + margin-top: 0.2rem; + font-size: 0.9rem; } body[class^="vscode-"] .accordion-button::after { @@ -333,7 +347,7 @@ body[class^="vscode-"] .sidebar .list-group { :root { --bs-navbar-padding-y: 0; --bs-navbar-brand-padding-y: 0; - --sidebar-width: 500px; + --sidebar-width: 550px; } body { @@ -551,13 +565,6 @@ pre[class*="language-"] { font-weight: 500; } -.card-subheading { - margin-top: 1em; - font-size: 0.8rem; - font-weight: 700; - padding-bottom: 0.2em; -} - .btn .btn-link { cursor: pointer; } @@ -811,3 +818,166 @@ table.table.table-sm td { .vscode-dark .tool-output { background-color: #333333; } + + +/* jsondiffpatch */ + +.jsondiffpatch-delta { + padding: 1em; + background: var(--bs-light); + font-family: var(--bs-font-monospace); + font-size: 0.9em; +} +.jsondiffpatch-delta pre { + white-space: pre-wrap; + word-wrap: break-word; + word-break: break-all; + margin-bottom: 0; +} +ul.jsondiffpatch-delta { + list-style-type: none; + padding: 0 0 0 1.5em; + margin: 0; +} +.jsondiffpatch-delta ul { + list-style-type: none; + padding: 0 0 0 1.5em; + margin: 0; +} +.jsondiffpatch-added, +.jsondiffpatch-modified .jsondiffpatch-right-value pre, +.jsondiffpatch-textdiff-added { + background: #dafbe1; +} + +.jsondiffpatch-deleted .jsondiffpatch-property-name, +.jsondiffpatch-deleted pre, 
+.jsondiffpatch-modified .jsondiffpatch-left-value pre, +.jsondiffpatch-textdiff-deleted { + background: #ffebe9; + text-decoration: line-through; +} +.jsondiffpatch-unchanged, +.jsondiffpatch-movedestination { + color: gray; +} +.jsondiffpatch-unchanged, +.jsondiffpatch-movedestination > .jsondiffpatch-value { + transition: all 0.5s; + -webkit-transition: all 0.5s; + overflow-y: hidden; +} +.jsondiffpatch-unchanged-showing .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-showing + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 100px; +} +.jsondiffpatch-unchanged-hidden .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-hidden + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 0; +} +.jsondiffpatch-unchanged-hiding + .jsondiffpatch-movedestination + > .jsondiffpatch-value, +.jsondiffpatch-unchanged-hidden + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + display: block; +} +.jsondiffpatch-unchanged-visible .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-visible + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 100px; +} +.jsondiffpatch-unchanged-hiding .jsondiffpatch-unchanged, +.jsondiffpatch-unchanged-hiding + .jsondiffpatch-movedestination + > .jsondiffpatch-value { + max-height: 0; +} +.jsondiffpatch-unchanged-showing .jsondiffpatch-arrow, +.jsondiffpatch-unchanged-hiding .jsondiffpatch-arrow { + display: none; +} +.jsondiffpatch-value { + display: inline-block; +} +.jsondiffpatch-property-name { + display: inline-block; + padding-right: 5px; + vertical-align: top; +} +.jsondiffpatch-property-name:after { + content: ': '; +} +.jsondiffpatch-child-node-type-array > .jsondiffpatch-property-name:after { + content: ': ['; +} +.jsondiffpatch-child-node-type-array:after { + content: '],'; +} +div.jsondiffpatch-child-node-type-array:before { + content: '['; +} +div.jsondiffpatch-child-node-type-array:after { + content: ']'; +} +.jsondiffpatch-child-node-type-object > .jsondiffpatch-property-name:after { + content: ': {'; +} +.jsondiffpatch-child-node-type-object:after { + content: '},'; +} +div.jsondiffpatch-child-node-type-object:before { + content: '{'; +} +div.jsondiffpatch-child-node-type-object:after { + content: '}'; +} +.jsondiffpatch-value pre:after { + content: ','; +} +li:last-child > .jsondiffpatch-value pre:after, +.jsondiffpatch-modified > .jsondiffpatch-left-value pre:after { + content: ''; +} +.jsondiffpatch-modified .jsondiffpatch-value { + display: inline-block; +} +.jsondiffpatch-modified .jsondiffpatch-right-value { + margin-left: 0; +} +.jsondiffpatch-moved .jsondiffpatch-value { + display: none; +} +.jsondiffpatch-moved .jsondiffpatch-moved-destination { + display: inline-block; + background: #ffffbb; + color: #888; +} +.jsondiffpatch-moved .jsondiffpatch-moved-destination:before { + content: ' => '; +} +ul.jsondiffpatch-textdiff { + padding: 0; +} +.jsondiffpatch-textdiff-location { + color: #bbb; + display: inline-block; + min-width: 60px; +} +.jsondiffpatch-textdiff-line { + display: inline-block; +} +.jsondiffpatch-textdiff-line-number:after { + content: ','; +} +.jsondiffpatch-error { + background: red; + color: white; + font-weight: bold; +} diff --git a/src/inspect_ai/_view/www/dist/assets/index.js b/src/inspect_ai/_view/www/dist/assets/index.js index ba2f525c4..0ccc34f72 100644 --- a/src/inspect_ai/_view/www/dist/assets/index.js +++ b/src/inspect_ai/_view/www/dist/assets/index.js @@ -2381,7 +2381,7 @@ const MILLISECONDS_MULTIPLIER = 1e3; const TRANSITION_END = 
"transitionend"; const parseSelector = (selector) => { if (selector && window.CSS && window.CSS.escape) { - selector = selector.replace(/#([^\s"#']+)/g, (match, id2) => `#${CSS.escape(id2)}`); + selector = selector.replace(/#([^\s"#']+)/g, (match, id) => `#${CSS.escape(id)}`); } return selector; }; @@ -6219,6 +6219,9 @@ const filename = (path) => { return path; } }; +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} function throttle(func, wait, options) { var context, args, result; var timeout = null; @@ -6250,9 +6253,6 @@ function throttle(func, wait, options) { return result; }; } -function sleep(ms) { - return new Promise((resolve) => setTimeout(resolve, ms)); -} const clearDocumentSelection = () => { const sel = window.getSelection(); if (sel) { @@ -6263,14 +6263,29 @@ const clearDocumentSelection = () => { } } }; -const icons = { +const ApplicationIcons = { arrows: { right: "bi bi-arrow-right", down: "bi bi-arrow-down", up: "bi bi-arrow-up" }, - "collapse-all": "bi bi-arrows-collapse", - "collapse-up": "bi bi-chevron-up", + caret: { + right: "bi bi-caret-right", + down: "bi bi-caret-down" + }, + changes: { + add: "bi bi-plus", + remove: "bi bi-dash", + replace: "bi bi-plus-slash-minus" + }, + chevron: { + right: "bi bi-chevron-right", + down: "bi bi-chevron-down" + }, + collapse: { + all: "bi bi-arrows-collapse", + up: "bi bi-chevron-up" + }, close: "bi bi-x", config: "bi bi-gear", confirm: "bi bi-check", @@ -6281,11 +6296,13 @@ const icons = { error: "bi bi-exclamation-circle", "expand-all": "bi bi-arrows-expand", "expand-down": "bi bi-chevron-down", + info: "bi bi-info-circle", inspect: "bi bi-gear", json: "bi bi-filetype-json", logging: { notset: "bi bi-card-text", debug: "bi bi-bug", + http: "bi bi-download", info: "bi bi-info-square", warning: "bi bi-exclamation-triangle", error: "bi bi-x-circle", @@ -6294,9 +6311,10 @@ const icons = { menu: "bi bi-list", messages: "bi bi-chat-right-text", metadata: "bi bi-table", - model: "bi bi-cpu", + model: "bi bi-grid-3x3-gap", "toggle-right": "bi bi-chevron-right", more: "bi bi-zoom-in", + "multiple-choice": "bi bi-card-list", next: "bi bi-chevron-right", previous: "bi bi-chevron-left", role: { @@ -6305,7 +6323,7 @@ const icons = { assistant: "bi bi-robot", tool: "bi bi-tools" }, - sample: "bi bi-speedometer", + sample: "bi bi-database", samples: "bi bi-file-spreadsheet", scorer: "bi bi-calculator", search: "bi bi-search", @@ -6317,64 +6335,33 @@ const icons = { system_message: "bi bi-cpu", use_tools: "bi bi-tools" }, + step: "bi bi-fast-forward-btn", + subtask: "bi bi-subtract", + transcript: "bi bi-list-columns-reverse", usage: "bi bi-stopwatch" }; -const colors = { - logging: { - debug: "var(--bs-secondary)", - info: "var(--bs-blue)", - warning: "var(--bs-warning)", - error: "var(--bs-danger)", - critical: "var(--bs-danger)" - } +const kBaseFontSize = 0.9; +const ScaleBaseFont = (scale) => { + return `${kBaseFontSize + scale}rem`; }; -const sharedStyles = { - moreButton: { - maxHeight: "1.8em", - fontSize: "0.8rem", - padding: "0 0.2em 0 0.2em", - color: "var(--bs-secondary)" - }, - threeLineClamp: { - display: "-webkit-box", - "-webkit-line-clamp": "3", - "-webkit-box-orient": "vertical", - overflow: "hidden" - }, - lineClamp: (len) => { - return { - display: "-webkit-box", - "-webkit-line-clamp": `${len}`, - "-webkit-box-orient": "vertical", - overflow: "hidden" - }; - }, - wrapText: () => { - return { - whiteSpace: "nowrap", - textOverflow: "ellipsis", - overflow: "hidden" - }; +const FontSize = { + 
+  title: ScaleBaseFont(0.6),
+  "title-secondary": ScaleBaseFont(0.4),
+  larger: ScaleBaseFont(0.2),
+  large: ScaleBaseFont(0.1),
+  base: ScaleBaseFont(0),
+  small: ScaleBaseFont(-0.1),
+  smaller: ScaleBaseFont(-0.1)
+};
+const TextStyle = {
+  label: {
+    textTransform: "uppercase"
   },
-  scoreFills: {
-    green: {
-      backgroundColor: "var(--bs-success)",
-      borderColor: "var(--bs-success)",
-      color: "var(--bs-body-bg)"
-    },
-    red: {
-      backgroundColor: "var(--bs-danger)",
-      borderColor: "var(--bs-danger)",
-      color: "var(--bs-body-bg)"
-    },
-    orange: {
-      backgroundColor: "var(--bs-orange)",
-      borderColor: "var(--bs-orange)",
-      color: "var(--bs-body-bg)"
-    }
+  secondary: {
+    color: "var(--bs-secondary)"
   }
 };
-const ErrorPanel = ({ id: id2, classes, title, error }) => {
+const ErrorPanel = ({ id, classes, title, error }) => {
   const emptyStyle = {
     display: "flex",
     flex: "0 0 content",
@@ -6385,7 +6372,7 @@ const ErrorPanel = ({ id: id2, classes, title, error }) => {
   const stack2 = error.stack;
   return m$1`
     { marginTop: "4rem" }}
   >
-
+
@@ -6406,7 +6393,7 @@ const ErrorPanel = ({ id: id2, classes, title, error }) => {
         {
           Error: ${message || ""}
         ${stack2 && m$1`
-
+            
             
               at ${stack2}
             
@@ -6452,7 +6439,6 @@ class AppErrorBoundary extends b {
 }
 const ProgressBar = ({ style, animating }) => {
   const emptyStyle = {
-    ...style,
     display: "flex",
     textAlign: "center",
     flex: "0 0 content",
@@ -6460,19 +6446,20 @@ const ProgressBar = ({ style, animating }) => {
     justifyContent: "center",
     border: "none",
     padding: "0",
-    background: "#FFFFFF00",
-    fontSize: "0.7em",
     zIndex: 1001,
-    width: "100%"
+    width: "100%",
+    height: "0px",
+    overflow: "visible"
   };
   const progressContainerStyle = {
     width: "100%",
-    height: "4px",
+    height: "2px",
     background: "none"
   };
   const progressBarStyle = {
     width: "5%",
-    height: "2px"
+    height: "2px",
+    ...style
   };
   return m$1`
     
@@ -6493,3672 +6480,1975 @@ const ProgressBar = ({ style, animating }) => {
   `;
 };
-const CopyButton = ({ value }) => {
-  return m$1``;
-};
-const Navbar = ({
-  file,
-  task,
+const Sidebar = ({
+  offcanvas,
   logs,
-  model,
-  status,
-  samples,
-  results,
-  offcanvas
+  loading,
+  logHeaders,
+  selectedIndex,
+  onSelectedIndexChanged
 }) => {
-  const toggleOffCanClass = offcanvas ? "" : " d-md-none";
-  const logFileName = file ? filename(file) : "";
-  let statusPanel;
-  if (status === "success") {
-    statusPanel = m$1`<${ResultsPanel} results="${results}" />`;
-  } else if (status === "cancelled") {
-    statusPanel = m$1`<${CanceledPanel}
-      sampleCount=${(samples == null ? void 0 : samples.length) || 0}
-    />`;
-  } else if (status === "started") {
-    statusPanel = m$1`<${RunningPanel} />`;
-  }
-  const navbarContents = logFileName ? m$1`
   `;
 };
-const CanceledPanel = ({ sampleCount }) => {
+const prettyDir = (path) => {
+  try {
+    let url = new URL(path);
+    if (url.protocol === "file:") {
+      return url.pathname;
+    } else {
+      return path;
+    }
+  } catch {
+    return path;
+  }
+};
+const EvalStatus = ({ logHeader }) => {
+  var _a;
+  switch (logHeader.status) {
+    case "error":
+      return m$1`<${StatusError} message="Error" />`;
+    case "cancelled":
+      return m$1`<${StatusCancelled} message="Cancelled" />`;
+    case "started":
+      return m$1`<${StatusRunning} message="Running" />`;
+    default:
+      if (((_a = logHeader == null ? void 0 : logHeader.results) == null ? void 0 : _a.scores) && logHeader.results.scores.length > 0) {
+        if (logHeader.results.scores.length === 1) {
+          return m$1`<${SidebarScore}
+            scorer=${logHeader.results.scores[0]}
+          />`;
+        } else {
+          return m$1`<${SidebarScores} scores=${logHeader.results.scores} />`;
+        }
+      } else {
+        return "";
+      }
+  }
+};
+const SidebarScore = ({ scorer }) => {
+  return m$1`
-      cancelled
-      (${sampleCount} ${sampleCount === 1 ? "sample" : "samples"})
+    ${Object.keys(scorer.metrics).map((metric) => {
+      return m$1`
+
+          ${scorer.metrics[metric].name}
+          ${scorer.reducer ? m$1`
+            ${scorer.reducer}
+          ` : ""}
+          ${formatPrettyDecimal(scorer.metrics[metric].value)}
+
+      `;
+    })}
   `;
 };
-const RunningPanel = () => {
+const SidebarScores = ({ scores }) => {
   return m$1`
-
-
-      Running
-
   `;
-};
-const ResultsPanel = ({ results }) => {
-  if (results.scores.length === 1) {
-    const scorers = {};
-    results.scores.map((score) => {
-      scorers[score.name] = Object.keys(score.metrics).map((key2) => {
-        return {
-          name: key2,
-          value: score.metrics[key2].value,
-          reducer: score.reducer
-        };
-      });
-    });
-    const metrics = Object.values(scorers)[0];
-    return m$1`
-      ${metrics.map((metric, i2) => {
-        return m$1`<${VerticalMetric} metric=${metric} isFirst=${i2 === 0} />`;
-      })}
+    ${scores.map((score) => {
+      const name = score.name;
+      const reducer = score.reducer;
+      return m$1`
+          ${name}
+          ${reducer ? m$1`
+            ${reducer}
+          ` : ""}
+          ${Object.keys(score.metrics).map((key2) => {
+            const metric = score.metrics[key2];
+            return m$1`
+                ${metric.name}
+
+                ${formatPrettyDecimal(metric.value)}
+            `;
+          })}
+
+      `;
+    })}
   `;
 };
+const StatusCancelled = ({ message }) => {
+  return m$1`${message}`;
+};
+const StatusRunning = ({ message }) => {
+  return m$1`
+    ${message}
+  `;
+};
+const StatusError = ({ message }) => {
+  return m$1`${message}`;
+};
+const LogDirectoryTitle = ({ log_dir, offcanvas }) => {
+  if (log_dir) {
+    const displayDir = prettyDir(log_dir);
+    return m$1`
+      Log Directory
+      ${offcanvas ? displayDir : ""}
+    `;
-  } else {
-    return m$1`
-      ${results.scores.map((score, index) => {
-        return m$1`<${MultiScorerMetric}
-          scorer=${score}
-          isFirst=${index === 0}
-        />`;
-      })}
-    `;
-  }
-};
+  } else {
+    return m$1`${offcanvas ? "Log History" : ""}`;
+  }
+};
-const VerticalMetric = ({ metric, isFirst }) => {
-  const reducer_component = metric.reducer ? m$1`
-      ${metric.reducer}
-  ` : "";
-  return m$1`
-
-      ${metric.name}
-    ${reducer_component}
-      ${formatPrettyDecimal(metric.value)}
-
-  `;
-};
-const MultiScorerMetric = ({ scorer, isFirst }) => {
-  const baseFontSize = Object.keys(scorer.metrics).length === 1 ? 0.9 : 0.7;
-  const reducer_component = scorer.reducer ? m$1`
-      ${scorer.reducer}
-  ` : "";
-  return m$1`
-
-      ${scorer.name}
-    ${reducer_component}
-      ${Object.keys(scorer.metrics).map((key2) => {
-        const metric = scorer.metrics[key2];
-        return m$1`
-            ${metric.name}
-
-            ${formatPrettyDecimal(metric.value)}
-        `;
-      })}
-
-  `;
-};
-const Sidebar = ({
-  offcanvas,
-  logs,
-  loading,
-  logHeaders,
-  selectedIndex,
-  onSelectedIndexChanged
-}) => {
-  const btnOffCanClass = offcanvas ? "" : " d-md-none";
-  const sidebarOffCanClass = offcanvas ? " offcanvas" : " offcanvas-md";
-  return m$1`
-  `;
-};
-const prettyDir = (path) => {
-  try {
-    let url = new URL(path);
-    if (url.protocol === "file:") {
-      return url.pathname;
-    } else {
-      return path;
-    }
-  } catch {
-    return path;
-  }
-};
-const EvalStatus = ({ logHeader }) => {
-  var _a;
-  switch (logHeader.status) {
-    case "error":
-      return m$1`<${StatusError} message="Error" />`;
-    case "cancelled":
-      return m$1`<${StatusCancelled} message="Cancelled" />`;
-    case "started":
-      return m$1`<${StatusRunning} message="Running" />`;
-    default:
-      if (((_a = logHeader == null ? void 0 : logHeader.results) == null ? void 0 : _a.scores) && logHeader.results.scores.length > 0) {
-        if (logHeader.results.scores.length === 1) {
-          return m$1`<${SidebarScore}
-            scorer=${logHeader.results.scores[0]}
-          />`;
-        } else {
-          return m$1`<${SidebarScores} scores=${logHeader.results.scores} />`;
-        }
-      } else {
-        return "";
-      }
-  }
-};
-const SidebarScore = ({ scorer }) => {
-  return m$1`
-    ${Object.keys(scorer.metrics).map((metric) => {
-      return m$1`
-
-          ${scorer.metrics[metric].name}
-          ${scorer.reducer ? m$1`
-            ${scorer.reducer}
-          ` : ""}
-          ${formatPrettyDecimal(scorer.metrics[metric].value)}
-
-      `;
-    })}
-  `;
-};
-const SidebarScores = ({ scores }) => {
-  return m$1`
-    ${scores.map((score) => {
-      const name = score.name;
-      const reducer = score.reducer;
-      return m$1`
-          ${name}
-          ${reducer ? m$1`
-            ${reducer}
-          ` : ""}
-          ${Object.keys(score.metrics).map((key2) => {
-            const metric = score.metrics[key2];
-            return m$1`
-                ${metric.name}
-
-                ${formatPrettyDecimal(metric.value)}
-            `;
-          })}
-      `;
-    })}
-  `;
-};
-const StatusCancelled = ({ message }) => {
-  return m$1`${message}`;
-};
-const StatusRunning = ({ message }) => {
-  return m$1`
-    ${message}
-  `;
-};
-const StatusError = ({ message }) => {
-  return m$1`${message}`;
-};
-const LogDirectoryTitle = ({ log_dir, offcanvas }) => {
-  if (log_dir) {
-    const displayDir = prettyDir(log_dir);
-    return m$1`
-      Log Directory
-      ${offcanvas ? displayDir : ""}
-    `;
-  } else {
-    return m$1`${offcanvas ? "Log History" : ""}`;
-  }
-};
-var prism = { exports: {} };
-(function(module) {
-  var _self = typeof window !== "undefined" ? window : typeof WorkerGlobalScope !== "undefined" && self instanceof WorkerGlobalScope ? self : {};
-  /**
-   * Prism: Lightweight, robust, elegant syntax highlighting
-   *
-   * @license MIT
-   * @author Lea Verou
-   * @namespace
-   * @public
-   */
-  var Prism2 = function(_self2) {
-    var lang = /(?:^|\s)lang(?:uage)?-([\w-]+)(?=\s|$)/i;
-    var uniqueId = 0;
-    var plainTextGrammar = {};
-    var _2 = {
-      /**
-       * By default, Prism will attempt to highlight all code elements (by calling {@link Prism.highlightAll}) on the
-       * current page after the page finished loading. This might be a problem if e.g. you wanted to asynchronously load
-       * additional languages or plugins yourself.
-       *
-       * By setting this value to `true`, Prism will not automatically highlight all code elements on the page.
-       *
-       * You obviously have to change this value before the automatic highlighting started. To do this, you can add an
-       * empty Prism object into the global scope before loading the Prism script like this:
-       *
-       * ```js
-       * window.Prism = window.Prism || {};
-       * Prism.manual = true;
-       * // add a new