From c742d6b220b802c6bc7627242315f1ce444c18e2 Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Fri, 13 Dec 2024 16:57:35 +0000 Subject: [PATCH] WIP --- apps/evalite-docs/astro.config.mjs | 32 ++- .../src/content/docs/examples/ai-sdk.md | 47 +++++ .../docs/guides/environment-variables.mdx | 51 +++++ .../src/content/docs/guides/example.md | 11 - .../src/content/docs/guides/scorers.mdx | 64 ++++++ .../src/content/docs/guides/streams.md | 28 +++ .../src/content/docs/guides/traces.mdx | 81 +++++++ .../src/content/docs/quickstart.mdx | 92 ++++++++ .../src/content/docs/reference/example.md | 11 - package.json | 1 + packages/evalite/readme.md | 198 ------------------ 11 files changed, 392 insertions(+), 224 deletions(-) create mode 100644 apps/evalite-docs/src/content/docs/examples/ai-sdk.md create mode 100644 apps/evalite-docs/src/content/docs/guides/environment-variables.mdx delete mode 100644 apps/evalite-docs/src/content/docs/guides/example.md create mode 100644 apps/evalite-docs/src/content/docs/guides/scorers.mdx create mode 100644 apps/evalite-docs/src/content/docs/guides/streams.md create mode 100644 apps/evalite-docs/src/content/docs/guides/traces.mdx create mode 100644 apps/evalite-docs/src/content/docs/quickstart.mdx delete mode 100644 apps/evalite-docs/src/content/docs/reference/example.md diff --git a/apps/evalite-docs/astro.config.mjs b/apps/evalite-docs/astro.config.mjs index 7818e15..dd3a58c 100644 --- a/apps/evalite-docs/astro.config.mjs +++ b/apps/evalite-docs/astro.config.mjs @@ -8,18 +8,42 @@ export default defineConfig({ title: "Evalite", social: { github: "https://github.com/mattpocock/evalite", + discord: "https://mattpocock.com/ai-discord", }, sidebar: [ + { + label: "Getting Started", + items: [ + { + label: "Quickstart", + slug: "quickstart", + }, + ], + }, { label: "Guides", items: [ - // Each item here is one entry in the navigation menu. 
- { label: "Example Guide", slug: "guides/example" }, + { + label: "Environment Variables", + slug: "guides/environment-variables", + }, + { + label: "Scorers", + slug: "guides/scorers", + }, + { + label: "Traces", + slug: "guides/traces", + }, + { + label: "Streams", + slug: "guides/streams", + }, ], }, { - label: "Reference", - autogenerate: { directory: "reference" }, + label: "Examples", + items: [{ label: "AI SDK", slug: "examples/ai-sdk" }], }, ], }), diff --git a/apps/evalite-docs/src/content/docs/examples/ai-sdk.md b/apps/evalite-docs/src/content/docs/examples/ai-sdk.md new file mode 100644 index 0000000..c8adb17 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/examples/ai-sdk.md @@ -0,0 +1,47 @@ +--- +title: AI SDK +--- + +Vercel's [AI SDK](https://sdk.vercel.ai/docs/introduction) is a great way to get started with AI in your apps. + +It abstracts away the differences between different AI providers, so you can **switch between them easily**. + +Here's how it might look with Evalite: + +```ts +// my-eval.eval.ts + +import { openai } from "@ai-sdk/openai"; +import { streamText } from "ai"; +import { Factuality, Levenshtein } from "autoevals"; +import { evalite } from "evalite"; +import { traceAISDKModel } from "evalite/ai-sdk"; + +evalite("Test Capitals", { + data: async () => [ + { + input: `What's the capital of France?`, + expected: `Paris`, + }, + { + input: `What's the capital of Germany?`, + expected: `Berlin`, + }, + ], + task: async (input) => { + const result = await streamText({ + model: traceAISDKModel(openai("gpt-4o-mini")), + system: ` + Answer the question concisely. Answer in as few words as possible. + Remove full stops from the end of the output. + If the country has no capital, return ' has no capital'. + If the country does not exist, return 'Unknown'. 
+ `, + prompt: input, + }); + + return result.textStream; + }, + scorers: [Factuality, Levenshtein], +}); +``` diff --git a/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx b/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx new file mode 100644 index 0000000..946f38b --- /dev/null +++ b/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx @@ -0,0 +1,51 @@ +--- +title: Environment Variables +--- + +import { Steps } from "@astrojs/starlight/components"; + +To call your LLM from a third-party service, you'll likely need some environment variables to keep your API keys safe. + +Since **Evalite is based on Vitest**, it should already pick them up from your `vite.config.ts`. + +## Setting Up Env Variables + +If you don't have Vitest set up, here's how to do it: + + + +1. Create a `.env` file in the root of your project: + + ``` + OPENAI_API_KEY=your-api-key + ``` + +2. Add `.env` to your `.gitignore`, if it's not already there: + + ``` + .env + ``` + +3. Install `dotenv`: + + ```bash + pnpm add -D dotenv + ``` + +4. Add a `vite.config.ts` file: + + ```ts + // vite.config.ts + + import { defineConfig } from "vitest/config"; + + export default defineConfig({ + test: { + setupFiles: ["dotenv/config"], + }, + }); + ``` + + + +Now, your environment variables will be available in your evals. diff --git a/apps/evalite-docs/src/content/docs/guides/example.md b/apps/evalite-docs/src/content/docs/guides/example.md deleted file mode 100644 index ebd0f3b..0000000 --- a/apps/evalite-docs/src/content/docs/guides/example.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Example Guide -description: A guide in my new Starlight docs site. ---- - -Guides lead a user through a specific task they want to accomplish, often with a sequence of steps. -Writing a good guide requires thinking about what your users are trying to do. 
- -## Further reading - -- Read [about how-to guides](https://diataxis.fr/how-to-guides/) in the Diátaxis framework diff --git a/apps/evalite-docs/src/content/docs/guides/scorers.mdx b/apps/evalite-docs/src/content/docs/guides/scorers.mdx new file mode 100644 index 0000000..e14d343 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/guides/scorers.mdx @@ -0,0 +1,64 @@ +--- +title: Scorers +--- + +import { Aside } from "@astrojs/starlight/components"; + +Scorers are used to score the output of your LLM call. + +[Autoevals](https://github.com/braintrustdata/autoevals) is a great library of scorers to get you started. + +You can create your own using `createScorer`: + +```ts +import { createScorer, evalite } from "evalite"; + +const containsParis = createScorer({ + name: "Contains Paris", + description: "Checks if the output contains the word 'Paris'.", + score: (output) => { + return output.includes("Paris") ? 1 : 0; + }, +}); + +evalite("My Eval", { + data: async () => { + return [{ input: "Hello", expected: "Hello World!" }]; + }, + task: async (input) => { + return input + " World!"; + }, + scorers: [containsParis], +}); +``` + +The `name` and `description` of the scorer will be displayed in the Evalite UI. + +## Scorer Metadata + +You can provide metadata along with your custom scorer: + +```ts +import { createScorer } from "evalite"; + +const containsParis = createScorer({ + name: "Contains Paris", + description: "Checks if the output contains the word 'Paris'.", + score: (output) => { + return { + score: output.includes("Paris") ? 1 : 0, + metadata: { + // Can be anything! + }, + }; + }, +}); +``` + +This will be visible along with the score in the Evalite UI. 
+ + diff --git a/apps/evalite-docs/src/content/docs/guides/streams.md b/apps/evalite-docs/src/content/docs/guides/streams.md new file mode 100644 index 0000000..cc6f32b --- /dev/null +++ b/apps/evalite-docs/src/content/docs/guides/streams.md @@ -0,0 +1,28 @@ +--- +title: Streams +--- + +You can handle streams in Evalite by returning any async iterable (including a `ReadableStream`) from your task. This means you can test functions like the AI SDK `streamText` function easily: + +```ts +import { evalite } from "evalite"; +import { streamText } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { Factuality } from "autoevals"; + +evalite("My Eval", { + data: async () => { + return [{ input: "What is the capital of France?", expected: "Paris" }]; + }, + task: async (input) => { + const result = await streamText({ + model: openai("your-model"), + system: `Answer the question concisely.`, + prompt: input, + }); + + return result.textStream; + }, + scorers: [Factuality], +}); +``` diff --git a/apps/evalite-docs/src/content/docs/guides/traces.mdx b/apps/evalite-docs/src/content/docs/guides/traces.mdx new file mode 100644 index 0000000..3309373 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/guides/traces.mdx @@ -0,0 +1,81 @@ +--- +title: Traces +--- + +import { Aside } from "@astrojs/starlight/components"; + +Traces are used to track the behaviour of each individual call to an LLM inside your task. + +## `reportTrace` + +You can report a trace by calling `reportTrace` inside an `evalite` eval: + +```ts +import { evalite, type Evalite } from "evalite"; +import { reportTrace } from "evalite/evals"; + +evalite("My Eval", { + data: async () => { + return [{ input: "Hello", expected: "Hello World!" 
}]; + }, + task: async (input) => { + // Track the start time + const start = performance.now(); + + // Call our LLM + const result = await myLLMCall(); + + // Report the trace once it's finished + reportTrace({ + start, + end: performance.now(), + output: result.output, + input: [ + { + role: "user", + content: input, + }, + ], + usage: { + completionTokens: result.completionTokens, + promptTokens: result.promptTokens, + }, + }); + + // Return the output + return result.output; + }, + scorers: [Levenshtein], +}); +``` + + + +## `traceAISDKModel` + +If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in the `traceAISDKModel` function: + +```ts +import { traceAISDKModel } from "evalite/ai-sdk"; +import { generateText } from "ai"; +import { openai } from "@ai-sdk/openai"; + +// All calls to this model will be recorded in evalite! +const tracedModel = traceAISDKModel(openai("gpt-4o-mini")); + +const result = await generateText({ + model: tracedModel, + system: `Answer the question concisely.`, + prompt: `What is the capital of France?`, +}); +``` + + diff --git a/apps/evalite-docs/src/content/docs/quickstart.mdx b/apps/evalite-docs/src/content/docs/quickstart.mdx new file mode 100644 index 0000000..63db40a --- /dev/null +++ b/apps/evalite-docs/src/content/docs/quickstart.mdx @@ -0,0 +1,92 @@ +--- +title: Quickstart +description: Set up Evalite in an existing project and run your first eval. +--- + +import { Aside, Steps } from "@astrojs/starlight/components"; + +We're going to walk through setting up Evalite in an existing project. + + + +1. Install `evalite`, `vitest` and `autoevals`: + + Install `evalite`, `vitest`, and a scoring library like `autoevals`: + + ```bash + pnpm add -D evalite vitest autoevals + ``` + +2. Add an `eval:dev` script: + + Add an `eval:dev` script to your `package.json`: + + ```json + { + "scripts": { + "eval:dev": "evalite watch" + } + } + ``` + +3. 
Create your first eval: + + Create `my-eval.eval.ts`: + + ```ts + // my-eval.eval.ts + + import { evalite } from "evalite"; + import { Levenshtein } from "autoevals"; + + evalite("My Eval", { + // A function that returns an array of test data + // - TODO: Replace with your test data + data: async () => { + return [{ input: "Hello", expected: "Hello World!" }]; + }, + // The task to perform + // - TODO: Replace with your LLM call + task: async (input) => { + return input + " World!"; + }, + // The scoring methods for the eval + scorers: [Levenshtein], + }); + ``` + + + +4. Run Your Eval + + Run `pnpm run eval:dev`. + + ```bash + pnpm run eval:dev + ``` + + This runs `evalite`, which runs the evals: + + - Runs the `data` function to get the test data + - Runs the `task` function on each piece of test data + - Scores the output of the `task` function using the `scorers` + - Saves the results to a SQLite database in `node_modules/.evalite` + + It then: + + - Shows a UI for viewing the traces, scores, inputs and outputs at http://localhost:3006. + - If you only ran one eval, it also shows a table summarizing the eval in the terminal. + +5. View Your Eval + + Open http://localhost:3006 in your browser to view the results of the eval. + + + +### What Next? + +Head to the [AI SDK example](/examples/ai-sdk) to see a fully fleshed-out example of Evalite in action. diff --git a/apps/evalite-docs/src/content/docs/reference/example.md b/apps/evalite-docs/src/content/docs/reference/example.md deleted file mode 100644 index 0224f09..0000000 --- a/apps/evalite-docs/src/content/docs/reference/example.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Example Reference -description: A reference page in my new Starlight docs site. ---- - -Reference pages are ideal for outlining how things work in terse and clear terms. -Less concerned with telling a story or addressing a specific use case, they should give a comprehensive outline of what you're documenting. 
- -## Further reading - -- Read [about reference](https://diataxis.fr/reference/) in the Diátaxis framework diff --git a/package.json b/package.json index 71de5b6..392cac1 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,7 @@ "private": true, "scripts": { "dev": "turbo watch dev", + "docs:dev": "turbo watch dev --filter=evalite-docs", "wsl:dev": "pnpm run -r --parallel dev", "ci": "turbo build test lint after-build", "build": "turbo build after-build", diff --git a/packages/evalite/readme.md b/packages/evalite/readme.md index 6e5f1b8..925722b 100644 --- a/packages/evalite/readme.md +++ b/packages/evalite/readme.md @@ -121,201 +121,3 @@ This also works for `watch` mode: ```bash evalite watch my-eval.eval.ts ``` - -### Environment Variables - -To call your LLM from a third-party service, you'll likely need some environment variables to keep your API keys safe. - -Since Evalite is based on Vitest, it should already pick them up from your `vite.config.ts`. - -If you don't have Vitest set up, here's how to do it: - -1. Create a `.env` file in the root of your project: - -``` -OPENAI_API_KEY=your-api-key -``` - -2. Add `.env` to your `.gitignore`, if it's not already there - -``` -.env -``` - -3. Install `dotenv`: - -```bash -pnpm add -D dotenv -``` - -4. Add a `vite.config.ts` file: - -```ts -// vite.config.ts - -import { defineConfig } from "vite/config"; - -export default defineConfig({ - test: { - setupFiles: ["dotenv/config"], - }, -}); -``` - -Now, your environment variables will be available in your evals. - -### Scorers - -Scorers are used to score the output of your LLM call. - -[Autoevals](https://github.com/braintrustdata/autoevals) is a great library of scorers to get you started. 
- -You can create your own using `createScorer`: - -```ts -import { createScorer } from "evalite"; - -const containsParis = createScorer({ - name: "Contains Paris", - description: "Checks if the output contains the word 'Paris'.", - score: (output) => { - return output.includes("Paris") ? 1 : 0; - }, -}); - -evalite("My Eval", { - data: async () => { - return [{ input: "Hello", output: "Hello World!" }]; - }, - task: async (input) => { - return input + " World!"; - }, - scorers: [containsParis], -}); -``` - -#### Metadata - -You can provide metadata along with your custom scorer: - -```ts -import { createScorer } from "evalite"; - -const containsParis = createScorer({ - name: "Contains Paris", - description: "Checks if the output contains the word 'Paris'.", - score: (output) => { - return { - score: output.includes("Paris") ? 1 : 0, - metadata: { - // Can be anything! - }, - }; - }, -}); -``` - -This will be visible along with the score in the Evalite UI. - -> [!TIP] -> -> This is especially useful for debugging LLM-as-a-judge evals. In autoevals `Factuality` scorer, the metadata will include a rationale for why the scorer gave the score it did. - -### Traces - -Traces are used to track the behaviour of each individual call to an LLM inside your task. - -You can report a trace by calling `reportTrace` inside an `evalite` eval: - -```ts -import { evalite, type Evalite } from "evalite"; -import { reportTrace } from "evalite/evals"; - -evalite("My Eval", { - data: async () => { - return [{ input: "Hello", expected: "Hello World!" 
}]; - }, - task: async (input) => { - // Track the start time - const start = performance.now(); - - // Call our LLM - const result = await myLLMCall(); - - // Report the trace once it's finished - reportTrace({ - start, - end: performance.now(), - output: result.output, - input: [ - { - role: "user", - content: input, - }, - ], - usage: { - completionTokens: result.completionTokens, - promptTokens: result.promptTokens, - }, - }); - - // Return the output - return result.output; - }, - scorers: [Levenshtein], -}); -``` - -> [!NOTE] -> -> `reportTrace` is a no-op in production, so you can leave it in your code without worrying about performance. - -#### Reporting Traces Automatically - -If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in `traceAISDKModel` function: - -```ts -import { traceAISDKModel } from "evalite/ai-sdk"; -import { generateText } from "ai"; -import { openai } from "@ai-sdk/openai"; - -// All calls to this model will be recorded in evalite! -const tracedModel = traceAISDKModel(openai("gpt-4o-mini")); - -const result = await generateText({ - model: tracedModel, - system: `Answer the question concisely.`, - prompt: `What is the capital of France?`, -}); -``` - -> [!NOTE] -> -> `traceAISDKModel`, like `reportTrace`, is a no-op in production. - -### Streams - -You can handle streams in Evalite by returning any async iterable (including a `ReadableStream`) from your task. 
This means you can test functions like the AI SDK `streamText` function easily: - -```ts -import { evalite } from "evalite"; -import { streamText } from "ai"; -import { openai } from "@ai-sdk/openai"; -import { Factuality } from "autoevals"; - -evalite("My Eval", { - data: async () => { - return [{ input: "What is the capital of France?", expected: "Paris" }]; - }, - task: async (input) => { - const result = await streamText({ - model: openai("your-model"), - system: `Answer the question concisely.`, - prompt: input, - }); - - return result.textStream; - }, - scorers: [Factuality], -}); -```