---
title: AI SDK
---

Vercel's [AI SDK](https://sdk.vercel.ai/docs/introduction) is a great way to get started with AI in your apps.

It abstracts away the differences between AI providers, so you can **switch between them easily**.
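To illustrate that idea in isolation, provider-agnostic code is just code written against a shared interface. The types below are hypothetical stand-ins, not the AI SDK's real (much richer) interfaces:

```typescript
// Hypothetical stand-in for the AI SDK's model abstraction —
// the real SDK's interfaces are richer than this sketch.
interface LanguageModel {
  provider: string;
  generate(prompt: string): string;
}

// Two fake "providers" implementing the same interface.
const fakeOpenAI = (modelId: string): LanguageModel => ({
  provider: "openai",
  generate: (prompt) => `[openai:${modelId}] ${prompt}`,
});

const fakeAnthropic = (modelId: string): LanguageModel => ({
  provider: "anthropic",
  generate: (prompt) => `[anthropic:${modelId}] ${prompt}`,
});

// Application code depends only on the interface, so swapping
// providers is a one-line change at the call site.
function answer(model: LanguageModel, question: string): string {
  return model.generate(question);
}
```

Swapping `fakeOpenAI("gpt-4o-mini")` for `fakeAnthropic(...)` requires no change to `answer` — the same property the AI SDK gives you for real models.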
Here's how it might look with Evalite:

```ts
// my-eval.eval.ts

import { openai } from "@ai-sdk/openai";
import { streamText } from "ai";
import { Factuality, Levenshtein } from "autoevals";
import { evalite } from "evalite";
import { traceAISDKModel } from "evalite/ai-sdk";

evalite("Test Capitals", {
  data: async () => [
    {
      input: `What's the capital of France?`,
      expected: `Paris`,
    },
    {
      input: `What's the capital of Germany?`,
      expected: `Berlin`,
    },
  ],
  task: async (input) => {
    const result = await streamText({
      model: traceAISDKModel(openai("gpt-4o-mini")),
      system: `
        Answer the question concisely. Answer in as few words as possible.
        Remove full stops from the end of the output.
        If the country has no capital, return '<country> has no capital'.
        If the country does not exist, return 'Unknown'.
      `,
      prompt: input,
    });

    return result.textStream;
  },
  scorers: [Factuality, Levenshtein],
});
```
apps/evalite-docs/src/content/docs/guides/environment-variables.mdx
---
title: Environment Variables
---

import { Steps } from "@astrojs/starlight/components";

To call your LLM from a third-party service, you'll likely need some environment variables to keep your API keys safe.

Since **Evalite is based on Vitest**, it should already pick them up from your `vite.config.ts`.

## Setting Up Env Variables

If you don't have Vitest set up, here's how to do it:

<Steps>

1. Create a `.env` file in the root of your project:

   ```
   OPENAI_API_KEY=your-api-key
   ```

2. Add `.env` to your `.gitignore`, if it's not already there:

   ```
   .env
   ```

3. Install `dotenv`:

   ```bash
   pnpm add -D dotenv
   ```

4. Add a `vite.config.ts` file:

   ```ts
   // vite.config.ts

   import { defineConfig } from "vitest/config";

   export default defineConfig({
     test: {
       setupFiles: ["dotenv/config"],
     },
   });
   ```

</Steps>

Now your environment variables will be available in your evals.
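Once loaded, the variables are read from `process.env` as usual. As a sketch (a hypothetical helper, not part of Evalite), you might fail fast when a key is missing rather than send an undefined API key to the provider:

```typescript
// Hypothetical helper — throws early if a required key is absent.
function requireEnv(name: string): string {
  const value = process.env[name];
  if (!value) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
}

// e.g. const apiKey = requireEnv("OPENAI_API_KEY");
```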
---
title: Scorers
---

import { Aside } from "@astrojs/starlight/components";

Scorers are used to score the output of your LLM call.

[Autoevals](https://github.com/braintrustdata/autoevals) is a great library of scorers to get you started.

You can create your own using `createScorer`:

```ts
import { createScorer, evalite } from "evalite";

const containsParis = createScorer<string>({
  name: "Contains Paris",
  description: "Checks if the output contains the word 'Paris'.",
  score: (output) => {
    return output.includes("Paris") ? 1 : 0;
  },
});

evalite("My Eval", {
  data: async () => {
    return [{ input: "Hello", expected: "Hello World!" }];
  },
  task: async (input) => {
    return input + " World!";
  },
  scorers: [containsParis],
});
```

The `name` and `description` of the scorer will be displayed in the Evalite UI.
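Because a score is just a number between 0 and 1, a scorer's core logic can be written and tested in isolation. As an illustration, here's roughly the kind of logic a string-similarity scorer like `Levenshtein` is built on — a from-scratch sketch, not autoevals' actual implementation:

```typescript
// Classic single-row dynamic-programming edit distance.
function editDistance(a: string, b: string): number {
  const dp: number[] = Array.from({ length: b.length + 1 }, (_, j) => j);
  for (let i = 1; i <= a.length; i++) {
    let prev = dp[0];
    dp[0] = i;
    for (let j = 1; j <= b.length; j++) {
      const tmp = dp[j];
      dp[j] = Math.min(
        dp[j] + 1, // deletion
        dp[j - 1] + 1, // insertion
        prev + (a[i - 1] === b[j - 1] ? 0 : 1) // substitution
      );
      prev = tmp;
    }
  }
  return dp[b.length];
}

// Normalize the distance to a 0–1 score: identical strings score 1.
function levenshteinScore(output: string, expected: string): number {
  const maxLen = Math.max(output.length, expected.length);
  if (maxLen === 0) return 1;
  return 1 - editDistance(output, expected) / maxLen;
}
```

A function like `levenshteinScore` could then be dropped straight into `createScorer`'s `score` option.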
## Scorer Metadata

You can provide metadata along with your custom scorer:

```ts
import { createScorer } from "evalite";

const containsParis = createScorer<string>({
  name: "Contains Paris",
  description: "Checks if the output contains the word 'Paris'.",
  score: (output) => {
    return {
      score: output.includes("Paris") ? 1 : 0,
      metadata: {
        // Can be anything!
      },
    };
  },
});
```

This will be visible along with the score in the Evalite UI.

<Aside type="tip">

This is especially useful for debugging LLM-as-a-judge evals. In autoevals' `Factuality` scorer, the metadata includes a rationale for why the scorer gave the score it did.

</Aside>
---
title: Streams
---

You can handle streams in Evalite by returning any async iterable (including a `ReadableStream`) from your task. This makes it easy to test streaming functions like the AI SDK's `streamText`:

```ts
import { evalite } from "evalite";
import { streamText } from "ai";
import { openai } from "@ai-sdk/openai";
import { Factuality } from "autoevals";

evalite("My Eval", {
  data: async () => {
    return [{ input: "What is the capital of France?", expected: "Paris" }];
  },
  task: async (input) => {
    const result = await streamText({
      model: openai("your-model"),
      system: `Answer the question concisely.`,
      prompt: input,
    });

    return result.textStream;
  },
  scorers: [Factuality],
});
```
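Returning an async iterable means the consumer can process chunks as they arrive. Roughly speaking — this is a sketch of the general pattern, not Evalite's actual internals — collecting a text stream into a final string looks like this:

```typescript
// Any async iterable of strings works; an async generator
// stands in for streamText's textStream here.
async function* fakeTextStream(): AsyncIterable<string> {
  yield "Pa";
  yield "ris";
}

// Drain the stream with for-await and join the chunks.
async function collectText(stream: AsyncIterable<string>): Promise<string> {
  let text = "";
  for await (const chunk of stream) {
    text += chunk;
  }
  return text;
}
```

`await collectText(fakeTextStream())` resolves to `"Paris"` once the stream is exhausted.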
---
title: Traces
---

import { Aside } from "@astrojs/starlight/components";

Traces are used to track the behaviour of each individual call to an LLM inside your task.

## `reportTrace`

You can report a trace by calling `reportTrace` inside an `evalite` eval:

```ts
import { Levenshtein } from "autoevals";
import { evalite } from "evalite";
import { reportTrace } from "evalite/evals";

evalite("My Eval", {
  data: async () => {
    return [{ input: "Hello", expected: "Hello World!" }];
  },
  task: async (input) => {
    // Track the start time
    const start = performance.now();

    // Call our LLM
    const result = await myLLMCall();

    // Report the trace once it's finished
    reportTrace({
      start,
      end: performance.now(),
      output: result.output,
      input: [
        {
          role: "user",
          content: input,
        },
      ],
      usage: {
        completionTokens: result.completionTokens,
        promptTokens: result.promptTokens,
      },
    });

    // Return the output
    return result.output;
  },
  scorers: [Levenshtein],
});
```

<Aside>

`reportTrace` is a no-op in production, so you can leave it in your code without worrying about performance.

</Aside>
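If you report traces in several tasks, the timing boilerplate can be factored into a small wrapper. This is a hypothetical helper, not part of Evalite:

```typescript
// Hypothetical helper: time an async call and return the
// start/end timestamps alongside its result.
async function withTiming<T>(
  fn: () => Promise<T>
): Promise<{ start: number; end: number; output: T }> {
  const start = performance.now();
  const output = await fn();
  return { start, end: performance.now(), output };
}

// e.g. const { start, end, output } = await withTiming(() => myLLMCall());
```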
## `traceAISDKModel`

If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in the `traceAISDKModel` function:

```ts
import { traceAISDKModel } from "evalite/ai-sdk";
import { generateText } from "ai";
import { openai } from "@ai-sdk/openai";

// All calls to this model will be recorded in evalite!
const tracedModel = traceAISDKModel(openai("gpt-4o-mini"));

const result = await generateText({
  model: tracedModel,
  system: `Answer the question concisely.`,
  prompt: `What is the capital of France?`,
});
```

<Aside>

`traceAISDKModel`, like `reportTrace`, is a no-op in production.

</Aside>