Documented tracing

mattpocock · Dec 3, 2024 · 8eae50f · 8eae50f
1 parent 79d3bf7
commit 8eae50f
Show file tree

Hide file tree

Showing 8 changed files with 111 additions and 30 deletions.
diff --git a/packages/evalite-core/src/index.ts b/packages/evalite-core/src/index.ts
@@ -33,7 +33,7 @@ export declare namespace Evalite {
     results: Result[];
     duration: number | undefined;
     sourceCodeHash: string;
-    traces: Trace[];
+    traces: StoredTrace[];
   };
 
   export type Scorer<TExpected> = (
@@ -46,15 +46,20 @@ export declare namespace Evalite {
     scorers: Scorer<TExpected>[];
   };
 
-  export interface Trace {
+  export interface UserProvidedTrace {
     prompt: TracePrompt[];
-    usage: {
-      promptTokens: number;
-      completionTokens: number;
-    };
+    usage:
+      | {
+          promptTokens: number;
+          completionTokens: number;
+        }
+      | undefined;
     output: string;
     start: number;
     end: number;
+  }
+
+  export interface StoredTrace extends UserProvidedTrace {
     duration: number;
   }
 

diff --git a/packages/evalite-core/src/json-db.ts b/packages/evalite-core/src/json-db.ts
@@ -18,7 +18,7 @@ export type JsonDbRun = {
   scores: Evalite.Score[];
   duration: number;
   score: number;
-  traces: Evalite.Trace[];
+  traces: Evalite.UserProvidedTrace[];
 };
 
 export const appendToJsonDb = async (opts: {

diff --git a/packages/evalite/package.json b/packages/evalite/package.json
@@ -15,7 +15,9 @@
   },
   "exports": {
     ".": "./dist/index.js",
-    "./reporter": "./dist/reporter.js"
+    "./reporter": "./dist/reporter.js",
+    "./traces": "./dist/traces.js",
+    "./ai-sdk": "./dist/ai-sdk.js"
   },
   "dependencies": {
     "table": "^6.8.2",

diff --git a/packages/evalite/readme.md b/packages/evalite/readme.md
@@ -4,7 +4,7 @@ The TypeScript-native, open-source tool for testing LLM-powered apps.
 
 - Fully open source: **No API Key required**
 - Based on Vitest
-- Supports
+- Supports tracing and custom scorers
 
 ## Quickstart
 
@@ -67,22 +67,87 @@ This runs `evalite`, which runs the evals:
 - Runs the `data` function to get the test data
 - Runs the `task` function on each test data
 - Scores the output of the `task` function using the `scorers`
+- Appends the result of the eval to an `evalite-report.jsonl` file
 
-It then produces:
+It then:
 
-- A report of the
-- If you only ran one eval, it also shows table summarizing the eval in the terminal
+- Shows a UI for viewing the traces, scores, inputs and outputs at http://localhost:3006.
+- If you only ran one eval, it also shows a table summarizing the eval in the terminal.
 
-##
+### 5. View Your Eval
 
-I want a simple test runner that can:
+Open http://localhost:3006 in your browser to view the results of the eval.
 
--Run my evals on a watch script
--Show me a UI for viewing traces, scores, inputs and outputs
--Not need me to sign up for an API key
+## Guides
 
-So, I'm building one.
+### Traces
 
-It's based on Vitest, and it's called Evalite.
+Traces are used to track the behaviour of each individual call to an LLM inside your task.
 
-Here's an [early preview](https://www.aihero.dev/evalite-an-early-preview).
+You can report a trace by calling `reportTrace` inside an `evalite` eval:
+
+```ts
+import { evalite, type Evalite } from "evalite";
+import { reportTrace } from "evalite/traces";
+
+evalite("My Eval", {
+  data: async () => {
+    return [{ input: "Hello", output: "Hello World!" }];
+  },
+  task: async (input) => {
+    // Track the start time
+    const start = performance.now();
+
+    // Call our LLM
+    const result = await myLLMCall();
+
+    // Report the trace once it's finished
+    reportTrace({
+      start,
+      end: performance.now(),
+      output: result.output,
+      prompt: [
+        {
+          role: "user",
+          content: input,
+        },
+      ],
+      usage: {
+        completionTokens: result.completionTokens,
+        promptTokens: result.promptTokens,
+      },
+    });
+
+    // Return the output
+    return result.output;
+  },
+  scorers: [Levenshtein],
+});
+```
+
+> [!NOTE]
+>
+> `reportTrace` is a no-op in production, so you can leave it in your code without worrying about performance.
+
+#### Reporting Traces Automatically
+
+If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in the `traceAISDKModel` function:
+
+```ts
+import { traceAISDKModel } from "evalite/ai-sdk";
+import { generateText } from "ai";
+import { openai } from "@ai-sdk/openai";
+
+// All calls to this model will be recorded in evalite!
+const tracedModel = traceAISDKModel(openai("gpt-3.5-turbo"));
+
+const result = await generateText({
+  model: tracedModel,
+  system: `Answer the question concisely.`,
+  prompt: `What is the capital of France?`,
+});
+```
+
+> [!NOTE]
+>
+> `traceAISDKModel`, like `reportTrace`, is a no-op in production.
diff --git a/packages/evalite/src/trace-model.ts → packages/evalite/src/ai-sdk.ts b/packages/evalite/src/trace-model.ts → packages/evalite/src/ai-sdk.ts
@@ -1,7 +1,8 @@
 import { experimental_wrapLanguageModel, type LanguageModelV1 } from "ai";
-import { reportTrace } from "./traces.js";
+import { reportTrace, shouldReportTrace } from "./traces.js";
 
-export const traceAISDKModel = (model: LanguageModelV1) => {
+export const traceAISDKModel = (model: LanguageModelV1): LanguageModelV1 => {
+  if (!shouldReportTrace()) return model;
   return experimental_wrapLanguageModel({
     model,
     middleware: {
@@ -23,7 +24,7 @@ export const traceAISDKModel = (model: LanguageModelV1) => {
             const content = prompt.content.map((content) => {
               if (content.type !== "text") {
                 throw new Error(
-                  `Unsupported content type: ${content.type}. Only text is currently supported.`
+                  `Unsupported content type: ${content.type}. Only text is currently supported by traceAISDKModel.`
                 );
               }
 
@@ -39,7 +40,6 @@ export const traceAISDKModel = (model: LanguageModelV1) => {
             };
           }),
           usage: generated.usage,
-          duration: end - start,
           start,
           end,
         });

diff --git a/packages/evalite/src/index.ts b/packages/evalite/src/index.ts
@@ -42,7 +42,7 @@ export const evalite = <TInput, TExpected>(
       throw new Error("You must provide at least one scorer.");
     }
 
-    const traces: Evalite.Trace[] = [];
+    const traces: Evalite.StoredTrace[] = [];
 
     reportTraceLocalStorage.enterWith((trace) => traces.push(trace));
 

diff --git a/packages/evalite/src/tests/fixtures/traces/traces.eval.ts b/packages/evalite/src/tests/fixtures/traces/traces.eval.ts
@@ -1,5 +1,4 @@
 import { evalite, Levenshtein, reportTrace } from "../../../index.js";
-import { setTimeout } from "node:timers/promises";
 
 evalite("Traces", {
   data: () => {
@@ -12,7 +11,6 @@ evalite("Traces", {
   },
   task: async (input) => {
     reportTrace({
-      duration: 100,
       start: 0,
       end: 100,
       output: "abcdef",

diff --git a/packages/evalite/src/traces.ts b/packages/evalite/src/traces.ts
@@ -2,10 +2,18 @@ import type { Evalite } from "@evalite/core";
 import { AsyncLocalStorage } from "async_hooks";
 
 export const reportTraceLocalStorage = new AsyncLocalStorage<
-  (trace: Evalite.Trace) => void
+  (trace: Evalite.StoredTrace) => void
 >();
 
-export const reportTrace = (trace: Evalite.Trace) => {
+export const shouldReportTrace = (): boolean => {
+  return process.env.NODE_ENV === "test";
+};
+
+export const reportTrace = (trace: Evalite.UserProvidedTrace): void => {
+  if (!shouldReportTrace()) {
+    return;
+  }
+
   const _reportTrace = reportTraceLocalStorage.getStore();
 
   if (!_reportTrace) {
@@ -14,5 +22,8 @@ export const reportTrace = (trace: Evalite.Trace) => {
     );
   }
 
-  _reportTrace(trace);
+  _reportTrace({
+    ...trace,
+    duration: trace.end - trace.start,
+  });
 };