From e6cc0c2bcd898e42cdeb1334290b89fb3cf31a6d Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Tue, 3 Dec 2024 11:04:23 +0000 Subject: [PATCH 1/5] Added only, but failing tests --- packages/evalite-core/src/index.ts | 8 ++ packages/evalite/src/index.ts | 88 +++++++++++-------- .../src/tests/fixtures/only/only-2.eval.ts | 18 ++++ .../src/tests/fixtures/only/only.eval.ts | 32 +++++++ packages/evalite/src/tests/only.test.ts | 26 ++++++ 5 files changed, 137 insertions(+), 35 deletions(-) create mode 100644 packages/evalite/src/tests/fixtures/only/only-2.eval.ts create mode 100644 packages/evalite/src/tests/fixtures/only/only.eval.ts create mode 100644 packages/evalite/src/tests/only.test.ts diff --git a/packages/evalite-core/src/index.ts b/packages/evalite-core/src/index.ts index 3b8df4b..d91a0cd 100644 --- a/packages/evalite-core/src/index.ts +++ b/packages/evalite-core/src/index.ts @@ -19,6 +19,14 @@ export declare namespace Evalite { duration: number; }; + export interface Runner { + ( + testName: string, + runnerOpts: RunnerOpts + ): void; + only: Runner; + } + export type Score = { /** * A number between 0 and 1. diff --git a/packages/evalite/src/index.ts b/packages/evalite/src/index.ts index 329d5d5..984fd25 100644 --- a/packages/evalite/src/index.ts +++ b/packages/evalite/src/index.ts @@ -1,5 +1,5 @@ import type { Evalite } from "@evalite/core"; -import { inject, it } from "vitest"; +import { inject, it, type Test } from "vitest"; import { reportTraceLocalStorage } from "./traces.js"; declare module "vitest" { @@ -32,46 +32,64 @@ const runTask = async (opts: { }; }; -export const evalite = ( - testName: string, +const runEval = async ( + task: Readonly, opts: Evalite.RunnerOpts ) => { - return it(testName, async ({ task }) => { - if (opts.scorers.length === 0) { - throw new Error("You must provide at least one scorer."); - } + if (opts.scorers.length === 0) { + throw new Error("You must provide at least one scorer."); + } - const traces: Evalite.StoredTrace[] = []; + const traces: Evalite.StoredTrace[] = []; - reportTraceLocalStorage.enterWith((trace) => traces.push(trace)); + reportTraceLocalStorage.enterWith((trace) => traces.push(trace)); - const sourceCodeHash = inject("evaliteInputHash"); + const sourceCodeHash = inject("evaliteInputHash"); - const data = await opts.data(); - const start = performance.now(); - const results = await Promise.all( - data.map(async ({ input, expected }): Promise => { - const { result, scores, duration } = await runTask({ - expected, - input, - scores: opts.scorers, - task: opts.task, - }); + const data = await opts.data(); + const start = performance.now(); + const results = await Promise.all( + data.map(async ({ input, expected }): Promise => { + const { result, scores, duration } = await runTask({ + expected, + input, + scores: opts.scorers, + task: opts.task, + }); - return { - input, - result, - scores, - duration, - expected, - }; - }) - ); - task.meta.evalite = { - results, - duration: Math.round(performance.now() - start), - sourceCodeHash, - traces, - }; + return { + input, + result, + scores, + duration, + expected, + }; + }) + ); + task.meta.evalite = { + results, + duration: Math.round(performance.now() - start), + sourceCodeHash, + traces, + }; +}; + +function evaliteBase( + testName: string, + opts: Evalite.RunnerOpts +) { + return it(testName, async ({ task }) => { + await runEval(task, opts); + }); +} + +evaliteBase.only = function evaliteOnly( + testName: string, + opts: Evalite.RunnerOpts +) { + return it.only(testName, async ({ task }) => { + await runEval(task, opts); }); }; + +export const evalite = evaliteBase as Evalite.Runner; diff --git a/packages/evalite/src/tests/fixtures/only/only-2.eval.ts b/packages/evalite/src/tests/fixtures/only/only-2.eval.ts new file mode 100644 index 0000000..38e2c91 --- /dev/null +++ b/packages/evalite/src/tests/fixtures/only/only-2.eval.ts @@ -0,0 +1,18 @@ +import { evalite } from "../../../index.js"; +import { reportTrace } from "../../../traces.js"; +import { Levenshtein } from "autoevals"; + +evalite("Also Not Run", { + data: () => { + return [ + { + input: "abc", + expected: "abcdef", + }, + ]; + }, + task: async (input) => { + return input + "def"; + }, + scorers: [Levenshtein], +}); diff --git a/packages/evalite/src/tests/fixtures/only/only.eval.ts b/packages/evalite/src/tests/fixtures/only/only.eval.ts new file mode 100644 index 0000000..07a1eab --- /dev/null +++ b/packages/evalite/src/tests/fixtures/only/only.eval.ts @@ -0,0 +1,32 @@ +import { Levenshtein } from "autoevals"; +import { evalite } from "../../../index.js"; + +evalite.only("Only", { + data: () => { + return [ + { + input: "abc", + expected: "abcdef", + }, + ]; + }, + task: async (input) => { + return input + "def"; + }, + scorers: [Levenshtein], +}); + +evalite("Not Run", { + data: () => { + return [ + { + input: "abc", + expected: "abcdef", + }, + ]; + }, + task: async (input) => { + return input + "def"; + }, + scorers: [Levenshtein], +}); diff --git a/packages/evalite/src/tests/only.test.ts b/packages/evalite/src/tests/only.test.ts new file mode 100644 index 0000000..91a99cc --- /dev/null +++ b/packages/evalite/src/tests/only.test.ts @@ -0,0 +1,26 @@ +import { getJsonDbEvals } from "@evalite/core"; +import { assert, expect, it } from "vitest"; +import { runVitest } from "../command.js"; +import { captureStdout, loadFixture } from "./test-utils.js"; + +it.only("Should only run the targeted eval", async () => { + using fixture = loadFixture("only"); + + const captured = captureStdout(); + + await runVitest({ + cwd: fixture.dir, + path: undefined, + testOutputWritable: captured.writable, + }); + + console.log(captured.getOutput()); + + const evals = await getJsonDbEvals({ + dbLocation: fixture.jsonDbLocation, + }); + + expect(evals["Only"]).toBeDefined(); + expect(evals["Not Run"]).toBeUndefined(); + expect(evals["Also Not Run"]).toBeUndefined(); +}); From 18266f819e5274aaf1b458de0907a9425d9117fb Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Tue, 3 Dec 2024 11:07:10 +0000 Subject: [PATCH 2/5] Added a precommit hook for handling the readme --- .husky/pre-commit | 1 + package.json | 6 +- pnpm-lock.yaml | 10 +++ readme.md | 153 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 .husky/pre-commit create mode 100644 readme.md diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100644 index 0000000..42f8db8 --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1 @@ +cp packages/evalite/readme.md readme.md \ No newline at end of file diff --git a/package.json b/package.json index 60cd439..caee378 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,8 @@ "ci": "turbo build test lint", "build": "turbo build", "release": "pnpm run ci && changeset publish", - "test-example": "cd packages/example && evalite" + "test-example": "cd packages/example && evalite", + "prepare": "husky" }, "keywords": [], "author": "Matt Pocock", @@ -26,7 +27,8 @@ "tsx": "^4.19.0", "turbo": "2.3.3", "typescript": "5.6.2", - "vitest": "^2.0.5" + "vitest": "^2.0.5", + "husky": "^9.1.7" }, "resolutions": { "typescript": "5.6.2" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a5aeb4f..bc9822a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,9 @@ importers: '@types/node': specifier: ^22.7.7 version: 22.7.7 + husky: + specifier: ^9.1.7 + version: 9.1.7 prettier: specifier: ^3.3.3 version: 3.3.3 @@ -2726,6 +2729,11 @@ packages: humanize-ms@1.2.1: resolution: {integrity: sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==} + husky@9.1.7: + resolution: {integrity: sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA==} + engines: {node: '>=18'} + hasBin: true + iconv-lite@0.4.24: resolution: {integrity: sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==} engines: {node: '>=0.10.0'} @@ -7961,6 +7969,8 @@ snapshots: dependencies: ms: 2.1.3 + husky@9.1.7: {} + iconv-lite@0.4.24: dependencies: safer-buffer: 2.1.2 diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..af6012f --- /dev/null +++ b/readme.md @@ -0,0 +1,153 @@ +# Evalite + +The TypeScript-native, open-source tool for testing LLM-powered apps. + +- Fully open source: **No API Key required** +- Based on Vitest +- Supports tracing, custom scorers, and + +## Quickstart + +### 1. Install `evalite` and `autoevals`: + +Install `evalite`, and a scoring library like `autoevals`: + +```bash +pnpm add -D evalite autoevals +``` + +### 2. Add an `eval` script: + +Add an `eval` script to your package.json: + +```json +{ + "scripts": { + "eval": "evalite" + } +} +``` + +### 3. Create your first eval: + +Create `my-eval.eval.ts`: + +```ts +// my-eval.eval.ts + +import { evalite } from "evalite"; +import { Levenshtein } from "autoevals"; + +evalite("My Eval", { + // A function that returns an array of test data + // - TODO: Replace with your test data + data: async () => { + return [{ input: "Hello", output: "Hello World!" }]; + }, + // The task to perform + // - TODO: Replace with your LLM call + task: async (input) => { + return input + " World!"; + }, + // The scoring methods for the eval + scorers: [Levenshtein], +}); +``` + +> [!NOTE] +> +> `.eval.ts` is the extension Evalite looks for when scanning for evals. + +### 4. Run Your Eval + +Run `pnpm run eval`. + +This runs `evalite`, which runs the evals: + +- Runs the `data` function to get the test data +- Runs the `task` function on each test data +- Scores the output of the `task` function using the `scorers` +- Appends the result of the eval to a `evalite-report.jsonl` file + +It then: + +- Shows a UI for viewing the traces, scores, inputs and outputs at http://localhost:3006. +- If you only ran one eval, it also shows a table summarizing the eval in the terminal. + +### 5. View Your Eval + +Open http://localhost:3006 in your browser to view the results of the eval. + +## Guides + +### Traces + +Traces are used to track the behaviour of each individual call to an LLM inside your task. + +You can report a trace by calling `reportTrace` inside an `evalite` eval: + +```ts +import { evalite, type Evalite } from "evalite"; +import { reportTrace } from "evalite/evals"; + +evalite("My Eval", { + data: async () => { + return [{ input: "Hello", output: "Hello World!" }]; + }, + task: async (input) => { + // Track the start time + const start = performance.now(); + + // Call our LLM + const result = await myLLMCall(); + + // Report the trace once it's finished + reportTrace({ + start, + end: performance.now(), + output: result.output, + prompt: [ + { + role: "user", + content: input, + }, + ], + usage: { + completionTokens: result.completionTokens, + promptTokens: result.promptTokens, + }, + }); + + // Return the output + return result.output; + }, + scorers: [Levenshtein], +}); +``` + +> [!NOTE] +> +> `reportTrace` is a no-op in production, so you can leave it in your code without worrying about performance. + +#### Reporting Traces Automatically + +If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in `traceAISDKModel` function: + +```ts +import { traceAISDKModel } from "evalite/ai-sdk"; +import { generateText } from "ai"; +import { openai } from "@ai-sdk/openai"; + +// All calls to this model will be recorded in evalite! +const tracedModel = traceAISDKModel(openai("gpt-3.5-turbo")); + +const result = await generateText({ + model: tracedModel, + system: `Answer the question concisely.`, + prompt: `What is the capital of France?`, +}); +``` + +> [!NOTE] +> +> `traceAISDKModel`, like `reportTrace`, is a no-op in production. From 425e2a752b5fde5aca9481e8701fc6d17914f900 Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Tue, 3 Dec 2024 11:07:42 +0000 Subject: [PATCH 3/5] Tweak --- packages/evalite/readme.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/evalite/readme.md b/packages/evalite/readme.md index af6012f..d0fec28 100644 --- a/packages/evalite/readme.md +++ b/packages/evalite/readme.md @@ -1,3 +1,5 @@ + + # Evalite The TypeScript-native, open-source tool for testing LLM-powered apps. From 9f5ba7ad4a4d7837c0f2bb444b93e676a145cf79 Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Tue, 3 Dec 2024 11:07:56 +0000 Subject: [PATCH 4/5] Tweak --- .husky/pre-commit | 3 ++- readme.md | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.husky/pre-commit b/.husky/pre-commit index 42f8db8..ebbfcbc 100644 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -1 +1,2 @@ -cp packages/evalite/readme.md readme.md \ No newline at end of file +cp packages/evalite/readme.md readme.md +git add readme.md \ No newline at end of file diff --git a/readme.md b/readme.md index af6012f..d0fec28 100644 --- a/readme.md +++ b/readme.md @@ -1,3 +1,5 @@ + + # Evalite The TypeScript-native, open-source tool for testing LLM-powered apps. From 72d5e7f7dfb40a61c27e8c9094450bdca1525830 Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Tue, 3 Dec 2024 11:14:59 +0000 Subject: [PATCH 5/5] Fix --- packages/evalite-core/src/json-db.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/evalite-core/src/json-db.ts b/packages/evalite-core/src/json-db.ts index baa57aa..5a8ed76 100644 --- a/packages/evalite-core/src/json-db.ts +++ b/packages/evalite-core/src/json-db.ts @@ -41,7 +41,7 @@ export const appendToJsonDb = async (opts: { const jsonDbTask: JsonDBEval = { name: task.name, score: average(task.meta.evalite?.results || [], (t) => { - return average(t.scores, (s) => s.score); + return average(t.scores, (s) => s.score ?? 0); }), duration: task.meta.evalite?.duration ?? 0, results: [], @@ -58,7 +58,7 @@ export const appendToJsonDb = async (opts: { expected, scores, duration, - score: average(scores, (s) => s.score), + score: average(scores, (s) => s.score ?? 0), traces: task.meta.evalite.traces, }); }