From f24149b30d579244a78df27fe2f7454c34a934b4 Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Wed, 11 Dec 2024 15:15:09 +0000 Subject: [PATCH] WIP attempt --- packages/evalite-core/src/db.ts | 47 ++-- packages/evalite-core/src/types.ts | 2 +- packages/evalite-tests/tests/basics.test.ts | 2 +- packages/evalite/src/index.ts | 47 ++-- packages/evalite/src/reporter.ts | 273 +++++++++++--------- packages/evalite/src/run-vitest.ts | 7 - packages/example/src/fail.eval.ts | 2 +- pnpm-lock.yaml | 3 + 8 files changed, 213 insertions(+), 170 deletions(-) diff --git a/packages/evalite-core/src/db.ts b/packages/evalite-core/src/db.ts index 2a9dcf6..592d96e 100644 --- a/packages/evalite-core/src/db.ts +++ b/packages/evalite-core/src/db.ts @@ -138,9 +138,15 @@ export const saveRun = ( result?: { state: TaskState; }; - meta: { - evalite?: Evalite.TaskMeta; - }; + tasks: { + name: string; + result?: { + state: TaskState; + }; + meta: { + evalite?: Evalite.TaskMeta; + }; + }[]; }[]; }[]; } @@ -155,7 +161,7 @@ export const saveRun = ( .run({ runType }).lastInsertRowid; for (const file of files) { - for (const task of file.tasks) { + for (const suite of file.tasks) { const evalId = db .prepare( ` @@ -165,23 +171,24 @@ export const saveRun = ( ) .run({ runId, - name: task.name, + name: suite.name, filepath: file.filepath, - duration: task.meta.evalite?.duration ?? 0, - status: task.result?.state === "fail" ? "fail" : "success", + duration: 0, // TODO - go with max duration + status: suite.result?.state === "fail" ? "fail" : "success", }).lastInsertRowid; - if (task.meta.evalite) { - let order = 0; - for (const { input, output, scores, duration, expected, traces } of task - .meta.evalite.results) { + let order = 0; + for (const task of suite.tasks) { + if (task.meta.evalite?.result) { order += 1; + const { duration, input, output, expected, scores, traces } = + task.meta.evalite.result; const resultId = db .prepare( ` - INSERT INTO results (eval_id, duration, input, output, expected, col_order) - VALUES (@evalId, @duration, @input, @output, @expected, @col_order) - ` + INSERT INTO results (eval_id, duration, input, output, expected, col_order) + VALUES (@evalId, @duration, @input, @output, @expected, @col_order) + ` ) .run({ evalId, @@ -195,9 +202,9 @@ export const saveRun = ( for (const score of scores) { db.prepare( ` - INSERT INTO scores (result_id, name, score, description, metadata) - VALUES (@resultId, @name, @score, @description, @metadata) - ` + INSERT INTO scores (result_id, name, score, description, metadata) + VALUES (@resultId, @name, @score, @description, @metadata) + ` ).run({ resultId, name: score.name, @@ -212,9 +219,9 @@ export const saveRun = ( traceOrder += 1; db.prepare( ` - INSERT INTO traces (result_id, input, output, start_time, end_time, prompt_tokens, completion_tokens, col_order) - VALUES (@resultId, @input, @output, @start_time, @end_time, @prompt_tokens, @completion_tokens, @col_order) - ` + INSERT INTO traces (result_id, input, output, start_time, end_time, prompt_tokens, completion_tokens, col_order) + VALUES (@resultId, @input, @output, @start_time, @end_time, @prompt_tokens, @completion_tokens, @col_order) + ` ).run({ resultId, input: JSON.stringify(trace.input), diff --git a/packages/evalite-core/src/types.ts b/packages/evalite-core/src/types.ts index 8f92ef7..3088981 100644 --- a/packages/evalite-core/src/types.ts +++ b/packages/evalite-core/src/types.ts @@ -46,7 +46,7 @@ export declare namespace Evalite { }; export type TaskMeta = { - results: Result[]; + result: Result; duration: number | undefined; }; diff --git a/packages/evalite-tests/tests/basics.test.ts b/packages/evalite-tests/tests/basics.test.ts index c66db26..11c1659 100644 --- a/packages/evalite-tests/tests/basics.test.ts +++ b/packages/evalite-tests/tests/basics.test.ts @@ -22,7 +22,7 @@ it("Should report the basics correctly", async () => { expect(captured.getOutput()).toContain("100% basics.eval.ts (1 eval)"); }); -it("Should create a evalite-report.jsonl", async () => { +it("Should save the basic information in a db", async () => { using fixture = loadFixture("basics"); const captured = captureStdout(); diff --git a/packages/evalite/src/index.ts b/packages/evalite/src/index.ts index 075ec00..cec6a8a 100644 --- a/packages/evalite/src/index.ts +++ b/packages/evalite/src/index.ts @@ -1,5 +1,5 @@ import type { Evalite } from "@evalite/core"; -import { inject, it } from "vitest"; +import { afterEach, beforeEach, describe, inject, it } from "vitest"; import { reportTraceLocalStorage } from "./traces.js"; declare module "vitest" { @@ -78,34 +78,37 @@ export const evalite = ( testName: string, opts: Evalite.RunnerOpts ) => { - return it(testName, async ({ task }) => { - const data = await opts.data(); - const start = performance.now(); - const results = await Promise.all( - data.map(async ({ input, expected }): Promise => { + return describe(testName, async () => { + const dataset = await opts.data(); + + let index = 0; + for (const data of dataset) { + index++; + it(`${testName} ${index}`, { concurrent: true }, async ({ task }) => { + const start = performance.now(); + const traces: Evalite.Trace[] = []; reportTraceLocalStorage.enterWith((trace) => traces.push(trace)); + const { output, scores, duration } = await runTask({ - expected, - input, + expected: data.expected, + input: data.input, scores: opts.scorers, task: opts.task, }); - - return { - input, - output, - scores, - duration, - expected, - traces, + task.meta.evalite = { + result: { + duration, + expected: data.expected, + input: data.input, + output, + scores, + traces, + }, + duration: Math.round(performance.now() - start), }; - }) - ); - task.meta.evalite = { - results, - duration: Math.round(performance.now() - start), - }; + }); + } }); }; diff --git a/packages/evalite/src/reporter.ts b/packages/evalite/src/reporter.ts index 8a9a50c..8e5b181 100644 --- a/packages/evalite/src/reporter.ts +++ b/packages/evalite/src/reporter.ts @@ -1,12 +1,11 @@ -import type { RunnerTask, RunnerTestFile, TaskResultPack } from "vitest"; -import { BasicReporter } from "vitest/reporters"; - import { type Evalite } from "@evalite/core"; +import { saveRun, type SQLiteDatabase } from "@evalite/core/db"; import { table } from "table"; import c from "tinyrainbow"; -import { average, sum } from "./utils.js"; import { inspect } from "util"; -import { saveRun, type SQLiteDatabase } from "@evalite/core/db"; +import type { RunnerTask, RunnerTestFile, TaskResultPack, Test } from "vitest"; +import { BasicReporter } from "vitest/reporters"; +import { average, sum } from "./utils.js"; export interface EvaliteReporterOptions { isWatching: boolean; @@ -122,119 +121,119 @@ export default class EvaliteReporter extends BasicReporter { super.onFinished(files, errors); }; - protected override printTask(task: RunnerTask): void { - // Tasks can be files or individual tests, and - // this ensures we only print files - if ( - !("filepath" in task) || - !task.result?.state || - task.result?.state === "run" - ) { - return; - } - - const hasNoEvalite = task.tasks.every((t) => !t.meta.evalite); - - if (hasNoEvalite) { - return super.printTask(task); - } - - const scores: number[] = []; - - const failed = task.tasks.some((t) => t.result?.state === "fail"); - - for (const { meta } of task.tasks) { - if (meta.evalite) { - scores.push( - ...meta.evalite!.results.flatMap((r) => - r.scores.map((s) => s.score ?? 0) - ) - ); - } - } - - const totalScore = scores.reduce((a, b) => a + b, 0); - const averageScore = totalScore / scores.length; - - const title = failed ? c.red("✖") : displayScore(averageScore); - - const toLog = [ - ` ${title} `, - `${task.name} `, - c.dim( - `(${task.tasks.length} ${task.tasks.length > 1 ? "evals" : "eval"})` - ), - ]; - - // if (task.result.duration) { - // toLog.push(" " + c.dim(`${Math.round(task.result.duration ?? 0)}ms`)); - // } - - this.ctx.logger.log(toLog.join("")); - } - - override reportTestSummary(files: RunnerTestFile[], errors: unknown[]): void { - // this.printErrorsSummary(errors); // TODO - - const evals = files.flatMap((file) => - file.tasks.filter((task) => task.meta.evalite) - ); - - const scores = evals.flatMap((task) => - task.meta.evalite!.results.flatMap((r) => r.scores.map((s) => s.score)) - ); - - const totalScore = sum(scores, (score) => score ?? 0); - const averageScore = totalScore / scores.length; - - const collectTime = files.reduce((a, b) => a + (b.collectDuration || 0), 0); - const testsTime = files.reduce((a, b) => a + (b.result?.duration || 0), 0); - const setupTime = files.reduce((a, b) => a + (b.setupDuration || 0), 0); - - const totalDuration = collectTime + testsTime + setupTime; - - const failedTasks = files.filter((file) => { - return file.tasks.some((task) => task.result?.state === "fail"); - }); - - const scoreDisplay = - failedTasks.length > 0 - ? c.red("✖ ") + c.dim(`(${failedTasks.length} failed)`) - : displayScore(averageScore); - - this.ctx.logger.log( - [" ", c.dim("Score"), " ", scoreDisplay].join("") - ); - - this.ctx.logger.log( - [" ", c.dim("Eval Files"), " ", files.length].join("") - ); - - this.ctx.logger.log( - [ - " ", - c.dim("Evals"), - " ", - files.reduce((a, b) => a + b.tasks.length, 0), - ].join("") - ); - - this.ctx.logger.log( - [" ", c.dim("Duration"), " ", `${Math.round(totalDuration)}ms`].join( - "" - ) - ); - - if (evals.length === 1 && evals[0]) { - this.renderTable( - evals[0].meta.evalite!.results.map((result) => ({ - input: result.input, - output: result.output, - score: average(result.scores, (s) => s.score ?? 0), - })) - ); - } - } + // protected override printTask(task: RunnerTask): void { + // // Tasks can be files or individual tests, and + // // this ensures we only print files + // if ( + // !("filepath" in task) || + // !task.result?.state || + // task.result?.state === "run" + // ) { + // return; + // } + + // const hasNoEvalite = task.tasks.every((t) => !t.meta.evalite); + + // if (hasNoEvalite) { + // return super.printTask(task); + // } + + // const scores: number[] = []; + + // const failed = task.tasks.some((t) => t.result?.state === "fail"); + + // for (const { meta } of task.tasks) { + // if (meta.evalite) { + // scores.push( + // ...meta.evalite!.results.flatMap((r) => + // r.scores.map((s) => s.score ?? 0) + // ) + // ); + // } + // } + + // const totalScore = scores.reduce((a, b) => a + b, 0); + // const averageScore = totalScore / scores.length; + + // const title = failed ? c.red("✖") : displayScore(averageScore); + + // const toLog = [ + // ` ${title} `, + // `${task.name} `, + // c.dim( + // `(${task.tasks.length} ${task.tasks.length > 1 ? "evals" : "eval"})` + // ), + // ]; + + // // if (task.result.duration) { + // // toLog.push(" " + c.dim(`${Math.round(task.result.duration ?? 0)}ms`)); + // // } + + // this.ctx.logger.log(toLog.join("")); + // } + + // override reportTestSummary(files: RunnerTestFile[], errors: unknown[]): void { + // // this.printErrorsSummary(errors); // TODO + + // const evals = files.flatMap((file) => + // file.tasks.filter((task) => task.meta.evalite) + // ); + + // const scores = evals.flatMap((task) => + // task.meta.evalite!.results.flatMap((r) => r.scores.map((s) => s.score)) + // ); + + // const totalScore = sum(scores, (score) => score ?? 0); + // const averageScore = totalScore / scores.length; + + // const collectTime = files.reduce((a, b) => a + (b.collectDuration || 0), 0); + // const testsTime = files.reduce((a, b) => a + (b.result?.duration || 0), 0); + // const setupTime = files.reduce((a, b) => a + (b.setupDuration || 0), 0); + + // const totalDuration = collectTime + testsTime + setupTime; + + // const failedTasks = files.filter((file) => { + // return file.tasks.some((task) => task.result?.state === "fail"); + // }); + + // const scoreDisplay = + // failedTasks.length > 0 + // ? c.red("✖ ") + c.dim(`(${failedTasks.length} failed)`) + // : displayScore(averageScore); + + // this.ctx.logger.log( + // [" ", c.dim("Score"), " ", scoreDisplay].join("") + // ); + + // this.ctx.logger.log( + // [" ", c.dim("Eval Files"), " ", files.length].join("") + // ); + + // this.ctx.logger.log( + // [ + // " ", + // c.dim("Evals"), + // " ", + // files.reduce((a, b) => a + b.tasks.length, 0), + // ].join("") + // ); + + // this.ctx.logger.log( + // [" ", c.dim("Duration"), " ", `${Math.round(totalDuration)}ms`].join( + // "" + // ) + // ); + + // if (evals.length === 1 && evals[0]) { + // this.renderTable( + // evals[0].meta.evalite!.results.map((result) => ({ + // input: result.input, + // output: result.output, + // score: average(result.scores, (s) => s.score ?? 0), + // })) + // ); + // } + // } private renderTable( props: { @@ -294,6 +293,44 @@ export default class EvaliteReporter extends BasicReporter { ) ); } + + onTestStart(_test: Test) {} + onTestFinished(_test: Test) {} + + onTestFilePrepare(_file: RunnerTestFile) {} + onTestFileFinished(_file: RunnerTestFile) {} + + override onTaskUpdate(packs: TaskResultPack[]) { + const startingTestFiles: RunnerTestFile[] = []; + const finishedTestFiles: RunnerTestFile[] = []; + + const startingTests: Test[] = []; + const finishedTests: Test[] = []; + + for (const pack of packs) { + const task = this.ctx.state.idMap.get(pack[0]); + + if (task?.type === "suite" && "filepath" in task && task.result?.state) { + if (task?.result?.state === "run") { + startingTestFiles.push(task); + } + } + + if (task?.type === "test") { + if (task.result?.state === "run") { + startingTests.push(task); + } else if (task.result?.hooks?.afterEach !== "run") { + finishedTests.push(task); + } + } + } + + finishedTests.forEach((test) => this.onTestFinished(test)); + finishedTestFiles.forEach((file) => this.onTestFileFinished(file)); + + startingTestFiles.forEach((file) => this.onTestFilePrepare(file)); + startingTests.forEach((test) => this.onTestStart(test)); + } } const displayScore = (score: number) => { diff --git a/packages/evalite/src/run-vitest.ts b/packages/evalite/src/run-vitest.ts index eea59e0..083081c 100644 --- a/packages/evalite/src/run-vitest.ts +++ b/packages/evalite/src/run-vitest.ts @@ -60,13 +60,6 @@ export const runVitest = async (opts: { } ); - /** - * This is important to run before start, so that - * we immediately report the correct files to the - * server. - */ - await vitest.collect(filters); - await vitest.start(filters); const dispose = registerConsoleShortcuts( diff --git a/packages/example/src/fail.eval.ts b/packages/example/src/fail.eval.ts index fa93217..10a5c2a 100644 --- a/packages/example/src/fail.eval.ts +++ b/packages/example/src/fail.eval.ts @@ -8,7 +8,7 @@ evalite("Failure", { }, ], task: async (input) => { - throw new Error("It failed!"); + return "x"; }, scorers: [], }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 154a730..a8a18af 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -177,6 +177,9 @@ importers: '@evalite/core': specifier: workspace:* version: link:../evalite-core + '@vitest/runner': + specifier: ^2.1.8 + version: 2.1.8 commander: specifier: ^12.1.0 version: 12.1.0