From e3042019cc98c676bc1893d14957a28d7a85ab41 Mon Sep 17 00:00:00 2001 From: Matt Pocock Date: Wed, 11 Dec 2024 16:18:04 +0000 Subject: [PATCH] Completed refactor --- packages/evalite-core/src/db.ts | 15 +- packages/evalite-core/src/db/tests/db.test.ts | 46 ++-- packages/evalite-core/src/types.ts | 1 + packages/evalite-tests/tests/failing.test.ts | 3 + packages/evalite/package.json | 3 +- packages/evalite/src/index.ts | 1 + packages/evalite/src/reporter.ts | 234 +++++++++--------- packages/example/src/fail.eval.ts | 2 +- 8 files changed, 163 insertions(+), 142 deletions(-) diff --git a/packages/evalite-core/src/db.ts b/packages/evalite-core/src/db.ts index 592d96e..3535b7e 100644 --- a/packages/evalite-core/src/db.ts +++ b/packages/evalite-core/src/db.ts @@ -2,6 +2,7 @@ import type * as BetterSqlite3 from "better-sqlite3"; import Database from "better-sqlite3"; import type { Evalite } from "./index.js"; import type { TaskState } from "vitest"; +import { max } from "./utils.js"; export type SQLiteDatabase = BetterSqlite3.Database; @@ -138,7 +139,7 @@ export const saveRun = ( result?: { state: TaskState; }; - tasks: { + tasks?: { name: string; result?: { state: TaskState; @@ -162,6 +163,12 @@ export const saveRun = ( for (const file of files) { for (const suite of file.tasks) { + if (!suite.tasks) { + throw new Error( + "An unknown error occurred - did you nest evalite inside a describe block?" + ); + } + const evalId = db .prepare( ` @@ -173,15 +180,13 @@ export const saveRun = ( runId, name: suite.name, filepath: file.filepath, - duration: 0, // TODO - go with max duration + duration: max(suite.tasks, (t) => t.meta.evalite?.duration ?? 0), status: suite.result?.state === "fail" ? "fail" : "success", }).lastInsertRowid; - let order = 0; for (const task of suite.tasks) { if (task.meta.evalite?.result) { - order += 1; - const { duration, input, output, expected, scores, traces } = + const { duration, input, output, expected, scores, traces, order } = task.meta.evalite.result; const resultId = db .prepare( diff --git a/packages/evalite-core/src/db/tests/db.test.ts b/packages/evalite-core/src/db/tests/db.test.ts index dbcc9e2..f13c991 100644 --- a/packages/evalite-core/src/db/tests/db.test.ts +++ b/packages/evalite-core/src/db/tests/db.test.ts @@ -20,30 +20,34 @@ describe("getEvalsAverageScores", () => { tasks: [ { name: "task", - meta: { - evalite: { - duration: 100, - results: [ - { - input: "input", + tasks: [ + { + name: "task 1", + meta: { + evalite: { duration: 100, - output: "result", - expected: "expected", - scores: [ - { - name: "score", - score: 1, - }, - { - name: "Other Score", - score: 0, - }, - ], - traces: [], + result: { + order: 1, + input: "input", + duration: 100, + output: "result", + expected: "expected", + scores: [ + { + name: "score", + score: 1, + }, + { + name: "Other Score", + score: 0, + }, + ], + traces: [], + }, }, - ], + }, }, - }, + ], }, ], }, diff --git a/packages/evalite-core/src/types.ts b/packages/evalite-core/src/types.ts index 3088981..a8d2b80 100644 --- a/packages/evalite-core/src/types.ts +++ b/packages/evalite-core/src/types.ts @@ -13,6 +13,7 @@ export declare namespace Evalite { export type MaybePromise = T | Promise; export type Result = { + order: number; input: unknown; output: unknown; expected: unknown; diff --git a/packages/evalite-tests/tests/failing.test.ts b/packages/evalite-tests/tests/failing.test.ts index 314c4f5..6e53459 100644 --- a/packages/evalite-tests/tests/failing.test.ts +++ b/packages/evalite-tests/tests/failing.test.ts @@ -18,6 +18,9 @@ it("Should report a failing test", async () => { expect(captured.getOutput()).toContain("failing-test.eval.ts"); expect(captured.getOutput()).toContain("Score ✖ (1 failed)"); + + // Should not display a table + expect(captured.getOutput()).not.toContain("Input"); }); it("Should save the test as failed in the database", async () => { diff --git a/packages/evalite/package.json b/packages/evalite/package.json index da900b6..7d71925 100644 --- a/packages/evalite/package.json +++ b/packages/evalite/package.json @@ -39,7 +39,8 @@ "table": "^6.8.2", "commander": "^12.1.0", "tinyrainbow": "^1.2.0", - "@evalite/core": "workspace:*" + "@evalite/core": "workspace:*", + "@vitest/runner": "^2.1.8" }, "devDependencies": { "@types/ws": "^8.5.13", diff --git a/packages/evalite/src/index.ts b/packages/evalite/src/index.ts index cec6a8a..cafd0bf 100644 --- a/packages/evalite/src/index.ts +++ b/packages/evalite/src/index.ts @@ -98,6 +98,7 @@ export const evalite = ( }); task.meta.evalite = { result: { + order: index, duration, expected: data.expected, input: data.input, diff --git a/packages/evalite/src/reporter.ts b/packages/evalite/src/reporter.ts index 8e5b181..0bf0c43 100644 --- a/packages/evalite/src/reporter.ts +++ b/packages/evalite/src/reporter.ts @@ -6,6 +6,7 @@ import { inspect } from "util"; import type { RunnerTask, RunnerTestFile, TaskResultPack, Test } from "vitest"; import { BasicReporter } from "vitest/reporters"; import { average, sum } from "./utils.js"; +import { getSuites, getTasks, getTests } from "@vitest/runner/utils"; export interface EvaliteReporterOptions { isWatching: boolean; @@ -121,119 +122,121 @@ export default class EvaliteReporter extends BasicReporter { super.onFinished(files, errors); }; - // protected override printTask(task: RunnerTask): void { - // // Tasks can be files or individual tests, and - // // this ensures we only print files - // if ( - // !("filepath" in task) || - // !task.result?.state || - // task.result?.state === "run" - // ) { - // return; - // } - - // const hasNoEvalite = task.tasks.every((t) => !t.meta.evalite); - - // if (hasNoEvalite) { - // return super.printTask(task); - // } - - // const scores: number[] = []; - - // const failed = task.tasks.some((t) => t.result?.state === "fail"); - - // for (const { meta } of task.tasks) { - // if (meta.evalite) { - // scores.push( - // ...meta.evalite!.results.flatMap((r) => - // r.scores.map((s) => s.score ?? 0) - // ) - // ); - // } - // } - - // const totalScore = scores.reduce((a, b) => a + b, 0); - // const averageScore = totalScore / scores.length; - - // const title = failed ? c.red("✖") : displayScore(averageScore); - - // const toLog = [ - // ` ${title} `, - // `${task.name} `, - // c.dim( - // `(${task.tasks.length} ${task.tasks.length > 1 ? "evals" : "eval"})` - // ), - // ]; - - // // if (task.result.duration) { - // // toLog.push(" " + c.dim(`${Math.round(task.result.duration ?? 0)}ms`)); - // // } - - // this.ctx.logger.log(toLog.join("")); - // } - - // override reportTestSummary(files: RunnerTestFile[], errors: unknown[]): void { - // // this.printErrorsSummary(errors); // TODO - - // const evals = files.flatMap((file) => - // file.tasks.filter((task) => task.meta.evalite) - // ); - - // const scores = evals.flatMap((task) => - // task.meta.evalite!.results.flatMap((r) => r.scores.map((s) => s.score)) - // ); - - // const totalScore = sum(scores, (score) => score ?? 0); - // const averageScore = totalScore / scores.length; - - // const collectTime = files.reduce((a, b) => a + (b.collectDuration || 0), 0); - // const testsTime = files.reduce((a, b) => a + (b.result?.duration || 0), 0); - // const setupTime = files.reduce((a, b) => a + (b.setupDuration || 0), 0); - - // const totalDuration = collectTime + testsTime + setupTime; - - // const failedTasks = files.filter((file) => { - // return file.tasks.some((task) => task.result?.state === "fail"); - // }); - - // const scoreDisplay = - // failedTasks.length > 0 - // ? c.red("✖ ") + c.dim(`(${failedTasks.length} failed)`) - // : displayScore(averageScore); - - // this.ctx.logger.log( - // [" ", c.dim("Score"), " ", scoreDisplay].join("") - // ); - - // this.ctx.logger.log( - // [" ", c.dim("Eval Files"), " ", files.length].join("") - // ); - - // this.ctx.logger.log( - // [ - // " ", - // c.dim("Evals"), - // " ", - // files.reduce((a, b) => a + b.tasks.length, 0), - // ].join("") - // ); - - // this.ctx.logger.log( - // [" ", c.dim("Duration"), " ", `${Math.round(totalDuration)}ms`].join( - // "" - // ) - // ); - - // if (evals.length === 1 && evals[0]) { - // this.renderTable( - // evals[0].meta.evalite!.results.map((result) => ({ - // input: result.input, - // output: result.output, - // score: average(result.scores, (s) => s.score ?? 0), - // })) - // ); - // } - // } + protected override printTask(file: RunnerTask): void { + // Tasks can be files or individual tests, and + // this ensures we only print files + if ( + !("filepath" in file) || + !file.result?.state || + file.result?.state === "run" + ) { + return; + } + + const tests = getTests(file); + + const hasNoEvalite = tests.every((t) => !t.meta.evalite); + + if (hasNoEvalite) { + return super.printTask(file); + } + + const scores: number[] = []; + + const failed = tests.some((t) => t.result?.state === "fail"); + + for (const { meta } of tests) { + if (meta.evalite) { + scores.push(...meta.evalite!.result.scores.map((s) => s.score ?? 0)); + } + } + + const totalScore = scores.reduce((a, b) => a + b, 0); + const averageScore = totalScore / scores.length; + + const title = failed ? c.red("✖") : displayScore(averageScore); + + const toLog = [ + ` ${title} `, + `${file.name} `, + c.dim( + `(${file.tasks.length} ${file.tasks.length > 1 ? "evals" : "eval"})` + ), + ]; + + // if (task.result.duration) { + // toLog.push(" " + c.dim(`${Math.round(task.result.duration ?? 0)}ms`)); + // } + + this.ctx.logger.log(toLog.join("")); + } + + override reportTestSummary(files: RunnerTestFile[], errors: unknown[]): void { + /** + * These tasks are the actual tests that were run + */ + const tests = getTests(files); + + const scores = tests.flatMap((test) => + test.meta.evalite?.result.scores.map((s) => s.score ?? 0) + ); + + const totalScore = sum(scores, (score) => score ?? 0); + const averageScore = totalScore / scores.length; + + const collectTime = files.reduce((a, b) => a + (b.collectDuration || 0), 0); + const testsTime = files.reduce((a, b) => a + (b.result?.duration || 0), 0); + const setupTime = files.reduce((a, b) => a + (b.setupDuration || 0), 0); + + const totalDuration = collectTime + testsTime + setupTime; + + const failedTasks = files.filter((file) => { + return file.tasks.some((task) => task.result?.state === "fail"); + }); + + const scoreDisplay = + failedTasks.length > 0 + ? c.red("✖ ") + c.dim(`(${failedTasks.length} failed)`) + : displayScore(averageScore); + + this.ctx.logger.log( + [" ", c.dim("Score"), " ", scoreDisplay].join("") + ); + + this.ctx.logger.log( + [" ", c.dim("Eval Files"), " ", files.length].join("") + ); + + this.ctx.logger.log( + [ + " ", + c.dim("Evals"), + " ", + files.reduce((a, b) => a + b.tasks.length, 0), + ].join("") + ); + + this.ctx.logger.log( + [" ", c.dim("Duration"), " ", `${Math.round(totalDuration)}ms`].join( + "" + ) + ); + + const totalFiles = new Set(files.map((f) => f.filepath)).size; + + if (totalFiles === 1 && failedTasks.length === 0) { + this.renderTable( + tests + .filter((t) => typeof t.meta.evalite === "object") + .map((t) => t.meta.evalite!.result) + .map((result) => ({ + input: result.input, + output: result.output, + score: average(result.scores, (s) => s.score ?? 0), + })) + ); + } + } private renderTable( props: { @@ -330,10 +333,13 @@ export default class EvaliteReporter extends BasicReporter { startingTestFiles.forEach((file) => this.onTestFilePrepare(file)); startingTests.forEach((test) => this.onTestStart(test)); + + super.onTaskUpdate(packs); } } -const displayScore = (score: number) => { +const displayScore = (_score: number) => { + const score = Number.isNaN(_score) ? 0 : _score; const percentageScore = Math.round(score * 100); if (percentageScore >= 80) { return c.bold(c.green(percentageScore + "%")); diff --git a/packages/example/src/fail.eval.ts b/packages/example/src/fail.eval.ts index 10a5c2a..da42110 100644 --- a/packages/example/src/fail.eval.ts +++ b/packages/example/src/fail.eval.ts @@ -8,7 +8,7 @@ evalite("Failure", { }, ], task: async (input) => { - return "x"; + throw new Error("Fail"); }, scorers: [], });