Skip to content

Commit

Permalink
Showed a table when there's only one eval running
Browse files Browse the repository at this point in the history
  • Loading branch information
mattpocock committed Dec 2, 2024
1 parent a5263dc commit a85f7ee
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 28 deletions.
1 change: 1 addition & 0 deletions packages/evalite-vitest/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"./reporter": "./dist/reporter.js"
},
"dependencies": {
"table": "^6.8.2",
"commander": "^12.1.0",
"js-levenshtein": "^1.1.6",
"tinyrainbow": "^1.2.0",
Expand Down
91 changes: 70 additions & 21 deletions packages/evalite-vitest/src/reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@ import { BasicReporter } from "vitest/reporters";

import { appendToJsonDb, DEFAULT_SERVER_PORT } from "@evalite/core";
import c from "tinyrainbow";
import { average, sum } from "./utils.js";
import { table } from "table";

/**
 * Options accepted by the `EvaliteReporter` constructor.
 */
export interface EvaliteReporterOptions {
/**
 * Location of the JSON database.
 * NOTE(review): presumably the path that results are appended to via
 * `appendToJsonDb` — confirm against the reporter's usage.
 */
jsonDbLocation: string;
}

export default class EvaliteReporter extends BasicReporter {
private opts: EvaliteReporterOptions;

// private server: Server;
constructor(opts: EvaliteReporterOptions) {
super();
Expand All @@ -19,7 +22,6 @@ export default class EvaliteReporter extends BasicReporter {
// jsonDbLocation: "./evalite-report.jsonl",
// });
}

override onInit(ctx: any): void {
this.ctx = ctx;
this.start = performance.now();
Expand Down Expand Up @@ -96,12 +98,9 @@ export default class EvaliteReporter extends BasicReporter {
}

const totalScore = scores.reduce((a, b) => a + b, 0);
const averageScore = Math.round((totalScore / scores.length) * 100);

const color =
averageScore >= 80 ? c.green : averageScore >= 50 ? c.yellow : c.red;
const averageScore = totalScore / scores.length;

const title = failed ? c.red("✖") : c.bold(color(averageScore + "%"));
const title = failed ? c.red("✖") : displayScore(averageScore);

const toLog = [
` ${title} `,
Expand All @@ -119,22 +118,18 @@ export default class EvaliteReporter extends BasicReporter {
}

override reportTestSummary(files: RunnerTestFile[], errors: unknown[]): void {
const scores = files.flatMap((file) =>
file.tasks.flatMap((task) => {
if (task.meta.evalite) {
return task.meta.evalite.results.flatMap((r) =>
r.scores.map((s) => s.score)
);
}
return [];
})
// this.printErrorsSummary(errors); // TODO

const evals = files.flatMap((file) =>
file.tasks.filter((task) => task.meta.evalite)
);

const totalScore = scores.reduce((a, b) => a + b, 0);
const averageScore = Math.round((totalScore / scores.length) * 100);
const scores = evals.flatMap((task) =>
task.meta.evalite!.results.flatMap((r) => r.scores.map((s) => s.score))
);

const scoreColor =
averageScore >= 80 ? c.green : averageScore >= 50 ? c.yellow : c.red;
const totalScore = sum(scores, (score) => score);
const averageScore = totalScore / scores.length;

const collectTime = files.reduce((a, b) => a + (b.collectDuration || 0), 0);
const testsTime = files.reduce((a, b) => a + (b.result?.duration || 0), 0);
Expand All @@ -149,7 +144,7 @@ export default class EvaliteReporter extends BasicReporter {
const scoreDisplay =
failedTasks.length > 0
? c.red("✖ ") + c.dim(`(${failedTasks.length} failed)`)
: c.bold(scoreColor(averageScore + "%"));
: displayScore(averageScore);

this.ctx.logger.log(
[" ", c.dim("Score"), " ", scoreDisplay].join("")
Expand All @@ -174,6 +169,60 @@ export default class EvaliteReporter extends BasicReporter {
)
);

// super.reportTestSummary(files, errors);
if (evals.length === 1 && evals[0]) {
this.renderTable(
evals[0].meta.evalite!.results.map((result) => ({
input: result.input,
output: result.result,
score: average(result.scores, (s) => s.score),
}))
);
}
}

/**
 * Logs a three-column table (input, output, score) through the reporter's
 * logger, sized to the current terminal width.
 *
 * Objects and arrays in `input`/`output` are rendered as indented JSON
 * rather than relying on default string coercion; primitives are passed
 * through `String()` unchanged.
 *
 * @param props One entry per table row; `score` is a fraction that
 *   `displayScore` turns into a percentage.
 */
private renderTable(
  props: {
    input: unknown;
    output: unknown;
    score: number;
  }[]
) {
  this.ctx.logger.log("");

  // Fall back to 80 columns when stdout reports no width (e.g. not a TTY).
  const availableColumns = process.stdout.columns || 80;

  const scoreWidth = 5;
  // NOTE(review): presumably the characters consumed by the table's borders
  // and cell padding — confirm against the `table` border configuration.
  const columnsWritableWidth = 11;
  const availableInnerSpace =
    availableColumns - columnsWritableWidth - scoreWidth;

  // Clamp to at least one character: a very narrow terminal would otherwise
  // produce a zero or negative width, which is not a usable column width.
  const colWidth = Math.max(1, Math.floor(availableInnerSpace / 2));

  // Turns an arbitrary cell value into readable text.
  const toCellText = (value: unknown): string => {
    if (typeof value !== "object" || value === null) {
      return String(value);
    }
    try {
      // JSON.stringify can yield undefined at runtime (e.g. toJSON returning
      // undefined), hence the fallback.
      return JSON.stringify(value, null, 2) ?? String(value);
    } catch {
      // Circular structures or BigInt members cannot be serialised.
      return String(value);
    }
  };

  this.ctx.logger.log(
    table(
      [
        [c.bold("Input"), c.bold("Output"), c.bold("Score")],
        ...props.map((p) => [
          toCellText(p.input),
          toCellText(p.output),
          displayScore(p.score),
        ]),
      ],
      {
        columns: [
          { width: colWidth, wrapWord: true },
          { width: colWidth, wrapWord: true },
          { width: scoreWidth },
        ],
      }
    )
  );
}
}

/**
 * Formats a fractional score as a bold, colour-coded percentage string.
 *
 * The score is multiplied by 100 and rounded; 80 and above is green,
 * 50 and above is yellow, anything else is red.
 *
 * @param score Score as a fraction (e.g. 0.75 for 75%).
 * @returns The styled percentage text, e.g. "75%".
 */
const displayScore = (score: number) => {
  const percent = Math.round(score * 100);
  const paint = percent >= 80 ? c.green : percent >= 50 ? c.yellow : c.red;
  return c.bold(paint(percent + "%"));
};
18 changes: 18 additions & 0 deletions packages/evalite-vitest/src/tests/basics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,21 @@ it("Should capture a hash of the source code", async () => {

expect(evals.Basics[0].sourceCodeHash.length).toEqual(64);
});

// Runs the "basics" fixture and checks that the captured output contains the
// table headers along with the fixture's input and output values.
it("Should display a table when there is only one eval", async () => {
  using basicsFixture = loadFixture("basics");

  const stdout = captureStdout();

  await runVitest({
    cwd: basicsFixture.dir,
    path: undefined,
    testOutputWritable: stdout.writable,
  });

  const expectedFragments = ["Input", "Output", "Score", "abc", "abcdef"];
  for (const fragment of expectedFragments) {
    expect(stdout.getOutput()).toContain(fragment);
  }
});
7 changes: 0 additions & 7 deletions packages/evalite-vitest/src/tests/fixtures/basics/basics.ts

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { evalite, Levenshtein } from "../../../index.js";

// Fixture: a single eval whose input and expected value are the same
// six-line passage, and whose task echoes its input back unchanged.
evalite("Long Text", {
  data: () => {
    const repeatedLine = `This is a test to see if we can handle long text inputs.`;
    const passage = [
      `Some extremely long text that will test the bounds of our system.`,
      repeatedLine,
      repeatedLine,
      repeatedLine,
      repeatedLine,
      repeatedLine,
    ].join("\n");

    return [{ input: passage, expected: passage }];
  },
  task: (input) => input,
  scorers: [Levenshtein],
});
21 changes: 21 additions & 0 deletions packages/evalite-vitest/src/tests/long-text.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { getJsonDbEvals } from "@evalite/core";
import { assert, expect, it } from "vitest";
import { runVitest } from "../command.js";
import { captureStdout, loadFixture } from "./test-utils.js";

// Runs the "long-text" fixture and checks that the table headers and the
// start of the long input appear in the captured output.
it("Should report long text correctly", async () => {
  using longTextFixture = loadFixture("long-text");

  const stdout = captureStdout();

  await runVitest({
    cwd: longTextFixture.dir,
    path: undefined,
    testOutputWritable: stdout.writable,
  });

  const expectedFragments = [
    "Input",
    "Output",
    "Score",
    "Some extremely long text",
  ];
  for (const fragment of expectedFragments) {
    expect(stdout.getOutput()).toContain(fragment);
  }
});
14 changes: 14 additions & 0 deletions packages/evalite-vitest/src/tests/multi.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,17 @@ it("Should report multiple evals correctly", async () => {
expect(captured.getOutput()).toContain("100% multi-2.eval.ts (1 eval)");
expect(captured.getOutput()).toContain("100% multi-3.eval.ts (2 evals)");
});

// Runs the "multi" fixture (several evals) and checks that the per-result
// table is not rendered.
it("Should not show a table when running multiple evals", async () => {
  using fixture = loadFixture("multi");

  const captured = captureStdout();

  await runVitest({
    cwd: fixture.dir,
    path: undefined,
    testOutputWritable: captured.writable,
  });

  // The previous assertion looked for the placeholder "ONLY ONE EVAL", which
  // the reporter never prints, so it could not fail. Check for the table's
  // header cells instead. "Score" is deliberately not checked because the
  // summary line prints it regardless of whether a table is shown.
  // NOTE(review): assumes no multi fixture prints "Input" or "Output" itself.
  expect(captured.getOutput()).not.toContain("Input");
  expect(captured.getOutput()).not.toContain("Output");
});
36 changes: 36 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a85f7ee

Please sign in to comment.