Showed a table when there's only one eval running

mattpocock · Dec 2, 2024 · a85f7ee · a85f7ee
1 parent a5263dc
commit a85f7ee
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 28 deletions.
diff --git a/packages/evalite-vitest/package.json b/packages/evalite-vitest/package.json
@@ -19,6 +19,7 @@
     "./reporter": "./dist/reporter.js"
   },
   "dependencies": {
+    "table": "^6.8.2",
     "commander": "^12.1.0",
     "js-levenshtein": "^1.1.6",
     "tinyrainbow": "^1.2.0",

diff --git a/packages/evalite-vitest/src/reporter.ts b/packages/evalite-vitest/src/reporter.ts
@@ -3,13 +3,16 @@ import { BasicReporter } from "vitest/reporters";
 
 import { appendToJsonDb, DEFAULT_SERVER_PORT } from "@evalite/core";
 import c from "tinyrainbow";
+import { average, sum } from "./utils.js";
+import { table } from "table";
 
 export interface EvaliteReporterOptions {
   jsonDbLocation: string;
 }
 
 export default class EvaliteReporter extends BasicReporter {
   private opts: EvaliteReporterOptions;
+
   // private server: Server;
   constructor(opts: EvaliteReporterOptions) {
     super();
@@ -19,7 +22,6 @@ export default class EvaliteReporter extends BasicReporter {
     //   jsonDbLocation: "./evalite-report.jsonl",
     // });
   }
-
   override onInit(ctx: any): void {
     this.ctx = ctx;
     this.start = performance.now();
@@ -96,12 +98,9 @@ export default class EvaliteReporter extends BasicReporter {
     }
 
     const totalScore = scores.reduce((a, b) => a + b, 0);
-    const averageScore = Math.round((totalScore / scores.length) * 100);
-
-    const color =
-      averageScore >= 80 ? c.green : averageScore >= 50 ? c.yellow : c.red;
+    const averageScore = totalScore / scores.length;
 
-    const title = failed ? c.red("✖") : c.bold(color(averageScore + "%"));
+    const title = failed ? c.red("✖") : displayScore(averageScore);
 
     const toLog = [
       ` ${title} `,
@@ -119,22 +118,18 @@ export default class EvaliteReporter extends BasicReporter {
   }
 
   override reportTestSummary(files: RunnerTestFile[], errors: unknown[]): void {
-    const scores = files.flatMap((file) =>
-      file.tasks.flatMap((task) => {
-        if (task.meta.evalite) {
-          return task.meta.evalite.results.flatMap((r) =>
-            r.scores.map((s) => s.score)
-          );
-        }
-        return [];
-      })
+    // this.printErrorsSummary(errors); // TODO
+
+    const evals = files.flatMap((file) =>
+      file.tasks.filter((task) => task.meta.evalite)
     );
 
-    const totalScore = scores.reduce((a, b) => a + b, 0);
-    const averageScore = Math.round((totalScore / scores.length) * 100);
+    const scores = evals.flatMap((task) =>
+      task.meta.evalite!.results.flatMap((r) => r.scores.map((s) => s.score))
+    );
 
-    const scoreColor =
-      averageScore >= 80 ? c.green : averageScore >= 50 ? c.yellow : c.red;
+    const totalScore = sum(scores, (score) => score);
+    const averageScore = totalScore / scores.length;
 
     const collectTime = files.reduce((a, b) => a + (b.collectDuration || 0), 0);
     const testsTime = files.reduce((a, b) => a + (b.result?.duration || 0), 0);
@@ -149,7 +144,7 @@ export default class EvaliteReporter extends BasicReporter {
     const scoreDisplay =
       failedTasks.length > 0
         ? c.red("✖ ") + c.dim(`(${failedTasks.length} failed)`)
-        : c.bold(scoreColor(averageScore + "%"));
+        : displayScore(averageScore);
 
     this.ctx.logger.log(
       ["      ", c.dim("Score"), "  ", scoreDisplay].join("")
@@ -174,6 +169,60 @@ export default class EvaliteReporter extends BasicReporter {
       )
     );
 
-    // super.reportTestSummary(files, errors);
+    if (evals.length === 1 && evals[0]) {
+      this.renderTable(
+        evals[0].meta.evalite!.results.map((result) => ({
+          input: result.input,
+          output: result.result,
+          score: average(result.scores, (s) => s.score),
+        }))
+      );
+    }
+  }
+
+  /**
+   * Logs a blank line followed by a three-column table (Input, Output, Score)
+   * with one row per entry in `props`.
+   *
+   * The Input and Output columns share whatever width remains after the score
+   * column and the table chrome are subtracted from `process.stdout.columns`
+   * (80 is used when that value is unset or 0).
+   *
+   * @param props - Rows to render. `input` and `output` may be any value;
+   *   objects are shown as indented JSON, everything else via `String()`.
+   *   `score` is formatted by `displayScore`.
+   */
+  private renderTable(
+    props: {
+      input: unknown;
+      output: unknown;
+      score: number;
+    }[]
+  ) {
+    this.ctx.logger.log("");
+
+    const availableColumns = process.stdout.columns || 80;
+
+    const scoreWidth = 5;
+    // Width reserved for non-content characters — presumably the borders and
+    // cell padding of a three-column table; confirm against `table` defaults.
+    const columnsWritableWidth = 11;
+    const availableInnerSpace =
+      availableColumns - columnsWritableWidth - scoreWidth;
+
+    // Clamp to 1: on a very narrow terminal the subtraction goes to zero or
+    // below, and `table` requires a positive integer column width.
+    const colWidth = Math.max(1, Math.floor(availableInnerSpace / 2));
+
+    // `table` stringifies cells with String(), which turns objects into
+    // "[object Object]". Render objects as JSON so they stay readable.
+    const toCell = (value: unknown): string => {
+      if (typeof value !== "object" || value === null) {
+        return String(value);
+      }
+      try {
+        return JSON.stringify(value, null, 2) ?? String(value);
+      } catch {
+        // JSON.stringify throws on circular structures and BigInt members.
+        return String(value);
+      }
+    };
+
+    this.ctx.logger.log(
+      table(
+        [
+          [c.bold("Input"), c.bold("Output"), c.bold("Score")],
+          ...props.map((p) => [
+            toCell(p.input),
+            toCell(p.output),
+            displayScore(p.score),
+          ]),
+        ],
+        {
+          columns: [
+            { width: colWidth, wrapWord: true },
+            { width: colWidth, wrapWord: true },
+            { width: scoreWidth },
+          ],
+        }
+      )
+    );
   }
 }
+
+/**
+ * Formats a score as a bold, colour-coded percentage string.
+ *
+ * The score is multiplied by 100 and rounded to the nearest integer, so it is
+ * presumably a fraction in the 0–1 range (confirm against the scorers).
+ * Green is used at 80% and above, yellow from 50% up to 79%, red otherwise.
+ *
+ * @param score - The score to format.
+ * @returns The percentage followed by "%", wrapped in tinyrainbow styling.
+ */
+const displayScore = (score: number) => {
+  const percent = Math.round(score * 100);
+  const paint = percent >= 80 ? c.green : percent >= 50 ? c.yellow : c.red;
+
+  return c.bold(paint(percent + "%"));
+};
diff --git a/packages/evalite-vitest/src/tests/basics.test.ts b/packages/evalite-vitest/src/tests/basics.test.ts
@@ -90,3 +90,21 @@ it("Should capture a hash of the source code", async () => {
 
   expect(evals.Basics[0].sourceCodeHash.length).toEqual(64);
 });
+
+// Runs the "basics" fixture and checks that the captured output contains the
+// table's column headers plus the strings "abc" and "abcdef" (presumably the
+// fixture's input and output values — confirm against the fixture).
+it("Should display a table when there is only one eval", async () => {
+  using fixture = loadFixture("basics");
+  const captured = captureStdout();
+
+  await runVitest({
+    testOutputWritable: captured.writable,
+    path: undefined,
+    cwd: fixture.dir,
+  });
+
+  const output = captured.getOutput();
+  const expectedFragments = ["Input", "Output", "Score", "abc", "abcdef"];
+
+  for (const fragment of expectedFragments) {
+    expect(output).toContain(fragment);
+  }
+});
diff --git a/packages/evalite-vitest/src/tests/fixtures/basics/basics.ts b/packages/evalite-vitest/src/tests/fixtures/basics/basics.ts
diff --git a/packages/evalite-vitest/src/tests/fixtures/long-text/long-text.eval.ts b/packages/evalite-vitest/src/tests/fixtures/long-text/long-text.eval.ts
@@ -0,0 +1,30 @@
+import { evalite, Levenshtein } from "../../../index.js";
+
+// Fixture with a single multi-line case: one opening sentence followed by the
+// same sentence five times, joined by newlines. The expected text is identical
+// to the input and the task echoes its input unchanged.
+const openingLine = `Some extremely long text that will test the bounds of our system.`;
+const repeatedLine = `This is a test to see if we can handle long text inputs.`;
+
+const buildLongText = () =>
+  [openingLine, ...Array.from({ length: 5 }, () => repeatedLine)].join("\n");
+
+evalite("Long Text", {
+  data: () => [
+    {
+      input: buildLongText(),
+      expected: buildLongText(),
+    },
+  ],
+  task: (input) => input,
+  scorers: [Levenshtein],
+});
diff --git a/packages/evalite-vitest/src/tests/long-text.test.ts b/packages/evalite-vitest/src/tests/long-text.test.ts
@@ -0,0 +1,21 @@
+import { getJsonDbEvals } from "@evalite/core";
+import { assert, expect, it } from "vitest";
+import { runVitest } from "../command.js";
+import { captureStdout, loadFixture } from "./test-utils.js";
+
+// Runs the "long-text" fixture and checks that the captured output contains
+// the table's column headers and the start of the long input text.
+it("Should report long text correctly", async () => {
+  using fixture = loadFixture("long-text");
+  const captured = captureStdout();
+
+  await runVitest({
+    testOutputWritable: captured.writable,
+    path: undefined,
+    cwd: fixture.dir,
+  });
+
+  const output = captured.getOutput();
+  const expectedFragments = [
+    "Input",
+    "Output",
+    "Score",
+    "Some extremely long text",
+  ];
+
+  for (const fragment of expectedFragments) {
+    expect(output).toContain(fragment);
+  }
+});
diff --git a/packages/evalite-vitest/src/tests/multi.test.ts b/packages/evalite-vitest/src/tests/multi.test.ts
@@ -21,3 +21,17 @@ it("Should report multiple evals correctly", async () => {
   expect(captured.getOutput()).toContain("100% multi-2.eval.ts  (1 eval)");
   expect(captured.getOutput()).toContain("100% multi-3.eval.ts  (2 evals)");
 });
+
+// Runs the "multi" fixture (several evals) and checks that the single-eval
+// results table is NOT printed.
+it("Should not show a table when running multiple evals", async () => {
+  using fixture = loadFixture("multi");
+
+  const captured = captureStdout();
+
+  await runVitest({
+    cwd: fixture.dir,
+    path: undefined,
+    testOutputWritable: captured.writable,
+  });
+
+  const output = captured.getOutput();
+
+  // Kept from the original test. NOTE(review): nothing in the reporter code
+  // under review prints this marker, so on its own this assertion cannot fail.
+  expect(output).not.toContain("ONLY ONE EVAL");
+
+  // The table header row is what actually distinguishes the table from the
+  // summary. "Score" is not checked because the summary prints it too.
+  // Assumes the multi fixture's own names/data contain neither word — verify.
+  expect(output).not.toContain("Input");
+  expect(output).not.toContain("Output");
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml