diff --git a/.vscode/settings.json b/.vscode/settings.json index b6be645..174324e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,5 +14,5 @@ "^@radix-ui", "^cmdk$" ], - "typescript.preferences.includePackageJsonAutoImports": "on", -} \ No newline at end of file + "typescript.preferences.includePackageJsonAutoImports": "on" +} diff --git a/apps/evalite-ui/app/components/score.tsx b/apps/evalite-ui/app/components/score.tsx index 2b4150f..a25c3f9 100644 --- a/apps/evalite-ui/app/components/score.tsx +++ b/apps/evalite-ui/app/components/score.tsx @@ -46,3 +46,19 @@ export const Score = (props: { ); }; + +export const getScoreState = ( + score: number, + prevScore: number | null | undefined +) => { + const state: ScoreState = + typeof prevScore === "undefined" || prevScore === null + ? "first" + : score > prevScore + ? "up" + : score < prevScore + ? "down" + : "same"; + + return state; +}; diff --git a/apps/evalite-ui/app/root.tsx b/apps/evalite-ui/app/root.tsx index 2948c99..e52d3be 100644 --- a/apps/evalite-ui/app/root.tsx +++ b/apps/evalite-ui/app/root.tsx @@ -35,7 +35,7 @@ import { TestServerStateContext, useSubscribeToTestServer, } from "./use-subscribe-to-socket"; -import { Score, type ScoreState } from "./components/score"; +import { getScoreState, Score, type ScoreState } from "./components/score"; export const links: LinksFunction = () => [ { rel: "preconnect", href: "https://fonts.googleapis.com" }, @@ -79,13 +79,7 @@ export const clientLoader = async () => { const score = mostRecentEval.score; - const state: ScoreState = !secondMostRecentEval - ? "first" - : score > secondMostRecentEval.score - ? "up" - : score < secondMostRecentEval.score - ? "down" - : "same"; + const state = getScoreState(score, secondMostRecentEval?.score); return { name: key, state, diff --git a/apps/evalite-ui/app/routes/eval.$name.tsx b/apps/evalite-ui/app/routes/eval.$name.tsx index bfa3394..309b653 100644 --- a/apps/evalite-ui/app/routes/eval.$name.tsx +++ b/apps/evalite-ui/app/routes/eval.$name.tsx @@ -3,7 +3,7 @@ import type { MetaFunction } from "@remix-run/node"; import { useLoaderData, type ClientLoaderFunctionArgs } from "@remix-run/react"; import { useContext } from "react"; import { InnerPageLayout } from "~/components/page-header"; -import { Score } from "~/components/score"; +import { getScoreState, Score } from "~/components/score"; import { Table, TableBody, @@ -33,9 +33,10 @@ export const clientLoader = async (args: ClientLoaderFunctionArgs) => { }; export default function Page() { - const { name, evaluation } = useLoaderData(); + const { name, evaluation, prevEvaluation } = + useLoaderData(); - const firstResult = evaluation.results[0]!; + const firstResult = evaluation.results[0]; const serverState = useContext(TestServerStateContext); @@ -47,7 +48,7 @@ export default function Page() { Input Output Expected - {firstResult.scores.map((scorer) => ( + {firstResult?.scores.map((scorer) => ( {scorer.name} ))} @@ -60,6 +61,9 @@ export default function Page() { {result.result as any} {result.expected as any} {result.scores.map((scorer) => { + const scoreInPreviousEvaluation = prevEvaluation?.results + .find((r) => r.input === result.input) + ?.scores.find((s) => s.name === scorer.name); return ( ); diff --git a/packages/evalite/src/index.ts b/packages/evalite/src/index.ts index cde1c90..65a9719 100644 --- a/packages/evalite/src/index.ts +++ b/packages/evalite/src/index.ts @@ -41,10 +41,6 @@ export const evalite = ( opts: Evalite.RunnerOpts ) => { return it(testName, async ({ task }) => { - if (opts.scorers.length === 0) { - throw new Error("You must provide at least one scorer."); - } - const traces: Evalite.StoredTrace[] = []; reportTraceLocalStorage.enterWith((trace) => traces.push(trace)); diff --git a/packages/evalite/src/reporter.ts b/packages/evalite/src/reporter.ts index 5175c75..f446664 100644 --- a/packages/evalite/src/reporter.ts +++ b/packages/evalite/src/reporter.ts @@ -46,7 +46,7 @@ export default class EvaliteReporter extends BasicReporter { // super.onTaskUpdate(packs); // } - override onWatcherStart(files: RunnerTestFile[], errors?: unknown[]): void { + override onWatcherStart(files?: RunnerTestFile[], errors?: unknown[]): void { super.onWatcherStart(files, errors); } diff --git a/packages/evalite/src/run-vitest.ts b/packages/evalite/src/run-vitest.ts index 67c321c..793e2ca 100644 --- a/packages/evalite/src/run-vitest.ts +++ b/packages/evalite/src/run-vitest.ts @@ -35,7 +35,7 @@ export const runVitest = async (opts: { }, }), ], - slowTestThreshold: 30_000, + testTimeout: 30_000, }, {}, { diff --git a/packages/example/src/content-generation.eval.ts b/packages/example/src/content-generation.eval.ts new file mode 100644 index 0000000..b60bae1 --- /dev/null +++ b/packages/example/src/content-generation.eval.ts @@ -0,0 +1,48 @@ +import { generateText } from "ai"; +import { createScorer, evalite } from "evalite"; +import { cacheModel } from "./cache-model"; +import { openai } from "@ai-sdk/openai"; +import { createStorage } from "unstorage"; +import fsDriver from "unstorage/drivers/fs"; +import { Humor } from "autoevals"; + +const storage = createStorage({ + driver: (fsDriver as any)({ + base: "./llm-cache.local", + }), +}); + +evalite("Content generation", { + data: async () => { + return [ + { + input: "Write a TypeScript tweet", + }, + { + input: "Write a tweet about TypeScript template literals types.", + }, + ]; + }, + task: async (input) => { + const result = await generateText({ + model: cacheModel(openai("gpt-4o-mini"), storage), + prompt: input, + system: ` + You are a helpful social media assistant. + You will be asked to write a tweet on a given topic. + Return only the tweet. + Do not use emojis. + Do not use hashtags. + Use code examples where required. + `, + }); + + return result.text; + }, + scorers: [ + Humor, + createScorer("No Hashtags", ({ output }) => { + return output.includes("#") ? 0 : 1; + }), + ], +}); diff --git a/packages/example/src/example.eval.ts b/packages/example/src/example.eval.ts index 877678a..7900bc0 100644 --- a/packages/example/src/example.eval.ts +++ b/packages/example/src/example.eval.ts @@ -46,6 +46,10 @@ evalite("Test Capitals", { input: `Name all the capitals of each part of the UK.`, expected: `London, Edinburgh, Cardiff, Belfast`, }, + { + input: `What's the capital of Antarctica?`, + expected: `Antarctica has no capital.`, + }, ], task: async (input) => { const result = await generateText({