/**
 * One text segment of a multi-part prompt message.
 * `type` is the literal `"text"`; no other part kinds are declared.
 */
export type TracePromptTextContent = {
  text: string;
  type: "text";
};

/**
 * A single prompt message recorded on a trace.
 * `content` is either a plain string or a list of text segments.
 */
export type TracePrompt = {
  content: string | TracePromptTextContent[];
  role: string;
};

/**
 * A record of one model call made while an eval task was running.
 *
 * Traces are collected on the eval result (`traces: Trace[]`) and written to
 * the JSON DB alongside each run.
 */
export interface Trace {
  /** Messages that were sent to the model. */
  prompt: TracePrompt[];
  /** Text the model produced. */
  output: string;
  /** Token counts reported for the call. */
  usage: {
    completionTokens: number;
    promptTokens: number;
  };
  /** Timestamp at which the call began (supplied by the reporter). */
  start: number;
  /** Timestamp at which the call finished (supplied by the reporter). */
  end: number;
  /** Elapsed time of the call; presumably `end - start` — reporters supply it explicitly. */
  duration: number;
}
/**
 * Hands a trace to the eval that is currently running.
 *
 * The receiving callback is looked up in `reportTraceLocalStorage`; `evalite`
 * installs one (via `enterWith`) that appends each trace to the eval's own
 * `traces` array.
 *
 * @param trace - The trace to record.
 * @throws Error when no callback is present in the async-local store, i.e.
 * the call did not originate from inside an evalite eval.
 */
export const reportTrace = (trace: Evalite.Trace) => {
  const sink = reportTraceLocalStorage.getStore();

  if (sink) {
    sink(trace);
    return;
  }

  throw new Error(
    "An error occurred: reportTrace must be called inside an evalite eval"
  );
};
// End-to-end check: run the `traces` fixture through vitest, then read the
// JSON DB back and confirm the trace passed to reportTrace was persisted on
// the first result of the first "Traces" eval.
it("Should report traces from reportTrace", async () => {
  // Mirrors the object the fixture's task passes to reportTrace.
  const expectedTrace = {
    prompt: [
      {
        role: "input",
        content: "abc",
      },
    ],
    output: "abcdef",
    usage: {
      promptTokens: 1,
      completionTokens: 1,
    },
    start: 0,
    end: 100,
    duration: 100,
  };

  using fixture = loadFixture("traces");

  const stdout = captureStdout();

  await runVitest({
    cwd: fixture.dir,
    path: undefined,
    testOutputWritable: stdout.writable,
  });

  const storedEvals = await getJsonDbEvals({
    dbLocation: fixture.jsonDbLocation,
  });

  expect(storedEvals.Traces![0]).toMatchObject({
    results: [{ traces: [expectedTrace] }],
  });
});
/** Prompt shape the wrapped model receives in its `doGenerate` options. */
type ModelPrompt = Parameters<LanguageModelV1["doGenerate"]>[0]["prompt"];

/** Prompt shape that `reportTrace` stores on a trace. */
type TracedPrompt = Parameters<typeof reportTrace>[0]["prompt"];

/**
 * Converts an AI SDK prompt into the text-only form kept on a trace.
 *
 * Messages whose content is not an array are copied through with their
 * content unchanged; array content is mapped part by part.
 *
 * @throws Error when any content part has a `type` other than `"text"`.
 */
const toTracePrompt = (prompt: ModelPrompt): TracedPrompt =>
  prompt.map((message) => {
    if (!Array.isArray(message.content)) {
      return {
        role: message.role,
        content: message.content,
      };
    }

    const content = message.content.map((part) => {
      if (part.type !== "text") {
        throw new Error(
          `Unsupported content type: ${part.type}. Only text is currently supported.`
        );
      }

      return {
        type: "text" as const,
        text: part.text,
      };
    });

    return {
      role: message.role,
      content,
    };
  });

/**
 * Wraps an AI SDK language model so that every `doGenerate` call is reported
 * as a trace through `reportTrace`.
 *
 * Only the generate path is wrapped (`wrapGenerate`); no other middleware
 * hook is installed here.
 *
 * @param model - The language model to wrap.
 * @returns The model returned by `experimental_wrapLanguageModel`.
 * @throws Error from the wrapped generate call when the prompt contains a
 * non-text content part. `reportTrace` also throws when called outside an
 * evalite eval.
 */
export const traceAISDKModel = (model: LanguageModelV1) => {
  return experimental_wrapLanguageModel({
    model,
    middleware: {
      wrapGenerate: async ({ doGenerate, params }) => {
        // Convert (and thereby validate) the prompt BEFORE calling the model.
        // An unsupported content part then fails fast instead of throwing
        // after the generation has already run and its result is discarded.
        // It also keeps the conversion out of the timed window below.
        const prompt = toTracePrompt(params.prompt);

        const start = performance.now();
        const generated = await doGenerate();
        const end = performance.now();

        reportTrace({
          output: generated.text ?? "",
          prompt,
          // Copy only the two declared fields so nothing else on the
          // provider's usage object is carried into the stored trace.
          usage: {
            promptTokens: generated.usage.promptTokens,
            completionTokens: generated.usage.completionTokens,
          },
          duration: end - start,
          start,
          end,
        });

        return generated;
      },
    },
  });
};
resolution: {integrity: sha512-dQkfBDs2lTYpKM8389oopPdQgIU007GQyCbuPPrV+K6MtSII3HBfE0stUIMXUb44L+LK1t6GXPP7wjSzjO6uKg==} engines: {node: '>=18'} + '@ai-sdk/provider@1.0.1': + resolution: {integrity: sha512-mV+3iNDkzUsZ0pR2jG0sVzU6xtQY5DtSCBy3JFycLp6PwjyLw/iodfL3MwdmMCRJWgs3dadcHejRnMvF9nGTBg==} + engines: {node: '>=18'} + '@ai-sdk/react@0.0.70': resolution: {integrity: sha512-GnwbtjW4/4z7MleLiW+TOZC2M29eCg1tOUpuEiYFMmFNZK8mkrqM0PFZMo6UsYeUYMWqEOOcPOU9OQVJMJh7IQ==} engines: {node: '>=18'} @@ -202,6 +221,18 @@ packages: zod: optional: true + '@ai-sdk/react@1.0.3': + resolution: {integrity: sha512-Mak7qIRlbgtP4I7EFoNKRIQTlABJHhgwrN8SV2WKKdmsfWK2RwcubQWz1hp88cQ0bpF6KxxjSY1UUnS/S9oR5g==} + engines: {node: '>=18'} + peerDependencies: + react: ^18 || ^19 || ^19.0.0-rc + zod: ^3.0.0 + peerDependenciesMeta: + react: + optional: true + zod: + optional: true + '@ai-sdk/solid@0.0.54': resolution: {integrity: sha512-96KWTVK+opdFeRubqrgaJXoNiDP89gNxFRWUp0PJOotZW816AbhUf4EnDjBjXTLjXL1n0h8tGSE9sZsRkj9wQQ==} engines: {node: '>=18'} @@ -229,6 +260,15 @@ packages: zod: optional: true + '@ai-sdk/ui-utils@1.0.2': + resolution: {integrity: sha512-hHrUdeThGHu/rsGZBWQ9PjrAU9Htxgbo9MFyR5B/aWoNbBeXn1HLMY1+uMEnXL5pRPlmyVRjgIavWg7UgeNDOw==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.0.0 + peerDependenciesMeta: + zod: + optional: true + '@ai-sdk/vue@0.0.59': resolution: {integrity: sha512-+ofYlnqdc8c4F6tM0IKF0+7NagZRAiqBJpGDJ+6EYhDW8FHLUP/JFBgu32SjxSxC6IKFZxEnl68ZoP/Z38EMlw==} engines: {node: '>=18'} @@ -1481,6 +1521,18 @@ packages: zod: optional: true + ai@4.0.10: + resolution: {integrity: sha512-40GaEGLbp7if1F50zp3Kr03vcqyGS8svyJWpbkgec7G5Ik2rEtnbDWiUoOJuAVqgP5/iy4NgZQfvX3jRmOyQrw==} + engines: {node: '>=18'} + peerDependencies: + react: ^18 || ^19 || ^19.0.0-rc + zod: ^3.0.0 + peerDependenciesMeta: + react: + optional: true + zod: + optional: true + ajv-formats@3.0.1: resolution: {integrity: 
sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==} peerDependencies: @@ -2287,6 +2339,10 @@ packages: resolution: {integrity: sha512-v0eOBUbiaFojBu2s2NPBfYUoRR9GjcDNvCXVaqEf5vVfpIAh9f8RCo4vXTP8c63QRKCFwoLpMpTdPwwhEKVgzA==} engines: {node: '>=14.18'} + eventsource-parser@3.0.0: + resolution: {integrity: sha512-T1C0XCUimhxVQzW4zFipdx0SficT651NnkR0ZSH3yQwh+mFMdLfgjABVi4YtMTtaL4s168593DaoaRLMqryavA==} + engines: {node: '>=18.0.0'} + execa@5.1.1: resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==} engines: {node: '>=10'} @@ -4774,10 +4830,23 @@ snapshots: optionalDependencies: zod: 3.23.8 + '@ai-sdk/provider-utils@2.0.2(zod@3.23.8)': + dependencies: + '@ai-sdk/provider': 1.0.1 + eventsource-parser: 3.0.0 + nanoid: 3.3.7 + secure-json-parse: 2.7.0 + optionalDependencies: + zod: 3.23.8 + '@ai-sdk/provider@0.0.26': dependencies: json-schema: 0.4.0 + '@ai-sdk/provider@1.0.1': + dependencies: + json-schema: 0.4.0 + '@ai-sdk/react@0.0.70(react@18.3.1)(zod@3.23.8)': dependencies: '@ai-sdk/provider-utils': 1.0.22(zod@3.23.8) @@ -4788,6 +4857,16 @@ snapshots: react: 18.3.1 zod: 3.23.8 + '@ai-sdk/react@1.0.3(react@18.3.1)(zod@3.23.8)': + dependencies: + '@ai-sdk/provider-utils': 2.0.2(zod@3.23.8) + '@ai-sdk/ui-utils': 1.0.2(zod@3.23.8) + swr: 2.2.5(react@18.3.1) + throttleit: 2.1.0 + optionalDependencies: + react: 18.3.1 + zod: 3.23.8 + '@ai-sdk/solid@0.0.54(zod@3.23.8)': dependencies: '@ai-sdk/provider-utils': 1.0.22(zod@3.23.8) @@ -4815,6 +4894,14 @@ snapshots: optionalDependencies: zod: 3.23.8 + '@ai-sdk/ui-utils@1.0.2(zod@3.23.8)': + dependencies: + '@ai-sdk/provider': 1.0.1 + '@ai-sdk/provider-utils': 2.0.2(zod@3.23.8) + zod-to-json-schema: 3.23.5(zod@3.23.8) + optionalDependencies: + zod: 3.23.8 + '@ai-sdk/vue@0.0.59(vue@3.5.12(typescript@5.6.2))(zod@3.23.8)': dependencies: '@ai-sdk/provider-utils': 1.0.22(zod@3.23.8) @@ -6209,6 +6296,19 @@ snapshots: - 
solid-js - vue + ai@4.0.10(react@18.3.1)(zod@3.23.8): + dependencies: + '@ai-sdk/provider': 1.0.1 + '@ai-sdk/provider-utils': 2.0.2(zod@3.23.8) + '@ai-sdk/react': 1.0.3(react@18.3.1)(zod@3.23.8) + '@ai-sdk/ui-utils': 1.0.2(zod@3.23.8) + '@opentelemetry/api': 1.9.0 + jsondiffpatch: 0.6.0 + zod-to-json-schema: 3.23.5(zod@3.23.8) + optionalDependencies: + react: 18.3.1 + zod: 3.23.8 + ajv-formats@3.0.1(ajv@8.17.1): optionalDependencies: ajv: 8.17.1 @@ -7196,6 +7296,8 @@ snapshots: eventsource-parser@1.1.2: {} + eventsource-parser@3.0.0: {} + execa@5.1.1: dependencies: cross-spawn: 7.0.3