From b9cd0e62ab8270b5372cf7af8c861ec73b367085 Mon Sep 17 00:00:00 2001 From: Navid Pour Date: Thu, 3 Oct 2024 12:43:49 -0700 Subject: [PATCH] add more information into the eval outputs --- evals/bananalyzer-ts/index.ts | 34 ++++++++++++++--- evals/index.eval.ts | 71 ++++++++++++++++++++++++++--------- 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/evals/bananalyzer-ts/index.ts b/evals/bananalyzer-ts/index.ts index 0d335f93..b23f4c7b 100644 --- a/evals/bananalyzer-ts/index.ts +++ b/evals/bananalyzer-ts/index.ts @@ -47,7 +47,7 @@ export async function evaluateExample( launchServer: true, serverPort: 6778, }, -): Promise { +): Promise { await new Promise((resolve) => setTimeout(resolve, 2000)); const examples = JSON.parse( @@ -162,7 +162,12 @@ export async function evaluateExample( if (evalItem.expected) { if (!validateJsonMatch(evalItem.expected, extractionResult)) { console.log("❌ JSON match failed"); - return false; + return { + _success: false, + case: "json_mismatch_1", + expected: evalItem.expected, + actual: extractionResult, + }; } } else if (evalItem.options) { const matchesAny = evalItem.options.some((option) => @@ -170,7 +175,12 @@ export async function evaluateExample( ); if (!matchesAny) { console.log("❌ No JSON match found in options"); - return false; + return { + _success: false, + case: "json_mismatch_2", + expected: evalItem.expected, + actual: extractionResult, + }; } } } else if ( @@ -181,16 +191,28 @@ export async function evaluateExample( !validateEndUrlMatch(evalItem.expected, await stagehand.page.url()) ) { console.log("❌ URL match failed"); - return false; + return { + _success: false, + case: "url_mismatch", + expected: evalItem.expected, + actual: await stagehand.page.url(), + }; } } } console.log("✅ All evaluations passed"); - return true; + return { + _success: true, + expected: extractionResult, + actual: extractionResult, + }; } catch (error) { console.error("Error during evaluation:", error); - return false; + return { + _success: false, + error: error, + }; } finally { try { const deleteResponse = await fetch( diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 33e6d597..199d362c 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -41,7 +41,11 @@ const vanta = async () => { await stagehand.context.close(); - return observationResult == expectedResult; + return { + _success: observationResult == expectedResult, + expected: expectedResult, + actual: observationResult, + }; }; const vanta_h = async () => { @@ -59,7 +63,10 @@ const vanta_h = async () => { await stagehand.context.close(); // we should have no saved observation since the element shouldn't exist - return observation === null; + return { + _success: observation === null, + observation, + }; }; const simple_google_search = async () => { @@ -80,7 +87,10 @@ const simple_google_search = async () => { await stagehand.context.close(); - return currentUrl.startsWith(expectedUrl); + return { + _success: currentUrl.startsWith(expectedUrl), + currentUrl, + }; }; const peeler_simple = async () => { @@ -101,7 +111,9 @@ const peeler_simple = async () => { const isVisible = await successMessageLocator.isVisible(); await stagehand.context.close(); - return isVisible; + return { + _success: isVisible, + }; }; const peeler_complex = async () => { @@ -130,7 +142,10 @@ const peeler_complex = async () => { await stagehand.context.close(); - return price !== null; + return { + _success: price !== null, + price, + }; }; const extract_collaborators_from_github_repository = async () => { @@ -164,7 +179,10 @@ const extract_collaborators_from_github_repository = async () => { console.log("Extracted collaborators:", contributors); await stagehand.context.close(); - return contributors.length === 20; + return { + _success: contributors.length === 20, + contributors, + }; } catch (error) { console.error("Error or timeout occurred:", error); await stagehand.context.close(); @@ -201,7 +219,10 @@ const extract_last_twenty_github_commits = async () => { console.log("Extracted commits:", commits); await stagehand.context.close(); - return commits.length === 20; + return { + _success: commits.length === 20, + commits, + }; } catch (error) { console.error("Error or timeout occurred:", error); await stagehand.context.close(); @@ -226,7 +247,11 @@ const wikipedia = async () => { const currentUrl = await stagehand.page.url(); await stagehand.context.close(); - return currentUrl === url; + return { + _success: currentUrl === url, + expected: url, + actual: currentUrl, + }; }; const costar = async () => { @@ -268,10 +293,10 @@ const costar = async () => { await stagehand.context.close(); - return isTitleValid; + return { title: articleTitle.title, _success: isTitleValid }; } catch (error) { console.error(`Error in costar function: ${error.message}`); - return { title: null }; + return { title: null, _success: false } as any; } finally { await stagehand.context.close(); } @@ -349,7 +374,7 @@ const google_jobs = async () => { console.log("Job Details valid:", isJobDetailsValid); - return isJobDetailsValid; + return { _success: isJobDetailsValid, jobDetails }; }; const tasks = { @@ -365,12 +390,20 @@ const tasks = { google_jobs, }; -const exactMatch = (args: { input; output; expected? }) => { +const exactMatch = (args: { input: any; output: any; expected?: any }) => { console.log(`Task "${args.input.name}" returned: ${args.output}`); + const expected = args.expected ?? true; + if (expected === true) { + return { + name: "Exact match", + score: args.output === true || args.output?._success == true, + }; + } + return { name: "Exact match", - score: args.output === true || args.output?.success == true, + score: args.output === expected, }; }; @@ -395,9 +428,13 @@ const testcases = [ }, { input: { name: "peeler_complex" } }, { input: { name: "simple_google_search" } }, - { input: { name: "extract_collaborators_from_github_repository" } }, + { + input: { + name: "extract_collaborators_from_github_repository", + }, + }, { input: { name: "extract_last_twenty_github_commits" } }, - // { input: { name: "costar" } }, + // { input: { name: "costar", expected: true } }, { input: { name: "google_jobs" } }, ...chosenBananalyzerEvals.map((evalItem: any) => ({ input: { @@ -416,7 +453,7 @@ Eval("stagehand", { data: () => { return testcases; }, - task: async (input) => { + task: async (input: any) => { // console.log("input", input); try { if ("source" in input && input.source === "bananalyzer-ts") { @@ -440,7 +477,7 @@ Eval("stagehand", { return result; } else { // Handle predefined tasks - const result = await tasks[input.name](input); + const result = await (tasks as any)[input.name](input); if (result) { console.log(`✅ ${input.name}: Passed`); } else {