Skip to content

Commit

Permalink
Merge branch 'npour/more-info-in-eval' into npour/fix-bananalyzer-3
Browse files Browse the repository at this point in the history
  • Loading branch information
navidkpr committed Oct 3, 2024
2 parents d7decc5 + b9cd0e6 commit 3d9edcb
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 23 deletions.
34 changes: 28 additions & 6 deletions evals/bananalyzer-ts/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export async function evaluateExample(
launchServer: true,
serverPort: 6778,
},
): Promise<boolean> {
): Promise<any> {
await new Promise((resolve) => setTimeout(resolve, 2000));

const examples = JSON.parse(
Expand Down Expand Up @@ -162,15 +162,25 @@ export async function evaluateExample(
if (evalItem.expected) {
if (!validateJsonMatch(evalItem.expected, extractionResult)) {
console.log("❌ JSON match failed");
return false;
return {
_success: false,
case: "json_mismatch_1",
expected: evalItem.expected,
actual: extractionResult,
};
}
} else if (evalItem.options) {
const matchesAny = evalItem.options.some((option) =>
validateJsonMatch(option, extractionResult),
);
if (!matchesAny) {
console.log("❌ No JSON match found in options");
return false;
return {
_success: false,
case: "json_mismatch_2",
expected: evalItem.expected,
actual: extractionResult,
};
}
}
} else if (
Expand All @@ -181,16 +191,28 @@ export async function evaluateExample(
!validateEndUrlMatch(evalItem.expected, await stagehand.page.url())
) {
console.log("❌ URL match failed");
return false;
return {
_success: false,
case: "url_mismatch",
expected: evalItem.expected,
actual: await stagehand.page.url(),
};
}
}
}

console.log("✅ All evaluations passed");
return true;
return {
_success: true,
expected: extractionResult,
actual: extractionResult,
};
} catch (error) {
console.error("Error during evaluation:", error);
return false;
return {
_success: false,
error: error,
};
} finally {
try {
const deleteResponse = await fetch(
Expand Down
71 changes: 54 additions & 17 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@ const vanta = async () => {

await stagehand.context.close();

return observationResult == expectedResult;
return {
_success: observationResult == expectedResult,
expected: expectedResult,
actual: observationResult,
};
};

const vanta_h = async () => {
Expand All @@ -59,7 +63,10 @@ const vanta_h = async () => {
await stagehand.context.close();

// we should have no saved observation since the element shouldn't exist
return observation === null;
return {
_success: observation === null,
observation,
};
};

const simple_google_search = async () => {
Expand All @@ -80,7 +87,10 @@ const simple_google_search = async () => {

await stagehand.context.close();

return currentUrl.startsWith(expectedUrl);
return {
_success: currentUrl.startsWith(expectedUrl),
currentUrl,
};
};

const peeler_simple = async () => {
Expand All @@ -101,7 +111,9 @@ const peeler_simple = async () => {
const isVisible = await successMessageLocator.isVisible();

await stagehand.context.close();
return isVisible;
return {
_success: isVisible,
};
};

const peeler_complex = async () => {
Expand Down Expand Up @@ -130,7 +142,10 @@ const peeler_complex = async () => {

await stagehand.context.close();

return price !== null;
return {
_success: price !== null,
price,
};
};

const extract_collaborators_from_github_repository = async () => {
Expand Down Expand Up @@ -164,7 +179,10 @@ const extract_collaborators_from_github_repository = async () => {

console.log("Extracted collaborators:", contributors);
await stagehand.context.close();
return contributors.length === 20;
return {
_success: contributors.length === 20,
contributors,
};
} catch (error) {
console.error("Error or timeout occurred:", error);
await stagehand.context.close();
Expand Down Expand Up @@ -201,7 +219,10 @@ const extract_last_twenty_github_commits = async () => {

console.log("Extracted commits:", commits);
await stagehand.context.close();
return commits.length === 20;
return {
_success: commits.length === 20,
commits,
};
} catch (error) {
console.error("Error or timeout occurred:", error);
await stagehand.context.close();
Expand All @@ -226,7 +247,11 @@ const wikipedia = async () => {
const currentUrl = await stagehand.page.url();
await stagehand.context.close();

return currentUrl === url;
return {
_success: currentUrl === url,
expected: url,
actual: currentUrl,
};
};

const costar = async () => {
Expand Down Expand Up @@ -268,10 +293,10 @@ const costar = async () => {

await stagehand.context.close();

return isTitleValid;
return { title: articleTitle.title, _success: isTitleValid };
} catch (error) {
console.error(`Error in costar function: ${error.message}`);
return { title: null };
return { title: null, _success: false } as any;
} finally {
await stagehand.context.close();
}
Expand Down Expand Up @@ -349,7 +374,7 @@ const google_jobs = async () => {

console.log("Job Details valid:", isJobDetailsValid);

return isJobDetailsValid;
return { _success: isJobDetailsValid, jobDetails };
};

const tasks = {
Expand All @@ -365,12 +390,20 @@ const tasks = {
google_jobs,
};

const exactMatch = (args: { input; output; expected? }) => {
const exactMatch = (args: { input: any; output: any; expected?: any }) => {
console.log(`Task "${args.input.name}" returned: ${args.output}`);

const expected = args.expected ?? true;
if (expected === true) {
return {
name: "Exact match",
score: args.output === true || args.output?._success == true,
};
}

return {
name: "Exact match",
score: args.output === true || args.output?.success == true,
score: args.output === expected,
};
};

Expand All @@ -395,9 +428,13 @@ const testcases = [
},
{ input: { name: "peeler_complex" } },
{ input: { name: "simple_google_search" } },
{ input: { name: "extract_collaborators_from_github_repository" } },
{
input: {
name: "extract_collaborators_from_github_repository",
},
},
{ input: { name: "extract_last_twenty_github_commits" } },
// { input: { name: "costar" } },
// { input: { name: "costar", expected: true } },
{ input: { name: "google_jobs" } },
...chosenBananalyzerEvals.map((evalItem: any) => ({
input: {
Expand All @@ -416,7 +453,7 @@ Eval("stagehand", {
data: () => {
return testcases;
},
task: async (input) => {
task: async (input: any) => {
// console.log("input", input);
try {
if ("source" in input && input.source === "bananalyzer-ts") {
Expand All @@ -440,7 +477,7 @@ Eval("stagehand", {
return result;
} else {
// Handle predefined tasks
const result = await tasks[input.name](input);
const result = await (tasks as any)[input.name](input);
if (result) {
console.log(`✅ ${input.name}: Passed`);
} else {
Expand Down

0 comments on commit 3d9edcb

Please sign in to comment.