Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support multi-chunk extracts + prompt updates #89

Merged
merged 47 commits into from
Oct 4, 2024
Merged
Changes from 1 commit
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
9683e03
add simple google search eval
navidkpr Sep 22, 2024
85ceeef
add 2 more evals
navidkpr Sep 22, 2024
c2c6e3a
make sure extract continues to use the same model on repeated call
navidkpr Sep 22, 2024
7805bc1
add twitter sign up eval case
navidkpr Sep 22, 2024
0113aaf
update eval
navidkpr Sep 22, 2024
8a2afc5
Merge remote-tracking branch 'origin' into npour/first-eval
navidkpr Sep 24, 2024
eb3a864
add basic banalayzer eval system
navidkpr Sep 24, 2024
5fbafd4
add server
navidkpr Sep 24, 2024
9a5a571
update package jsons
navidkpr Sep 24, 2024
0319827
clean up the files
navidkpr Sep 24, 2024
a6930a0
clean up
navidkpr Sep 24, 2024
bdb6d29
fix the bananalyzer eval system + add it to the main eval script
navidkpr Sep 30, 2024
e278483
remove all public files on server exit
navidkpr Sep 30, 2024
eccb0e8
fix the package.json playwright issue
navidkpr Sep 30, 2024
9c5e731
clean up logs
navidkpr Sep 30, 2024
073e605
remove .vscode
navidkpr Sep 30, 2024
d7ccb0e
cleanup
navidkpr Sep 30, 2024
bedb996
Merge remote-tracking branch 'origin' into npour/more-evals
navidkpr Sep 30, 2024
fcdbf49
move the test evals to the playground script
navidkpr Sep 30, 2024
ddf3cf7
cleanup
navidkpr Sep 30, 2024
2aeac41
cleanup
navidkpr Sep 30, 2024
85a00b0
add server/public to gitignore
navidkpr Sep 30, 2024
f154ca2
test -> playround (much better name)
navidkpr Sep 30, 2024
884d942
fix the resource deletion issue
navidkpr Sep 30, 2024
0d5289d
update readme + cleanup
navidkpr Sep 30, 2024
2f54d16
cleanup of readme
navidkpr Sep 30, 2024
82b309e
remove the changes in teh lib folder
navidkpr Sep 30, 2024
042acd7
cleanup readme
navidkpr Sep 30, 2024
2c6b8cf
cleanup
navidkpr Sep 30, 2024
5701f76
cleanup
navidkpr Sep 30, 2024
d097939
update readme
navidkpr Sep 30, 2024
33e3f2a
make top element look if the element is top in multiple points in the…
navidkpr Oct 1, 2024
c3e20a5
Merge remote-tracking branch 'origin' into npour/fix-bananalyzer-1
navidkpr Oct 1, 2024
f0be458
switch to my repo (so we can edit the examples when they don't make s…
navidkpr Oct 2, 2024
1757233
fix bug: now we properly support multi-chunk extracts
navidkpr Oct 3, 2024
b9cd0e6
add more information into the eval outputs
navidkpr Oct 3, 2024
d7decc5
Merge remote-tracking branch 'origin' into npour/fix-bananalyzer-3
navidkpr Oct 3, 2024
3d9edcb
Merge branch 'npour/more-info-in-eval' into npour/fix-bananalyzer-3
navidkpr Oct 3, 2024
a87a8f9
fix issues with bananalyzer 2 + stabalize github test cases
navidkpr Oct 3, 2024
a244604
Merge remote-tracking branch 'origin' into npour/fix-bananalyzer-3
navidkpr Oct 3, 2024
61c9be7
add homedepot task case to evals
navidkpr Oct 3, 2024
3332fd6
update error output
navidkpr Oct 3, 2024
ade328c
fix more eval cases
navidkpr Oct 3, 2024
4d28507
cleanup
navidkpr Oct 3, 2024
00a23dc
emulate a full browser better
navidkpr Oct 3, 2024
594596e
use true home depot eval
pkiv Oct 4, 2024
072d499
Merge branch 'main' into npour/fix-bananalyzer-3
pkiv Oct 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add more information into the eval outputs
  • Loading branch information
navidkpr committed Oct 3, 2024
commit b9cd0e62ab8270b5372cf7af8c861ec73b367085
34 changes: 28 additions & 6 deletions evals/bananalyzer-ts/index.ts
Original file line number Diff line number Diff line change
@@ -47,7 +47,7 @@ export async function evaluateExample(
launchServer: true,
serverPort: 6778,
},
): Promise<boolean> {
): Promise<any> {
await new Promise((resolve) => setTimeout(resolve, 2000));

const examples = JSON.parse(
@@ -162,15 +162,25 @@ export async function evaluateExample(
if (evalItem.expected) {
if (!validateJsonMatch(evalItem.expected, extractionResult)) {
console.log("❌ JSON match failed");
return false;
return {
_success: false,
case: "json_mismatch_1",
expected: evalItem.expected,
actual: extractionResult,
};
}
} else if (evalItem.options) {
const matchesAny = evalItem.options.some((option) =>
validateJsonMatch(option, extractionResult),
);
if (!matchesAny) {
console.log("❌ No JSON match found in options");
return false;
return {
_success: false,
case: "json_mismatch_2",
expected: evalItem.expected,
actual: extractionResult,
};
}
}
} else if (
@@ -181,16 +191,28 @@ export async function evaluateExample(
!validateEndUrlMatch(evalItem.expected, await stagehand.page.url())
) {
console.log("❌ URL match failed");
return false;
return {
_success: false,
case: "url_mismatch",
expected: evalItem.expected,
actual: await stagehand.page.url(),
};
}
}
}

console.log("✅ All evaluations passed");
return true;
return {
_success: true,
expected: extractionResult,
actual: extractionResult,
};
} catch (error) {
console.error("Error during evaluation:", error);
return false;
return {
_success: false,
error: error,
};
} finally {
try {
const deleteResponse = await fetch(
71 changes: 54 additions & 17 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
@@ -41,7 +41,11 @@ const vanta = async () => {

await stagehand.context.close();

return observationResult == expectedResult;
return {
_success: observationResult == expectedResult,
expected: expectedResult,
actual: observationResult,
};
};

const vanta_h = async () => {
@@ -59,7 +63,10 @@ const vanta_h = async () => {
await stagehand.context.close();

// we should have no saved observation since the element shouldn't exist
return observation === null;
return {
_success: observation === null,
observation,
};
};

const simple_google_search = async () => {
@@ -80,7 +87,10 @@ const simple_google_search = async () => {

await stagehand.context.close();

return currentUrl.startsWith(expectedUrl);
return {
_success: currentUrl.startsWith(expectedUrl),
currentUrl,
};
};

const peeler_simple = async () => {
@@ -101,7 +111,9 @@ const peeler_simple = async () => {
const isVisible = await successMessageLocator.isVisible();

await stagehand.context.close();
return isVisible;
return {
_success: isVisible,
};
};

const peeler_complex = async () => {
@@ -130,7 +142,10 @@ const peeler_complex = async () => {

await stagehand.context.close();

return price !== null;
return {
_success: price !== null,
price,
};
};

const extract_collaborators_from_github_repository = async () => {
@@ -164,7 +179,10 @@ const extract_collaborators_from_github_repository = async () => {

console.log("Extracted collaborators:", contributors);
await stagehand.context.close();
return contributors.length === 20;
return {
_success: contributors.length === 20,
contributors,
};
} catch (error) {
console.error("Error or timeout occurred:", error);
await stagehand.context.close();
@@ -201,7 +219,10 @@ const extract_last_twenty_github_commits = async () => {

console.log("Extracted commits:", commits);
await stagehand.context.close();
return commits.length === 20;
return {
_success: commits.length === 20,
commits,
};
} catch (error) {
console.error("Error or timeout occurred:", error);
await stagehand.context.close();
@@ -226,7 +247,11 @@ const wikipedia = async () => {
const currentUrl = await stagehand.page.url();
await stagehand.context.close();

return currentUrl === url;
return {
_success: currentUrl === url,
expected: url,
actual: currentUrl,
};
};

const costar = async () => {
@@ -268,10 +293,10 @@ const costar = async () => {

await stagehand.context.close();

return isTitleValid;
return { title: articleTitle.title, _success: isTitleValid };
} catch (error) {
console.error(`Error in costar function: ${error.message}`);
return { title: null };
return { title: null, _success: false } as any;
} finally {
await stagehand.context.close();
}
@@ -349,7 +374,7 @@ const google_jobs = async () => {

console.log("Job Details valid:", isJobDetailsValid);

return isJobDetailsValid;
return { _success: isJobDetailsValid, jobDetails };
};

const tasks = {
@@ -365,12 +390,20 @@ const tasks = {
google_jobs,
};

const exactMatch = (args: { input; output; expected? }) => {
const exactMatch = (args: { input: any; output: any; expected?: any }) => {
console.log(`Task "${args.input.name}" returned: ${args.output}`);

const expected = args.expected ?? true;
if (expected === true) {
return {
name: "Exact match",
score: args.output === true || args.output?._success == true,
};
}

return {
name: "Exact match",
score: args.output === true || args.output?.success == true,
score: args.output === expected,
};
};

@@ -395,9 +428,13 @@ const testcases = [
},
{ input: { name: "peeler_complex" } },
{ input: { name: "simple_google_search" } },
{ input: { name: "extract_collaborators_from_github_repository" } },
{
input: {
name: "extract_collaborators_from_github_repository",
},
},
{ input: { name: "extract_last_twenty_github_commits" } },
// { input: { name: "costar" } },
// { input: { name: "costar", expected: true } },
{ input: { name: "google_jobs" } },
...chosenBananalyzerEvals.map((evalItem: any) => ({
input: {
@@ -416,7 +453,7 @@ Eval("stagehand", {
data: () => {
return testcases;
},
task: async (input) => {
task: async (input: any) => {
// console.log("input", input);
try {
if ("source" in input && input.source === "bananalyzer-ts") {
@@ -440,7 +477,7 @@ Eval("stagehand", {
return result;
} else {
// Handle predefined tasks
const result = await tasks[input.name](input);
const result = await (tasks as any)[input.name](input);
if (result) {
console.log(`✅ ${input.name}: Passed`);
} else {