leave comments for eval performance #975
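# Evals CI workflow: lint, build, and e2e-test the project, then run eval
# categories (gated by PR labels) against Browserbase/Braintrust and post each
# category's score back to the pull request as a comment.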
name: Evals

on:
  pull_request:
    types:
      - opened
      - synchronize
      - labeled
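
# Models and categories exported for the eval harness (`npm run evals`); the
# harness is assumed to read these from the environment, this workflow only
# sets them.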
env:
  EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
  EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"

concurrency:
  group: ${{ github.ref }}
  cancel-in-progress: true

permissions:
  pull-requests: write
  contents: read

jobs:
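  # Decide which eval categories to run. On main everything runs; on PRs each
  # category is opted in by adding the matching label ("act", "observe",
  # "extract", "text-extract") to the pull request.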
  determine-evals:
    runs-on: ubuntu-latest
    outputs:
      run-extract: ${{ steps.check-labels.outputs.run-extract }}
      run-act: ${{ steps.check-labels.outputs.run-act }}
      run-observe: ${{ steps.check-labels.outputs.run-observe }}
      run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
    steps:
      - id: check-labels
        run: |
          # Default to running all tests on main branch
          if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
            echo "Running all tests for main branch"
            echo "run-extract=true" >> $GITHUB_OUTPUT
            echo "run-act=true" >> $GITHUB_OUTPUT
            echo "run-observe=true" >> $GITHUB_OUTPUT
            echo "run-text-extract=true" >> $GITHUB_OUTPUT
            exit 0
          fi
          # Check for specific labels
          echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
          echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
          echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
          echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}" >> $GITHUB_OUTPUT
  run-lint:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Run Lint
        run: npm run lint

  run-build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Run Build
        run: npm run build
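
  # End-to-end Playwright tests run before any evals; the eval jobs are skipped
  # if this job fails.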
  run-e2e-tests:
    needs: [run-lint, run-build]
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps
      - name: Run E2E Tests
        run: npm run e2e
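
  # Combination evals always run (no label gate). Each eval job below parses the
  # eval-summary.json written by `npm run evals`; the jq calls assume a shape
  # roughly like:
  #   { "experimentName": "<braintrust-experiment>", "categories": { "combination": 87.5, ... } }
  # (inferred from how the scores and links are extracted, not verified here).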
  run-combination-evals:
    needs: [run-e2e-tests, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 40
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps
      - name: Run Combination Evals
        run: npm run evals category combination
      - name: Log Combination Evals Performance
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
            combination_score=$(jq '.categories.combination' eval-summary.json)
            echo "Combination category score: $combination_score%"
            # Create comment body
            echo "### 🔄 Combination Eval Results" > comment.md
            echo "Score: ${combination_score}%" >> comment.md
            echo "[View detailed results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName})" >> comment.md
            # Post comment
            gh pr comment ${{ github.event.pull_request.number }} --body-file comment.md
            exit 0
          else
            echo "Eval summary not found for combination category. Failing CI."
            exit 1
          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
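
  # Act evals run only when the PR carries the "act" label; the job fails CI if
  # the act score drops below 80%.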
  run-act-evals:
    needs: [run-e2e-tests, determine-evals, run-combination-evals]
    if: needs.determine-evals.outputs.run-act == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps
      - name: Run Act Evals
        run: npm run evals category act
      - name: Log Act Evals Performance
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            act_score=$(jq '.categories.act' eval-summary.json)
            echo "Act category score: $act_score%"
            # Create comment body
            echo "### 🎯 Act Eval Results" > comment.md
            echo "Score: ${act_score}%" >> comment.md
            echo "[View detailed results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName})" >> comment.md
            # Post comment
            gh pr comment ${{ github.event.pull_request.number }} --body-file comment.md
            if (( $(echo "$act_score < 80" | bc -l) )); then
              echo "Act category score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for act category. Failing CI."
            exit 1
          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
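
  # Extract evals (label "extract") run the extract category twice, once per
  # extraction method, and compare the two; only a domExtract score below 80%
  # fails CI.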
  run-extract-evals:
    needs: [run-e2e-tests, determine-evals, run-combination-evals]
    if: needs.determine-evals.outputs.run-extract == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps
      # 1. Run extract category with domExtract
      - name: Run Extract Evals (domExtract)
        run: npm run evals category extract -- --extract-method=domExtract
      - name: Save Extract Dom Results
        run: mv eval-summary.json eval-summary-extract-dom.json
      # 2. Once domExtract finishes, run extract category with textExtract
      - name: Run Extract Evals (textExtract)
        run: npm run evals category extract -- --extract-method=textExtract
      - name: Save Extract Text Results
        run: mv eval-summary.json eval-summary-extract-text.json
      # 3. Log and compare extract evals performance
      - name: Log and Compare Extract Evals Performance
        run: |
          experimentNameDom=$(jq -r '.experimentName' eval-summary-extract-dom.json)
          dom_score=$(jq '.categories.extract' eval-summary-extract-dom.json)
          echo "DomExtract Extract category score: $dom_score%"
          experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json)
          text_score=$(jq '.categories.extract' eval-summary-extract-text.json)
          echo "TextExtract Extract category score: $text_score%"
          # Create comment body
          echo "### 🔍 Extract Eval Results" > comment.md
          echo "**DomExtract Score:** ${dom_score}%" >> comment.md
          echo "[View domExtract results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom})" >> comment.md
          echo "**TextExtract Score:** ${text_score}%" >> comment.md
          echo "[View textExtract results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText})" >> comment.md
          # Post comment
          gh pr comment ${{ github.event.pull_request.number }} --body-file comment.md
          if (( $(echo "$dom_score < 80" | bc -l) )); then
            echo "DomExtract extract category score is below 80%. Failing CI."
            exit 1
          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
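
  # text_extract evals (label "text-extract") mirror the extract job with the
  # method order reversed; only a textExtract score below 80% fails CI.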
  run-text-extract-evals:
    needs: [run-e2e-tests, determine-evals, run-combination-evals]
    if: needs.determine-evals.outputs.run-text-extract == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 120
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps
      # 1. Run text_extract category with textExtract first
      - name: Run text_extract Evals (textExtract)
        run: npm run evals category text_extract -- --extract-method=textExtract
      - name: Save text_extract Text Results
        run: mv eval-summary.json eval-summary-text_extract-text.json
      # 2. Then run text_extract category with domExtract
      - name: Run text_extract Evals (domExtract)
        run: npm run evals category text_extract -- --extract-method=domExtract
      - name: Save text_extract Dom Results
        run: mv eval-summary.json eval-summary-text_extract-dom.json
      # 3. Log and compare text_extract evals performance
      - name: Log and Compare text_extract Evals Performance
        run: |
          experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
          text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
          experimentNameDom=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
          dom_score=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
          # Create comment body
          echo "### 📝 Text Extract Eval Results" > comment.md
          echo "**TextExtract Score:** ${text_score}%" >> comment.md
          echo "[View textExtract results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText})" >> comment.md
          echo "**DomExtract Score:** ${dom_score}%" >> comment.md
          echo "[View domExtract results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom})" >> comment.md
          # Post comment
          gh pr comment ${{ github.event.pull_request.number }} --body-file comment.md
          if (( $(echo "$text_score < 80" | bc -l) )); then
            echo "textExtract text_extract category score is below 80%. Failing CI."
            exit 1
          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
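
  # Observe evals (label "observe") fail CI when the observe score drops below 80%.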
  run-observe-evals:
    needs: [run-e2e-tests, determine-evals, run-combination-evals]
    if: needs.determine-evals.outputs.run-observe == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install dependencies
        run: npm install --no-frozen-lockfile
      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps
      - name: Run Observe Evals
        run: npm run evals category observe
      - name: Log Observe Evals Performance
        run: |
          if [ -f eval-summary.json ]; then
            experimentName=$(jq -r '.experimentName' eval-summary.json)
            observe_score=$(jq '.categories.observe' eval-summary.json)
            echo "Observe category score: $observe_score%"
            # Create comment body
            echo "### 👀 Observe Eval Results" > comment.md
            echo "Score: ${observe_score}%" >> comment.md
            echo "[View detailed results](https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName})" >> comment.md
            # Post comment
            gh pr comment ${{ github.event.pull_request.number }} --body-file comment.md
            if (( $(echo "$observe_score < 80" | bc -l) )); then
              echo "Observe category score is below 80%. Failing CI."
              exit 1
            fi
          else
            echo "Eval summary not found for observe category. Failing CI."
            exit 1
          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}