Skip to content

allow systemPrompt input #995

allow systemPrompt input

allow systemPrompt input #995

Workflow file for this run

# Workflow-level settings: display name, triggers, shared environment and
# concurrency policy.
name: Evals

# Runs for pull requests when they are opened, receive new commits
# (synchronize), or have a label added (labeled).
on:
  pull_request:
    types:
      - opened
      - synchronize
      - labeled

# Workflow-wide environment, visible to every job.
# NOTE(review): presumably consumed by the `npm run evals` script to pick the
# models and categories to evaluate — confirm against the eval runner.
env:
  EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
  EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"

# Cancel an in-progress run when a newer run starts for the same ref.
# The group key includes the workflow name so that another workflow using a
# ref-only group key cannot cancel (or be cancelled by) this one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
# Decides which label-gated eval jobs should run and exposes one
# "true"/"false" job output per category.
determine-evals:
  runs-on: ubuntu-latest
  outputs:
    run-extract: ${{ steps.check-labels.outputs.run-extract }}
    run-act: ${{ steps.check-labels.outputs.run-act }}
    run-observe: ${{ steps.check-labels.outputs.run-observe }}
    run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
  steps:
    - id: check-labels
      run: |
        # Default to running all tests on main branch.
        # GITHUB_REF is the runner-provided equivalent of github.ref; reading
        # it from the environment avoids splicing expression text into the
        # script.
        # NOTE(review): the triggers visible in this workflow are
        # pull_request only, where the ref is normally refs/pull/<n>/merge —
        # confirm whether this branch is still reachable.
        if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
          echo "Running all tests for main branch"
          echo "run-extract=true" >> "$GITHUB_OUTPUT"
          echo "run-act=true" >> "$GITHUB_OUTPUT"
          echo "run-observe=true" >> "$GITHUB_OUTPUT"
          echo "run-text-extract=true" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Check for specific labels. Each contains() expression is rendered
        # to the literal true or false before the script runs.
        echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> "$GITHUB_OUTPUT"
        echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> "$GITHUB_OUTPUT"
        echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> "$GITHUB_OUTPUT"
        echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}" >> "$GITHUB_OUTPUT"
# Lint gate: checks out the code, installs dependencies on Node 20 and runs
# the project's `lint` npm script.
run-lint:
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    # NOTE(review): --no-frozen-lockfile looks like a pnpm option rather than
    # an npm one — confirm npm accepts it as intended.
    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Run Lint
      run: npm run lint
# Build gate: checks out the code, installs dependencies on Node 20 and runs
# the project's `build` npm script.
run-build:
  runs-on: ubuntu-latest
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    # NOTE(review): --no-frozen-lockfile looks like a pnpm option rather than
    # an npm one — confirm npm accepts it as intended.
    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Run Build
      run: npm run build
# Local end-to-end tests. Starts only after both run-lint and run-build have
# succeeded, and is capped at 50 minutes.
run-e2e-tests:
  needs: [run-lint, run-build]
  runs-on: ubuntu-latest
  timeout-minutes: 50
  # Model-provider keys come from repository secrets.
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    HEADLESS: true
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    # Installs the Playwright browsers together with their OS dependencies.
    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run E2E Tests (Deterministic Playwright)
      run: npm run e2e
# End-to-end tests via the `e2e:bb` npm script, run after the local e2e job
# succeeds and capped at 50 minutes.
run-e2e-bb-tests:
  needs: [run-e2e-tests]
  runs-on: ubuntu-latest
  timeout-minutes: 50
  # Only for push events, or for pull requests whose head repository is this
  # repository.
  # NOTE(review): presumably this skips fork PRs because they do not receive
  # the Browserbase secrets — confirm.
  if: >
    github.event_name == 'push' ||
    (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run E2E Tests (browserbase)
      run: npm run e2e:bb
# Runs the "combination" eval category and logs its score. This job is not
# label-gated and applies no score threshold; it fails only if the eval run
# fails or the summary file is missing.
run-combination-evals:
  needs: [run-e2e-tests, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 40
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Combination Evals
      run: npm run evals category combination

    - name: Log Combination Evals Performance
      run: |
        # Check for the summary before reading it. The step's default shell
        # stops at the first failing command, so a jq call on a missing file
        # would abort before the explicit message below could be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for combination category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        # The score is reported only; no pass/fail threshold for this category.
        combination_score=$(jq '.categories.combination' eval-summary.json)
        echo "Combination category score: $combination_score%"
# Runs the "act" eval category when determine-evals reports run-act as
# 'true', and fails CI if the category score is below 80.
run-act-evals:
  needs: [run-e2e-tests, determine-evals, run-combination-evals]
  if: needs.determine-evals.outputs.run-act == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 25
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Act Evals
      run: npm run evals category act

    - name: Log Act Evals Performance
      run: |
        # Check for the summary before reading it. The step's default shell
        # stops at the first failing command, so a jq call on a missing file
        # would abort before the explicit message below could be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for act category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        act_score=$(jq '.categories.act' eval-summary.json)
        echo "Act category score: $act_score%"

        # bc prints 1 when the comparison holds; (( ... )) turns that into
        # the exit status used by `if`. bc -l allows fractional scores.
        if (( $(echo "$act_score < 80" | bc -l) )); then
          echo "Act category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the "extract" eval category twice — once per extraction method — when
# determine-evals reports run-extract as 'true'. Both scores are logged; only
# the domExtract score is checked against the 80 threshold.
run-extract-evals:
  needs: [run-e2e-tests, determine-evals, run-combination-evals]
  if: needs.determine-evals.outputs.run-extract == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 50
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    # Pass 1: extract category using domExtract.
    - name: Run Extract Evals (domExtract)
      run: npm run evals category extract -- --extract-method=domExtract

    # Rename the summary so the second pass does not overwrite it.
    - name: Save Extract Dom Results
      run: mv eval-summary.json eval-summary-extract-dom.json

    # Pass 2: extract category using textExtract.
    - name: Run Extract Evals (textExtract)
      run: npm run evals category extract -- --extract-method=textExtract

    - name: Save Extract Text Results
      run: mv eval-summary.json eval-summary-extract-text.json

    # Report both passes, then gate on the domExtract score.
    - name: Log and Compare Extract Evals Performance
      run: |
        # domExtract pass
        dom_experiment=$(jq -r '.experimentName' eval-summary-extract-dom.json)
        dom_pct=$(jq '.categories.extract' eval-summary-extract-dom.json)
        echo "DomExtract Extract category score: ${dom_pct}%"
        echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${dom_experiment}"

        # textExtract pass (logged for comparison only)
        text_experiment=$(jq -r '.experimentName' eval-summary-extract-text.json)
        text_pct=$(jq '.categories.extract' eval-summary-extract-text.json)
        echo "TextExtract Extract category score: ${text_pct}%"
        echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${text_experiment}"

        # Fail CI when the domExtract score is under 80.
        if (( $(bc -l <<< "${dom_pct} < 80") )); then
          echo "DomExtract extract category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the "text_extract" eval category twice — textExtract first, then
# domExtract — when determine-evals reports run-text-extract as 'true'. Both
# scores are logged; only the textExtract score is checked against the 80
# threshold.
run-text-extract-evals:
  needs: [run-e2e-tests, determine-evals, run-combination-evals]
  if: needs.determine-evals.outputs.run-text-extract == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 120
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    # Pass 1: text_extract category using textExtract.
    - name: Run text_extract Evals (textExtract)
      run: npm run evals category text_extract -- --extract-method=textExtract

    # Rename the summary so the second pass does not overwrite it.
    - name: Save text_extract Text Results
      run: mv eval-summary.json eval-summary-text_extract-text.json

    # Pass 2: text_extract category using domExtract.
    - name: Run text_extract Evals (domExtract)
      run: npm run evals category text_extract -- --extract-method=domExtract

    - name: Save text_extract Dom Results
      run: mv eval-summary.json eval-summary-text_extract-dom.json

    # Report both passes, then gate on the textExtract score.
    - name: Log and Compare text_extract Evals Performance
      run: |
        # textExtract pass
        text_experiment=$(jq -r '.experimentName' eval-summary-text_extract-text.json)
        text_pct=$(jq '.categories.text_extract' eval-summary-text_extract-text.json)
        echo "TextExtract text_extract category score: ${text_pct}%"
        echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${text_experiment}"

        # domExtract pass (logged for comparison only)
        dom_experiment=$(jq -r '.experimentName' eval-summary-text_extract-dom.json)
        dom_pct=$(jq '.categories.text_extract' eval-summary-text_extract-dom.json)
        echo "DomExtract text_extract category score: ${dom_pct}%"
        echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${dom_experiment}"

        # Fail CI when the textExtract score is under 80.
        if (( $(bc -l <<< "${text_pct} < 80") )); then
          echo "textExtract text_extract category score is below 80%. Failing CI."
          exit 1
        fi
# Runs the "observe" eval category when determine-evals reports run-observe
# as 'true', and fails CI if the category score is below 80.
run-observe-evals:
  needs: [run-e2e-tests, determine-evals, run-combination-evals]
  if: needs.determine-evals.outputs.run-observe == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 25
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      run: npm install --no-frozen-lockfile

    - name: Install Playwright browsers
      run: npm exec playwright install --with-deps

    - name: Run Observe Evals
      run: npm run evals category observe

    - name: Log Observe Evals Performance
      run: |
        # Check for the summary before reading it. The step's default shell
        # stops at the first failing command, so a jq call on a missing file
        # would abort before the explicit message below could be printed.
        if [ ! -f eval-summary.json ]; then
          echo "Eval summary not found for observe category. Failing CI."
          exit 1
        fi

        experimentName=$(jq -r '.experimentName' eval-summary.json)
        echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

        observe_score=$(jq '.categories.observe' eval-summary.json)
        echo "Observe category score: $observe_score%"

        # bc prints 1 when the comparison holds; (( ... )) turns that into
        # the exit status used by `if`. bc -l allows fractional scores.
        if (( $(echo "$observe_score < 80" | bc -l) )); then
          echo "Observe category score is below 80%. Failing CI."
          exit 1
        fi