test: add gpt-4o-mini to smoke test github workflow

- Add gpt-4o-mini to smoke test github workflow
- Add gpt-4o-mini smoke test golden files
- Remove outdated gpt-4o model and 4-turbo smoke test config and golden files
- Add golden files for gpt-4o-2024-08-06
- Regenerate golden files for existing models to drop callProgress events (we weren't comparing these anyway)

Signed-off-by: Nick Hale <[email protected]>
gptscript-ai · Oct 14, 2024 · eefe829 · eefe829
1 parent 9e3893c
commit eefe829
Show file tree

Hide file tree

Showing 15 changed files with 2,609 additions and 12,337 deletions.
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
@@ -59,7 +59,7 @@ jobs:
 
           echo "run_smoke_tests=false" >> $GITHUB_OUTPUT
 
-  gpt-4o-2024-05-13:
+  gpt-4o-2024-08-06:
     needs: check-label
     if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
     runs-on: ubuntu-22.04
@@ -81,14 +81,14 @@ jobs:
           go-version: "1.21"
       - env:
           OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
-          GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-05-13
-        name: Run smoke test for gpt-4o-2024-05-13
+          GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-08-06
+        name: Run smoke test for gpt-4o-2024-08-06
         run: |
-          echo "Running smoke test for model gpt-4o-2024-05-13"
+          echo "Running smoke test for model gpt-4o-2024-08-06"
           export PATH="$(pwd)/bin:${PATH}"
           make smoke
 
-  gpt-4-turbo-2024-04-09:
+  gpt-4o-mini-2024-07-18:
     needs: check-label
     if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}
     runs-on: ubuntu-22.04
@@ -110,10 +110,10 @@ jobs:
           go-version: "1.21"
       - env:
           OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}
-          GPTSCRIPT_DEFAULT_MODEL: gpt-4-turbo-2024-04-09
-        name: Run smoke test for gpt-4-turbo-2024-04-09
+          GPTSCRIPT_DEFAULT_MODEL: gpt-4o-mini-2024-07-18
+        name: Run smoke test for gpt-4o-mini-2024-07-18
         run: |
-          echo "Running smoke test for model gpt-4-turbo-2024-04-09"
+          echo "Running smoke test for model gpt-4o-mini-2024-07-18"
           export PATH="$(pwd)/bin:${PATH}"
           make smoke
 

diff --git a/pkg/tests/judge/judge.go b/pkg/tests/judge/judge.go
@@ -86,10 +86,10 @@ func New[T any](client *openai.Client) (*Judge[T], error) {
 }
 
 func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
-	comparisonJSON, err := json.MarshalIndent(&comparison[T]{
+	comparisonJSON, err := json.Marshal(&comparison[T]{
 		Expected: expected,
 		Actual:   actual,
-	}, "", "    ")
+	})
 	if err != nil {
 		return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
 	}

diff --git a/pkg/tests/smoke/smoke_test.go b/pkg/tests/smoke/smoke_test.go
@@ -175,6 +175,11 @@ func getActualEvents(t *testing.T, eventsFile string) []event {
 
 		var e event
 		require.NoError(t, json.Unmarshal([]byte(line), &e))
+
+		if e.Type == runner.EventTypeCallProgress {
+			continue
+		}
+
 		events = append(events, e)
 	}
 

diff --git a/pkg/tests/smoke/testdata/Bob/claude-3-5-sonnet-20240620-expected.json b/pkg/tests/smoke/testdata/Bob/claude-3-5-sonnet-20240620-expected.json