gptscript-ai · njhale · Aug 6, 2024 · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
@@ -59,6 +59,35 @@ jobs:
 
           echo "run_smoke_tests=false" >> $GITHUB_OUTPUT
 
+  gpt-4o-2024-08-06:  # Smoke-test job that runs `make smoke` with the default model set to gpt-4o-2024-08-06.
+    needs: check-label
+    if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}  # Skipped unless check-label reported run_smoke_tests == 'true'.
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout base repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Checkout PR code if running for a PR
+        if: ${{ github.event_name == 'pull_request_target' }}  # NOTE(review): PR head code is checked out here and `make smoke` later runs with OPENAI_API_KEY set; confirm the check-label gate is enough for fork PRs.
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          repository: ${{ github.event.pull_request.head.repo.full_name }}  # Head repository of the PR (may differ from the base repository).
+          ref: ${{ github.event.pull_request.head.ref }}  # Head ref of the PR.
+      - uses: actions/setup-go@v5
+        with:
+          cache: false  # setup-go caching is turned off for this job.
+          go-version: "1.21"
+      - env:
+          OPENAI_API_KEY: ${{ secrets.SMOKE_OPENAI_API_KEY }}  # Taken from the SMOKE_OPENAI_API_KEY repository secret.
+          GPTSCRIPT_DEFAULT_MODEL: gpt-4o-2024-08-06  # Presumably selects the model the smoke tests run against; verify against the test harness.
+        name: Run smoke test for gpt-4o-2024-08-06
+        run: |
+          echo "Running smoke test for model gpt-4o-2024-08-06"
+          export PATH="$(pwd)/bin:${PATH}"
+          make smoke
+
   gpt-4o-2024-05-13:
     needs: check-label
     if: ${{ needs.check-label.outputs.run_smoke_tests == 'true' }}

diff --git a/pkg/tests/judge/judge.go b/pkg/tests/judge/judge.go
@@ -40,6 +40,8 @@ After making a determination, respond with a JSON object that conforms to the fo
   ]
 }
 
+If you determine actual and expected are not equivalent, include a diff of the parts of actual and expected that are not equivalent in the reasoning field of your response.
+
 Your responses are concise and include only the json object described above.
 `
 
@@ -84,10 +86,10 @@ func New[T any](client *openai.Client) (*Judge[T], error) {
 }
 
 func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
-	comparisonJSON, err := json.MarshalIndent(&comparison[T]{
+	comparisonJSON, err := json.Marshal(&comparison[T]{
 		Expected: expected,
 		Actual:   actual,
-	}, "", "    ")
+	})
 	if err != nil {
 		return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
 	}

diff --git a/pkg/tests/smoke/smoke_test.go b/pkg/tests/smoke/smoke_test.go
@@ -82,8 +82,8 @@ func TestSmoke(t *testing.T) {
 				expectedEvents,
 				actualEvents,
 				`
-- disregard differences in timestamps, generated IDs, natural language verbiage, and event order
-- omit callProgress events from the comparison
+- disregard differences in event order, timestamps, generated IDs, and natural language verbiage, grammar, and punctuation
+- compare events with matching event types
 - the overall stream of events and set of tools called should roughly match
 - arguments passed in tool calls should be roughly the same
 - the final callFinish event should be semantically similar
@@ -175,6 +175,11 @@ func getActualEvents(t *testing.T, eventsFile string) []event {
 
 		var e event
 		require.NoError(t, json.Unmarshal([]byte(line), &e))
+
+		if e.Type == runner.EventTypeCallProgress {
+			continue
+		}
+
 		events = append(events, e)
 	}