@@ -3,6 +3,7 @@ package extproc
3
3
import (
4
4
"encoding/json"
5
5
"strconv"
6
+ "strings"
6
7
"time"
7
8
8
9
core "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
@@ -17,6 +18,9 @@ import (
17
18
func (r * OpenAIRouter ) handleResponseHeaders (v * ext_proc.ProcessingRequest_ResponseHeaders , ctx * RequestContext ) (* ext_proc.ProcessingResponse , error ) {
18
19
// Detect upstream HTTP status and record non-2xx as errors
19
20
if v != nil && v .ResponseHeaders != nil && v .ResponseHeaders .Headers != nil {
21
+ // Determine if the response is streaming based on Content-Type
22
+ ctx .IsStreamingResponse = isStreamingContentType (v .ResponseHeaders .Headers )
23
+
20
24
if statusCode := getStatusFromHeaders (v .ResponseHeaders .Headers ); statusCode != 0 {
21
25
if statusCode >= 500 {
22
26
metrics .RecordRequestError (getModelFromCtx (ctx ), "upstream_5xx" )
@@ -26,8 +30,10 @@ func (r *OpenAIRouter) handleResponseHeaders(v *ext_proc.ProcessingRequest_Respo
26
30
}
27
31
}
28
32
29
- // Best-effort TTFT measurement: record on first response headers if we have a start time and model
30
- if ctx != nil && ! ctx .TTFTRecorded && ! ctx .ProcessingStartTime .IsZero () && ctx .RequestModel != "" {
33
+ // Best-effort TTFT measurement:
34
+ // - For non-streaming responses, record on first response headers (approx TTFB ~= TTFT)
35
+ // - For streaming responses (SSE), defer TTFT until the first response body chunk arrives
36
+ if ctx != nil && ! ctx .IsStreamingResponse && ! ctx .TTFTRecorded && ! ctx .ProcessingStartTime .IsZero () && ctx .RequestModel != "" {
31
37
ttft := time .Since (ctx .ProcessingStartTime ).Seconds ()
32
38
if ttft > 0 {
33
39
metrics .RecordModelTTFT (ctx .RequestModel , ttft )
@@ -79,13 +85,58 @@ func getModelFromCtx(ctx *RequestContext) string {
79
85
return ctx .RequestModel
80
86
}
81
87
88
+ // isStreamingContentType checks if the response content-type indicates streaming (SSE)
89
+ func isStreamingContentType (headerMap * core.HeaderMap ) bool {
90
+ if headerMap == nil {
91
+ return false
92
+ }
93
+ for _ , hv := range headerMap .Headers {
94
+ if strings .ToLower (hv .Key ) == "content-type" {
95
+ val := hv .Value
96
+ if val == "" && len (hv .RawValue ) > 0 {
97
+ val = string (hv .RawValue )
98
+ }
99
+ if strings .Contains (strings .ToLower (val ), "text/event-stream" ) {
100
+ return true
101
+ }
102
+ }
103
+ }
104
+ return false
105
+ }
106
+
82
107
// handleResponseBody processes the response body
83
108
func (r * OpenAIRouter ) handleResponseBody (v * ext_proc.ProcessingRequest_ResponseBody , ctx * RequestContext ) (* ext_proc.ProcessingResponse , error ) {
84
109
completionLatency := time .Since (ctx .StartTime )
85
110
86
111
// Process the response for caching
87
112
responseBody := v .ResponseBody .Body
88
113
114
+ // If this is a streaming response (e.g., SSE), record TTFT on the first body chunk
115
+ // and skip JSON parsing/caching which are not applicable for SSE chunks.
116
+ if ctx .IsStreamingResponse {
117
+ if ctx != nil && ! ctx .TTFTRecorded && ! ctx .ProcessingStartTime .IsZero () && ctx .RequestModel != "" {
118
+ ttft := time .Since (ctx .ProcessingStartTime ).Seconds ()
119
+ if ttft > 0 {
120
+ metrics .RecordModelTTFT (ctx .RequestModel , ttft )
121
+ ctx .TTFTSeconds = ttft
122
+ ctx .TTFTRecorded = true
123
+ observability .Infof ("Recorded TTFT on first streamed body chunk: %.3fs" , ttft )
124
+ }
125
+ }
126
+
127
+ // For streaming chunks, just continue (no token parsing or cache update)
128
+ response := & ext_proc.ProcessingResponse {
129
+ Response : & ext_proc.ProcessingResponse_ResponseBody {
130
+ ResponseBody : & ext_proc.BodyResponse {
131
+ Response : & ext_proc.CommonResponse {
132
+ Status : ext_proc .CommonResponse_CONTINUE ,
133
+ },
134
+ },
135
+ },
136
+ }
137
+ return response , nil
138
+ }
139
+
89
140
// Parse tokens from the response JSON using OpenAI SDK types
90
141
var parsed openai.ChatCompletion
91
142
if err := json .Unmarshal (responseBody , & parsed ); err != nil {
0 commit comments