
Commit 78d7de9

Support multiple outputs in chunk and response of chat service (#39)
# Pull Request Title

## Checklist

- [x] I have read both the [CONTRIBUTING.md](CONTRIBUTING.md) and [Contributor License Agreement](CLA.md) documents.
- [x] I have created an issue or feature request and received approval from xAI maintainers. (Minor changes like fixing typos can skip this step.)
- [x] I have tested my changes locally and they pass all CI checks.
- [x] I have added necessary documentation or updated existing documentation.

## Description

Provide a clear and concise description of the changes in this PR. Explain the purpose, the problem it solves, and any relevant context.

## Related Issue

If applicable, link to the related feature request or bug report issue (e.g., #123). If none, state "N/A".

## Type of Change

- [ ] Bug fix
- [x] New feature
- [ ] Documentation update
- [ ] Other (please specify)

## Additional Notes

Add any other information or context that might be helpful for reviewers.
1 parent fffa16d commit 78d7de9
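Before the diff, a minimal sketch of the consumer-visible effect. Only the `chat.create(...)`/`append`/`sample` usage, the `user` and `web_search` helpers, and the model name are taken from the tests added in this commit; the import paths and client construction are assumptions:

```python
import asyncio

# Assumed import locations; the commit's diff only shows how these names
# are used in the tests, not where they are imported from.
from xai_sdk import AsyncClient
from xai_sdk.chat import user
from xai_sdk.tools import web_search

async def main() -> None:
    client = AsyncClient()  # assumed to read the API key from the environment
    chat = client.chat.create("grok-4-fast", tools=[web_search()])
    chat.append(user("What is the weather in London?"))

    response = await chat.sample()
    # With a server-side tool configured, the SDK now constructs the Response
    # with index=None, so these accessors aggregate over all assistant outputs.
    print(response.content)     # text of the final assistant output
    print(response.tool_calls)  # tool calls gathered from assistant outputs

asyncio.run(main())
```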

6 files changed: +280 −37 lines changed

src/xai_sdk/aio/chat.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -106,8 +106,9 @@ async def sample(self) -> Response:
             kind=SpanKind.CLIENT,
             attributes=self._make_span_request_attributes(),
         ) as span:
+            index = None if self._uses_server_side_tools() else 0
             response_pb = await self._stub.GetCompletion(self._make_request(1))
-            response = Response(response_pb, 0)
+            response = Response(response_pb, index)
             span.set_attributes(self._make_span_response_attributes([response]))
             return response

@@ -180,7 +181,8 @@ async def stream(self) -> AsyncIterator[tuple[Response, Chunk]]:
             kind=SpanKind.CLIENT,
             attributes=self._make_span_request_attributes(),
         ) as span:
-            response = Response(chat_pb2.GetChatCompletionResponse(outputs=[chat_pb2.CompletionOutput()]), 0)
+            index = None if self._uses_server_side_tools() else 0
+            response = Response(chat_pb2.GetChatCompletionResponse(outputs=[chat_pb2.CompletionOutput()]), index)
             stream = self._stub.GetCompletionChunk(self._make_request(1))

             async for chunk in stream:
@@ -191,7 +193,7 @@ async def stream(self) -> AsyncIterator[tuple[Response, Chunk]]:
                     first_chunk_received = True

                 response.process_chunk(chunk)
-                chunk_obj = Chunk(chunk, 0)
+                chunk_obj = Chunk(chunk, index)
                 yield response, chunk_obj

             span.set_attributes(self._make_span_response_attributes([response]))
```
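The sync client below (src/xai_sdk/sync/chat.py) receives the identical change: whenever a configured tool is not a plain function tool, `Response` and `Chunk` are constructed with `index=None` and aggregate across outputs; otherwise the previous single-output behavior at index 0 is kept.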

src/xai_sdk/chat.py

Lines changed: 47 additions & 25 deletions

```diff
@@ -288,7 +288,7 @@ def append(self, message: Union[chat_pb2.Message, "Response"]) -> Self:
         elif isinstance(message, Response):
             self._proto.messages.append(
                 chat_pb2.Message(
-                    role=message._choice.message.role,
+                    role=message._get_output().message.role,
                     content=[text(message.content)],
                     tool_calls=message.tool_calls,
                 )
@@ -472,6 +472,10 @@ def _get_span_completion_attributes(self, responses: Sequence["Response"]) -> di

         return completion_attributes

+    def _uses_server_side_tools(self) -> bool:
+        """Returns True if a server-side tool is used in the request."""
+        return any(tool.WhichOneof("tool") != "function" for tool in self._proto.tools)
+
     @property
     def messages(self) -> Sequence[chat_pb2.Message]:
         """Returns the messages in the conversation."""
@@ -682,22 +686,26 @@ def _format_type_to_proto(format_type: ResponseFormat) -> chat_pb2.FormatType:
 class Chunk(ProtoDecorator[chat_pb2.GetChatCompletionChunk]):
     """Adds convenience functions to the chunk proto."""

-    _index: int
+    _index: int | None

-    def __init__(self, proto: chat_pb2.GetChatCompletionChunk, index: int):
+    def __init__(self, proto: chat_pb2.GetChatCompletionChunk, index: int | None):
         """Creates a new decorator instance.

         Args:
             proto: Chunk proto to wrap.
-            index: Index of the response to track.
+            index: Index of the response to track. If set to None, the chunk will expose all assistant outputs.
         """
         super().__init__(proto)
         self._index = index

     @property
     def choices(self) -> Sequence["ChoiceChunk"]:
         """Returns the choices belonging to this index."""
-        return [ChoiceChunk(c) for c in self.proto.outputs if c.index == self._index]
+        return [
+            ChoiceChunk(c)
+            for c in self.proto.outputs
+            if c.delta.role == chat_pb2.MessageRole.ROLE_ASSISTANT and (c.index == self._index or self._index is None)
+        ]

     @property
     def output(self) -> str:
@@ -777,6 +785,14 @@ def process_chunk(self, chunk: chat_pb2.GetChatCompletionChunk):
         self._proto.system_fingerprint = chunk.system_fingerprint
         self._proto.citations.extend(chunk.citations)

+        # Make sure all chunk outputs have corresponding response outputs.
+        if chunk.outputs:
+            max_index = max(c.index for c in chunk.outputs)
+            if max_index >= len(self._proto.outputs):
+                self._proto.outputs.extend(
+                    [chat_pb2.CompletionOutput() for _ in range(max_index + 1 - len(self._proto.outputs))]
+                )
+
         for c in chunk.outputs:
             choice = self._proto.outputs[c.index]
             choice.index = c.index
```
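The padding step above guards the `for c in chunk.outputs` loop that follows it: a chunk may reference output indices the aggregated response has not allocated yet. A stand-in sketch with plain lists (the real field is a repeated `chat_pb2.CompletionOutput`, which supports `len` and `extend` the same way):

```python
# Stand-in for self._proto.outputs (a repeated proto field in the SDK).
outputs: list[dict] = []

# A chunk may reference a not-yet-seen output index, e.g. a tool output
# arriving at index 2 while the response so far holds no outputs at all.
chunk_output_indices = [2]

max_index = max(chunk_output_indices)
if max_index >= len(outputs):
    # Grow the list with empty placeholders up to max_index inclusive.
    outputs.extend({} for _ in range(max_index + 1 - len(outputs)))

assert len(outputs) == 3  # indices 0..2 are now safely addressable
```

The remaining hunks of src/xai_sdk/chat.py rework the `Response` class: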
```diff
@@ -792,29 +808,30 @@ class Response(_ResponseProtoDecorator):

     # A single request can produce multiple responses. This index is used to retrieve the content of
     # a single answer from the response proto.
-    _index: int
-    # Cache to the answer indexed by this response.
-    _choice: chat_pb2.CompletionOutput
+    _index: int | None

-    def __init__(self, response: chat_pb2.GetChatCompletionResponse, index: int) -> None:
+    def __init__(self, response: chat_pb2.GetChatCompletionResponse, index: int | None) -> None:
         """Initializes a new instance of the `Response` class.

         Args:
             response: The response proto, which can hold multiple answers.
             index: The index of the answer this class exposes via its convenience methods.
+                If set to None, the response will expose all answers; content and reasoning
+                content are taken from the final assistant output.
         """
         super().__init__(response)
         self._index = index

-        # Find and cache the answer identified by the index.
-        choices = [c for c in response.outputs if c.index == index]
-
-        if not choices:
-            raise ValueError(f"Invalid response proto or index. {response:} {index:}")
-        elif len(choices) > 1:
-            raise ValueError(f"More than one response for index {index:}. {response:}")
-        else:
-            self._choice = choices[0]
+    def _get_output(self) -> chat_pb2.CompletionOutput:
+        outputs = [
+            output
+            for output in self.proto.outputs
+            if output.message.role == chat_pb2.MessageRole.ROLE_ASSISTANT
+            and (output.index == self._index or self._index is None)
+        ]
+        if not outputs:
+            return chat_pb2.CompletionOutput()
+        return outputs[-1]

     @property
     def id(self) -> str:
@@ -824,12 +841,12 @@ def id(self) -> str:
     @property
     def content(self) -> str:
         """Returns the answer content of this response."""
-        return self._choice.message.content
+        return self._get_output().message.content

     @property
     def role(self) -> str:
         """Returns the role of this response."""
-        return chat_pb2.MessageRole.Name(self._choice.message.role)
+        return chat_pb2.MessageRole.Name(self._get_output().message.role)

     @property
     def usage(self) -> usage_pb2.SamplingUsage:
@@ -842,17 +859,17 @@ def reasoning_content(self) -> str:

         This is only available for models that support reasoning.
         """
-        return self._choice.message.reasoning_content
+        return self._get_output().message.reasoning_content

     @property
     def finish_reason(self) -> str:
         """Returns the finish reason of this response."""
-        return sample_pb2.FinishReason.Name(self._choice.finish_reason)
+        return sample_pb2.FinishReason.Name(self._get_output().finish_reason)

     @property
     def logprobs(self) -> chat_pb2.LogProbs:
         """Returns the logprobs of this response."""
-        return self._choice.logprobs
+        return self._get_output().logprobs

     @property
     def system_fingerprint(self) -> str:
@@ -861,8 +878,13 @@ def system_fingerprint(self) -> str:

     @property
     def tool_calls(self) -> Sequence[chat_pb2.ToolCall]:
-        """Returns the tool calls of this response."""
-        return self._choice.message.tool_calls
+        """Returns all tool calls of this response."""
+        return [
+            tc
+            for c in self.proto.outputs
+            if c.message.role == chat_pb2.MessageRole.ROLE_ASSISTANT
+            for tc in c.message.tool_calls
+        ]

     @property
     def citations(self) -> Sequence[str]:
```
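To pin down the selection rule `_get_output` implements, here is a self-contained sketch with a dataclass standing in for `chat_pb2.CompletionOutput` (the roles and strings mirror the test fixtures added below; everything else is illustrative):

```python
from dataclasses import dataclass
from typing import Optional

# Stand-ins for chat_pb2.MessageRole values used in the diff.
ROLE_ASSISTANT = "ROLE_ASSISTANT"
ROLE_TOOL = "ROLE_TOOL"

@dataclass
class Output:  # stand-in for chat_pb2.CompletionOutput
    index: int
    role: str
    content: str = ""

def get_output(outputs: list[Output], index: Optional[int]) -> Optional[Output]:
    # Keep assistant outputs only; match the tracked index unless it is None,
    # then return the last match. (The real method returns an empty proto
    # instead of None when nothing matches.)
    matches = [
        o for o in outputs
        if o.role == ROLE_ASSISTANT and (index is None or o.index == index)
    ]
    return matches[-1] if matches else None

outputs = [
    Output(0, ROLE_ASSISTANT, ""),                 # assistant turn holding the tool call
    Output(1, ROLE_TOOL, "I am tool response"),    # server-side tool result
    Output(2, ROLE_ASSISTANT, "I am searching."),  # final assistant answer
]
assert get_output(outputs, None).content == "I am searching."  # index=None: last assistant output
assert get_output(outputs, 2).content == "I am searching."     # explicit index still works
```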

src/xai_sdk/sync/chat.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -104,8 +104,9 @@ def sample(self) -> Response:
             kind=SpanKind.CLIENT,
             attributes=self._make_span_request_attributes(),
         ) as span:
+            index = None if self._uses_server_side_tools() else 0
             response_pb = self._stub.GetCompletion(self._make_request(1))
-            response = Response(response_pb, 0)
+            response = Response(response_pb, index)
             span.set_attributes(self._make_span_response_attributes([response]))
             return response

@@ -176,7 +177,8 @@ def stream(self) -> Iterator[tuple[Response, Chunk]]:
             kind=SpanKind.CLIENT,
             attributes=self._make_span_request_attributes(),
         ) as span:
-            response = Response(chat_pb2.GetChatCompletionResponse(outputs=[chat_pb2.CompletionOutput()]), 0)
+            index = None if self._uses_server_side_tools() else 0
+            response = Response(chat_pb2.GetChatCompletionResponse(outputs=[chat_pb2.CompletionOutput()]), index)
             stream = self._stub.GetCompletionChunk(self._make_request(1))

             for chunk in stream:
@@ -187,7 +189,7 @@ def stream(self) -> Iterator[tuple[Response, Chunk]]:
                     first_chunk_received = True

                 response.process_chunk(chunk)
-                chunk_obj = Chunk(chunk, 0)
+                chunk_obj = Chunk(chunk, index)
                 yield response, chunk_obj

             span.set_attributes(self._make_span_response_attributes([response]))
```

tests/aio/chat_test.py

Lines changed: 61 additions & 0 deletions

```diff
@@ -352,6 +352,67 @@ async def test_function_calling_streaming_batch(client):
     assert response.tool_calls[0].function.arguments == '{"city":"London","units":"C"}'


+@pytest.mark.asyncio(loop_scope="session")
+async def test_agentic_tool_calling_streaming(client):
+    chat = client.chat.create(
+        "grok-4-fast",
+        tools=[web_search()],
+    )
+    chat.append(user("What is the weather in London?"))
+    stream = chat.stream()
+
+    expected_chunks = [
+        "I",
+        " am",
+        " searching",
+        ".",
+        "",  # Final chunk is a tool call which has no content set
+    ]
+
+    last_response = None
+    i = 0
+    async for response, chunk in stream:
+        last_response = response
+        if i == 0:
+            assert chunk.tool_calls[0].function.name == "web_search"
+            assert chunk.tool_calls[0].function.arguments == '{"query":"What is the weather in London?"}'
+        elif i == 1:
+            assert chunk.proto.outputs[0].delta.role == chat_pb2.ROLE_TOOL
+            assert chunk.proto.outputs[0].delta.content == "I am tool response"
+            assert chunk.content == ""
+        else:
+            assert chunk.content == expected_chunks[i - 2]
+        i += 1
+
+    assert last_response is not None
+    assert last_response.content == "I am searching."
+    assert len(last_response.tool_calls) == 1
+    assert last_response.finish_reason == "REASON_STOP"
+    assert last_response.role == "ROLE_ASSISTANT"
+    assert last_response.tool_calls[0].function.name == "web_search"
+    assert last_response.tool_calls[0].function.arguments == '{"query":"What is the weather in London?"}'
+
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_agentic_tool_calling_non_streaming(client):
+    chat = client.chat.create(
+        "grok-4-fast",
+        tools=[web_search()],
+    )
+    chat.append(user("What is the weather in London?"))
+    response = await chat.sample()
+
+    assert len(response.proto.outputs) == 3
+    assert response.proto.outputs[1].message.role == chat_pb2.ROLE_TOOL
+    assert response.proto.outputs[1].message.content == "I am tool response"
+    assert response.content == "I am searching."
+    assert len(response.tool_calls) == 1
+    assert response.finish_reason == "REASON_STOP"
+    assert response.role == "ROLE_ASSISTANT"
+    assert response.tool_calls[0].function.name == "web_search"
+    assert response.tool_calls[0].function.arguments == '{"query":"What is the weather in London?"}'
+
+
 @pytest.mark.asyncio(loop_scope="session")
 async def test_structured_output(client):
     class Weather(BaseModel):
```
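Note what the streaming test pins down: tool-role deltas remain visible through `chunk.proto.outputs`, but `chunk.content` stays empty for them because the `Chunk` accessors filter to assistant outputs, while the aggregated `Response` ends up with the final assistant text, the collected tool calls, and `finish_reason`/`role` taken from the assistant output.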
