
Commit 114fd74

utils functions for PP
Signed-off-by: Chenyaaang <[email protected]>
1 parent 2392503 commit 114fd74

File tree

2 files changed (+146, -0 lines)

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
from typing import Any, Optional

import jax
from jax.experimental import transfer

BASE_JAX_PORT = 5000

# Process-global handle to the pipeline-parallel group coordinator,
# populated by init_pp_distributed_environment().
_PP: Optional["GroupCoordinator"] = None


class GroupCoordinator:
    """
    JAX ProcessGroup wrapper for a group of pipeline-parallel processes.

    This is a simplified version that aligns its APIs with PyTorch's
    GroupCoordinator in vllm/distributed/parallel_state.py.

    GroupCoordinator takes charge of the communication operations among
    the processes in the group. Currently the communication is
    sending/receiving the intermediate tensors (tensor_dict) between
    consecutive PP processes.
    """
    rank_in_group: int
    world_size: int
    transfer_server: Optional[Any]
    connection: Optional[Any]

    def __init__(self, rank_in_group: int, world_size: int):
        self.rank_in_group = rank_in_group
        self.world_size = world_size
        self.transfer_server = None
        self.connection = None

    def send_tensor_dict(self, uuid: int, tensor_dict: dict[str, jax.Array]):
        # Register the tensors with the local transfer server; the next PP
        # rank pulls them by the same uuid.
        self.transfer_server.await_pull(uuid, tensor_dict)

    def recv_tensor_dict(self, uuid: int,
                         tensor_spec: dict[str, jax.ShapeDtypeStruct]):
        # Pull the tensors registered under uuid from the previous PP rank.
        return self.connection.pull(uuid, tensor_spec)

    @property
    def is_first_rank(self):
        return self.rank_in_group == 0

    @property
    def is_last_rank(self):
        return self.rank_in_group == self.world_size - 1


def init_pp_distributed_environment(ip: str, rank: int, world_size: int,
                                    device: Any, need_pp: bool):
    global _PP
    _PP = GroupCoordinator(rank, world_size)
    if need_pp:
        port_number = BASE_JAX_PORT + rank
        server_address = f"{ip}:{port_number}"
        transfer_server = transfer.start_transfer_server(
            device.client, server_address, [f"{ip}:0", f"{ip}:0"])
        _PP.transfer_server = transfer_server


def connect(prev_ip: str, prev_rank: int):
    prev_port_number = BASE_JAX_PORT + prev_rank
    connection = _PP.transfer_server.connect(f'{prev_ip}:{prev_port_number}')
    _PP.connection = connection


def get_pp_group() -> GroupCoordinator:
    assert _PP is not None, (
        "pipeline model parallel group is not initialized")
    return _PP
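
A minimal sketch of how these helpers could be wired together across two pipeline stages. The commit does not show the new file's name, so the module is passed in as a parameter rather than imported; the IP addresses, tensor shapes, and uuid are illustrative placeholders, not values from the commit.

import jax
import jax.numpy as jnp


def run_first_stage(pp, ip: str):
    # pp is the module defined above (import path not shown in the commit).
    # Rank 0 starts a transfer server and registers the tensor dict so the
    # next stage can pull it by uuid.
    pp.init_pp_distributed_environment(ip, rank=0, world_size=2,
                                       device=jax.devices()[0], need_pp=True)
    hidden = {"hidden_states": jnp.zeros((8, 4096), dtype=jnp.bfloat16)}
    pp.get_pp_group().send_tensor_dict(uuid=0, tensor_dict=hidden)


def run_second_stage(pp, ip: str, prev_ip: str):
    # Rank 1 starts its own transfer server, dials rank 0, and pulls the
    # tensors described by the ShapeDtypeStruct spec under the same uuid.
    pp.init_pp_distributed_environment(ip, rank=1, world_size=2,
                                       device=jax.devices()[0], need_pp=True)
    pp.connect(prev_ip=prev_ip, prev_rank=0)
    spec = {"hidden_states": jax.ShapeDtypeStruct((8, 4096), jnp.bfloat16)}
    return pp.get_pp_group().recv_tensor_dict(uuid=0, tensor_spec=spec)

As the method names suggest, send_tensor_dict only registers the dict for pulling; the actual transfer happens when the downstream rank calls recv_tensor_dict.
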
Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Union

import jax
from jax.tree_util import register_pytree_node_class
from torchax.interop import jax_view, torch_view
from vllm.sequence import IntermediateTensors

if TYPE_CHECKING:
    from vllm.v1.worker.kv_connector_model_runner_mixin import \
        KVConnectorOutput
else:
    KVConnectorOutput = Any


@register_pytree_node_class
@dataclass
class JaxIntermediateTensors:
    """For all pipeline stages except the last, we need to return the
    intermediate tensors (the hidden states and residuals) to be sent to
    the next stage. This data structure holds those intermediate tensors
    for a request.

    vLLM has an equivalent PyTorch IntermediateTensors class (in
    vllm/sequence.py) for the same purpose.

    Each stage also needs to handle its own kv_connector_output.

    This class also provides from_torch and to_torch helpers to convert
    between PyTorch's and JAX's intermediate tensors on the torchax path.
    """

    tensors: Dict[str, Any]
    kv_connector_output: KVConnectorOutput = None

    def tree_flatten(self):
        # Only the tensors are treated as pytree children; kv_connector_output
        # travels as static aux data.
        children = (self.tensors, )
        aux_data = self.kv_connector_output
        return (children, aux_data)

    @classmethod
    def tree_unflatten(cls, aux_data, children):
        return cls(children[0], aux_data)

    @classmethod
    def from_torch(cls, torch_obj: IntermediateTensors):
        kv_connector_output = getattr(torch_obj, 'kv_connector_output', None)
        jax_tensors = {k: jax_view(v) for k, v in torch_obj.tensors.items()}
        return cls(jax_tensors, kv_connector_output)

    def to_torch(self) -> IntermediateTensors:
        torch_tensors = {k: torch_view(v) for k, v in self.tensors.items()}
        return IntermediateTensors(torch_tensors)

    def __getitem__(self, key: Union[str, slice]):
        if isinstance(key, str):
            return self.tensors[key]
        elif isinstance(key, slice):
            return self.__class__({k: v[key] for k, v in self.tensors.items()})

    def __setitem__(self, key: str, value: Any):
        self.tensors[key] = value

    def keys(self):
        return self.tensors.keys()

    def items(self):
        return self.tensors.items()

    def __len__(self):
        return len(self.tensors)

    def block_until_ready(self):
        for tensor in self.tensors.values():
            assert isinstance(
                tensor, jax.Array
            ), "block_until_ready needs to be applied on jax arrays"
            tensor.block_until_ready()
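
A short usage sketch for the class above, assuming it has been imported from the new file (the commit does not show the module path). It builds the container from plain JAX arrays and passes it through jax.jit, which works because the pytree registration flattens the object into its tensors and carries kv_connector_output as auxiliary data; shapes and dtypes are placeholders.

import jax
import jax.numpy as jnp

it = JaxIntermediateTensors(tensors={
    "hidden_states": jnp.ones((8, 4096), dtype=jnp.bfloat16),
    "residual": jnp.zeros((8, 4096), dtype=jnp.bfloat16),
})


@jax.jit
def scale(x: JaxIntermediateTensors) -> JaxIntermediateTensors:
    # The object is flattened on entry and rebuilt on exit via
    # tree_flatten / tree_unflatten.
    return JaxIntermediateTensors({k: v * 2 for k, v in x.items()})


out = scale(it)
out.block_until_ready()
assert out["hidden_states"].shape == (8, 4096)

On the torchax path, from_torch and to_torch play the same role at the model-runner boundary, reusing the underlying buffers via jax_view and torch_view rather than copying them.
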
