
Commit 07e7fd3

save changes

Signed-off-by: Juncheng Gu <[email protected]>
1 parent 205e474 commit 07e7fd3

3 files changed: +161 -23 lines changed

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import functools
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+from absl.testing import parameterized
+from jax._src import compilation_cache as cc
+from jax._src import test_util as jtu
+from jax.sharding import NamedSharding, PartitionSpec
+
+from tpu_inference.distributed.cache_util import get_kv_cache_swap_fn
+
+
+class TestGetKVCacheSwapFn(jtu.JaxTestCase):
+    """Test the get_kv_cache_swap_fn functionality."""
+
+    def setUp(self):
+        super().setUp()
+        self.num_layers = 2
+        self.num_tokens = 128
+        self.num_heads = 8
+        self.head_size = 128
+        self.mesh = self.create_mesh((1, 8), ("data", "model"))
+        if self.mesh is None:
+            self.skipTest("Cannot create mesh. Must be run on a TPU node.")
+            return
+
+        # Define cache properties
+        self.cache_shape = (
+            self.num_tokens,
+            self.num_heads,
+            2,
+            self.head_size,
+        )
+        self.cache_dtype = jnp.bfloat16
+
+        # Define shardings, mirroring the setup in TPUConnectorWorker
+        partition_spec = PartitionSpec(None, "model")
+        self.device_sharding = NamedSharding(self.mesh,
+                                             partition_spec,
+                                             memory_kind="device")
+        self.host_sharding = NamedSharding(self.mesh,
+                                           partition_spec,
+                                           memory_kind="pinned_host")
+
+    def tearDown(self):
+        super().tearDown()
+        # Reset the compilation cache after each test. This can also be
+        # achieved by running with JAX_TEST_WITH_PERSISTENT_COMPILATION_CACHE=True.
+        cc.reset_cache()
+
+    def create_mesh(self, axis_shapes, axis_names):
+        """Creates a JAX device mesh with the default device order."""
+        try:
+            num_required_devices = np.prod(axis_shapes)
+            devices = np.array(jax.devices())
+            if len(devices) < num_required_devices:
+                self.skipTest(
+                    f"Not enough devices to create mesh of shape {axis_shapes}."
+                )
+            device_array = devices[:num_required_devices].reshape(axis_shapes)
+            return jax.sharding.Mesh(device_array, axis_names)
+        except RuntimeError:
+            return None
+
+    @parameterized.named_parameters(
+        dict(testcase_name="_swap_op_jax_jitted",
+             swap_op_type="jax",
+             jitted=True),
+        dict(testcase_name="_swap_op_pallas_jitted",
+             swap_op_type="pallas",
+             jitted=True),
+        dict(testcase_name="_swap_op_jax_unjitted",
+             swap_op_type="jax",
+             jitted=False),
+        dict(testcase_name="_swap_op_pallas_unjitted",
+             swap_op_type="pallas",
+             jitted=False),
+    )
+    def test_kv_cache_swap_roundtrip(self, swap_op_type: str, jitted: bool):
+        """
+        Tests the round-trip transfer of KV cache data: Device -> Host -> Device.
+
+        This test verifies that the `swap_in_fn` and `swap_out_fn` generated by
+        `get_kv_cache_swap_fn` correctly transfer data between TPU HBM and
+        host memory without corruption. It also exercises the code path that
+        enables buffer donation for the device-to-host transfer.
+        """
+        # 1. Get the swap functions to be tested.
+        swap_in_fn, swap_out_fn = get_kv_cache_swap_fn(
+            swap_op_type=swap_op_type,
+            host_sharding=self.host_sharding,
+            device_sharding=self.device_sharding,
+            jitted=jitted,
+        )
+
+        # 2. Create original source data on the TPU device.
+        @functools.partial(jax.jit, out_shardings=self.device_sharding)
+        def create_on_device(key):
+            return jax.random.uniform(key,
+                                      shape=self.cache_shape,
+                                      dtype=self.cache_dtype)
+
+        original_data_tpu = [
+            create_on_device(jax.random.key(i)) for i in range(self.num_layers)
+        ]
+        jax.block_until_ready(original_data_tpu)
+
+        # 3. Perform Device-to-Host (D2H) transfer (swap out).
+        # This call exercises the `donate_argnames` functionality when jitted.
+        data_cpu = swap_out_fn(original_data_tpu)
+        jax.block_until_ready(data_cpu)
+
+        # 4. Verify the data on the host. (assertEqual, not assertIs:
+        # memory_kind is a plain string, so identity checks are fragile.)
+        for i in range(self.num_layers):
+            self.assertEqual(data_cpu[i].sharding.memory_kind, "pinned_host")
+            self.assertEqual(data_cpu[i].sharding, self.host_sharding)
+            self.assertArraysEqual(np.array(data_cpu[i]),
+                                   np.array(original_data_tpu[i]))
+
+        # 5. Perform Host-to-Device (H2D) transfer (swap in).
+        roundtrip_data_tpu = swap_in_fn(data_cpu)
+        jax.block_until_ready(roundtrip_data_tpu)
+
+        # 6. Verify the round-tripped data on the device.
+        for i in range(self.num_layers):
+            self.assertEqual(roundtrip_data_tpu[i].sharding.memory_kind,
+                             "device")
+            self.assertEqual(roundtrip_data_tpu[i].sharding,
+                             self.device_sharding)
+            self.assertArraysEqual(np.array(roundtrip_data_tpu[i]),
+                                   np.array(original_data_tpu[i]))
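
Note: the round trip above is built on JAX's memory-kind-aware shardings. A minimal standalone sketch of that underlying mechanism (illustrative only, not part of this commit; it assumes a backend such as TPU that supports the "pinned_host" memory kind, and an example mesh/axis layout):

# Re-sharding with jax.device_put between two NamedShardings that differ
# only in memory_kind moves buffers between device HBM and pinned host
# memory; that is the primitive a d2h/h2d swap reduces to.
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

mesh = Mesh(np.array(jax.devices()).reshape(1, -1), ("data", "model"))
spec = PartitionSpec(None, "model")
device_sharding = NamedSharding(mesh, spec, memory_kind="device")
host_sharding = NamedSharding(mesh, spec, memory_kind="pinned_host")

x = jax.device_put(jnp.ones((128, 8), jnp.bfloat16), device_sharding)
x_host = jax.device_put(x, host_sharding)         # d2h offload
x_back = jax.device_put(x_host, device_sharding)  # h2d restore
assert x_back.sharding.memory_kind == "device"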

tpu_inference/distributed/cache_util.py

Lines changed: 24 additions & 23 deletions
@@ -191,40 +191,41 @@ def get_kv_cache_swap_fn(
     jitted: bool = True,
 ) -> Tuple[KVCacheSwapFn, KVCacheSwapFn]:
     """get the right swap_in and swap_out functions
-
     Args:
         swap_op_type : (str) pallas or jax
         host_sharding:
         device_sharding:
-
     Returns:
         A tuple containing the jitted swap-in and swap-out functions.
     """
     _swap_fn: SwapFn = pallas_swap_kv_caches if swap_op_type == "pallas" else jax_swap_kv_caches
-    if jitted:
-        _swap_in_fn = jax.jit(
-            _swap_fn,
-            static_argnames=["src_sharding", "dst_sharding", "direction"],
-            out_shardings=device_sharding)
-        _swap_out_fn = jax.jit(
-            _swap_fn,
-            static_argnames=["src_sharding", "dst_sharding", "direction"],
-            out_shardings=host_sharding)
-    else:
-        _swap_in_fn = _swap_fn
-        _swap_out_fn = _swap_fn
 
     # swap_in (h2d)
-    swap_in_fn = functools.partial(_swap_in_fn,
-                                   src_sharding=host_sharding,
-                                   dst_sharding=device_sharding,
-                                   direction="h2d")
+    _swap_in_partial = functools.partial(_swap_fn,
+                                         src_sharding=host_sharding,
+                                         dst_sharding=device_sharding,
+                                         direction="h2d")
     # swap_out (d2h)
-    swap_out_fn = functools.partial(_swap_out_fn,
-                                    src_sharding=device_sharding,
-                                    dst_sharding=host_sharding,
-                                    direction="d2h")
-    return swap_in_fn, swap_out_fn
+    _swap_out_partial = functools.partial(_swap_fn,
+                                          src_sharding=device_sharding,
+                                          dst_sharding=host_sharding,
+                                          direction="d2h")
+
+    if jitted:
+
+        def swap_in_fn(src_kv_caches: List[jax.Array]) -> List[jax.Array]:
+            return _swap_in_partial(src_kv_caches=src_kv_caches)
+
+        def swap_out_fn(src_kv_caches: List[jax.Array]) -> List[jax.Array]:
+            return _swap_out_partial(src_kv_caches=src_kv_caches)
+
+        swap_in_fn = jax.jit(swap_in_fn, out_shardings=device_sharding)
+        swap_out_fn = jax.jit(swap_out_fn,
+                              donate_argnames=["src_kv_caches"],
+                              out_shardings=host_sharding)
+        return swap_in_fn, swap_out_fn
+    else:
+        return _swap_in_partial, _swap_out_partial
 
 
 @functools.partial(
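
The shape of this refactor follows from how donation is specified: jax.jit's donate_argnames matches Python parameter names, so the sharding and direction arguments are bound first with functools.partial and the jitted wrappers expose only src_kv_caches as a donatable argument. A standalone sketch of the pattern (illustrative names, not the project's code):

import jax
import jax.numpy as jnp

def swap_out(src_kv_caches):
    # Stand-in body; the real swap functions re-shard each layer's cache
    # into pinned host memory.
    return [x + 0 for x in src_kv_caches]

# Donation lets XLA reuse the input buffers instead of allocating new ones.
swap_out = jax.jit(swap_out, donate_argnames=["src_kv_caches"])

caches = [jnp.ones((4, 4)) for _ in range(2)]
out = swap_out(caches)
# On backends that implement donation (e.g. TPU), the donated inputs are
# invalidated after the call; on CPU, JAX warns and leaves them intact.
print(caches[0].is_deleted())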

tpu_inference/distributed/tpu_connector_local.py

Lines changed: 4 additions & 0 deletions
@@ -1038,11 +1038,15 @@ def _save_blocks_to_cpu(self, req_id: ReqId, full_block_ids: list[int],
             self.runner.kv_caches, blocks_to_save)
 
         jax.block_until_ready(flat_kv_caches_tpu)
+        flat_kv_caches_tpu_copy = flat_kv_caches_tpu
         logger.info(
             f"extracted_blocks_tpu: {flat_kv_caches_tpu[0].shape}, {flat_kv_caches_tpu[0].sharding}"
         )
 
         flat_kv_caches_cpu = self.swap_out_fn(flat_kv_caches_tpu)
+        logger.info(
+            f"---debug----: flat_kv_caches_tpu_copy: {flat_kv_caches_tpu_copy[0].shape}, {flat_kv_caches_tpu_copy[0].sharding}"
+        )
         # Block until the transfer is complete
         if flat_kv_caches_cpu:
             jax.block_until_ready(flat_kv_caches_cpu)
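
A caveat on the debug lines above: flat_kv_caches_tpu_copy = flat_kv_caches_tpu binds a second name to the same list rather than copying any buffers, and the jitted swap_out_fn donates src_kv_caches. After donation the arrays' data is gone, though metadata such as .shape and .sharding (which is all the debug log reads) typically remains accessible. A hedged sketch, assuming the donating (jitted) path, of how to make the donation observable:

# Illustrative only; names reuse those from the method above.
donated = flat_kv_caches_tpu[0]
print(donated.is_deleted())             # True if the buffer was donated
print(donated.shape, donated.sharding)  # metadata survives donation
# np.asarray(donated) would raise here: the underlying data was donated.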
