
Commit aa58a9c

Integrate MoE kernel for torchax path (#996)
Signed-off-by: Siyuan Liu <[email protected]>
1 parent aad4c55 commit aa58a9c

File tree

5 files changed: +262 −66 lines changed


.buildkite/pipeline_jax.yml

Lines changed: 2 additions & 1 deletion
@@ -128,7 +128,8 @@ steps:
     python3 -m pytest -s -v -x /workspace/tpu_inference/tests/kernels \
       --ignore=/workspace/tpu_inference/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
       --ignore=/workspace/tpu_inference/tests/kernels/ragged_kv_cache_update_v2_test.py \
-      --ignore=/workspace/tpu_inference/tests/kernels/collectives
+      --ignore=/workspace/tpu_inference/tests/kernels/collectives \
+      --ignore=/workspace/tpu_inference/tests/kernels/fused_moe_v1_test.py
   else
     echo "Skipping: no changes detected in kernels, tests/kernels, or requirements.txt"
     exit 0

tests/kernels/fused_moe_v1_test.py

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,6 @@
 import jax
 import jax.numpy as jnp
+import numpy as np
 from absl.testing import absltest
 from jax._src import test_util as jtu
 from jax.sharding import Mesh
@@ -59,7 +60,8 @@ def setUp(self):
                 (-1 if x.coords[0] % 2 else 1) * x.coords[1],
             ),
         )
-        self.mesh = Mesh(devices=self.mesh_devices, axis_names=("model", ))
+        self.mesh = Mesh(np.array(self.mesh_devices).reshape(1, -1),
+                         axis_names=("data", "model"))

     def test_basic(self):
         dtype = jnp.bfloat16
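
For context, here is a minimal standalone sketch (not part of this commit) of the mesh layout the updated test now builds: a 2D ("data", "model") mesh whose "data" axis is a singleton, matching what the fused MoE kernel below asserts on.

import jax
import numpy as np
from jax.sharding import Mesh

# All local devices go on the "model" (expert-parallel) axis; "data" stays at size 1.
devices = np.array(jax.devices()).reshape(1, -1)
mesh = Mesh(devices, axis_names=("data", "model"))

assert mesh.shape["data"] == 1
assert mesh.shape["model"] == len(jax.devices())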

tests/layers/vllm/test_unquantized.py

Lines changed: 120 additions & 0 deletions
@@ -483,3 +483,123 @@ def test_fused_moe(use_ep, mesh, num_tokens, intermediate_size, hidden_size,
         atol=1e-2,
         rtol=1e-2,
     )
+
+
+@pytest.mark.parametrize("use_ep", [True])
+@pytest.mark.parametrize("mesh",
+                         [test_utils.get_spmd_mesh(jax.local_device_count())])
+@pytest.mark.parametrize("num_tokens", [128, 512])
+@pytest.mark.parametrize("intermediate_size", [256, 512])
+@pytest.mark.parametrize("hidden_size", [256])
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("topk", [2])
+def test_fused_moe_use_kernel(use_ep, mesh, num_tokens, intermediate_size,
+                              hidden_size, num_experts, topk):
+
+    if jax.local_device_count() < 8:
+        pytest.skip("Test requires at least 8 devices")
+
+    os.environ['VLLM_DISABLE_SHARED_EXPERTS_STREAM'] = '1'
+    torch.manual_seed(42)
+    dtype = torch.bfloat16
+
+    a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
+    w1 = torch.randn(
+        (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
+    w2 = torch.randn(
+        (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
+
+    # Use deterministic gating_output generation (same logic as fused_moe_v1_test.py).
+    # Generate base gating scores with a deterministic pattern.
+    score = (
+        torch.randn((num_tokens, num_experts), dtype=torch.float32) +
+        torch.arange(num_tokens * num_experts, dtype=torch.float32).reshape(
+            num_tokens, num_experts) / 100)
+
+    # Generate unique top-k indices.
+    generator = torch.Generator()
+    generator.manual_seed(42)
+    top_k_indices = torch.randint(0,
+                                  num_experts - 1, (num_tokens, topk),
+                                  dtype=torch.int32,
+                                  generator=generator)
+
+    # Add one-hot encoding weighted by 10 to ensure the selected experts have the highest scores.
+    one_hot = torch.nn.functional.one_hot(top_k_indices.long(),
+                                          num_classes=num_experts).float()
+    one_hot = one_hot.sum(dim=1) * 10
+
+    score = (score + one_hot).to(dtype)
+
+    torch_output = torch_moe(
+        hidden_states=a,
+        w1=w1,
+        w2=w2,
+        gating_output=score,
+        topk=topk,
+        global_num_experts=num_experts,
+        expert_map=None,
+        renormalize=False,
+    )
+
+    engine_args = EngineArgs(
+        model="Qwen/Qwen2-1.5B-Instruct",
+        max_model_len=64,
+        max_num_batched_tokens=64,
+        max_num_seqs=4,
+    )
+    vllm_config = engine_args.create_engine_config()
+    vllm_config.model_config.dtype = dtype
+    vllm_config.parallel_config = ParallelConfig(
+        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)
+
+    quant_config = get_tpu_quantization_config(vllm_config, mesh)
+    with set_current_vllm_config(vllm_config):
+        vllm_fused_moe = FusedMoE(
+            num_experts=num_experts,
+            top_k=topk,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=True,
+            renormalize=False,
+            tp_size=mesh.devices.size,
+            dp_size=1,
+            quant_config=quant_config,
+        )
+    vllm_fused_moe.moe_parallel_config.use_ep = use_ep
+
+    vllm_fused_moe.w13_weight.data = w1
+    vllm_fused_moe.w2_weight.data = w2
+
+    p_spec = P('model', )
+    jax_a = torch_view(t2j(a, use_dlpack=False))
+    jax_a = jax_a.apply_jax_(jax.device_put, NamedSharding(mesh, p_spec))
+    score = torch_view(t2j(score))
+    score = score.apply_jax_(jax.device_put, NamedSharding(mesh, p_spec))
+
+    with torchax.default_env(), set_forward_context(None, vllm_config):
+        assert isinstance(vllm_fused_moe.quant_method,
+                          VllmUnquantizedFusedMoEMethod)
+        # Enable the kernel for this test.
+        vllm_fused_moe.quant_method.use_kernel = True
+        vllm_fused_moe.quant_method.process_weights_after_loading(
+            vllm_fused_moe)
+        vllm_fused_moe.quant_method.block_size = {
+            "bt": 32,
+            "bf": 512,
+            "bd1": 512,
+            "bd2": 512,
+            "btc": 32,
+            "bfc": 256,
+            "bd1c": 256,
+            "bd2c": 256,
+        }
+        jax_output = vllm_fused_moe(jax_a, score)
+        jax_output = j2t(jax_output.to(torch.float32)).to(dtype)
+
+    torch.testing.assert_close(
+        torch_output,
+        jax_output,
+        atol=1e-2,
+        rtol=1e-2,
+    )
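
The new test plants its top-k expert choices before building gating_output, so the torch_moe reference and the Pallas kernel are guaranteed to route each token to the same experts. Below is a minimal sketch of that trick with toy sizes; it uses torch.randperm instead of the test's seeded randint purely to keep the planted indices unique per token.

import torch

num_tokens, num_experts, topk = 4, 8, 2
gen = torch.Generator().manual_seed(0)

# Plant the experts each token should route to (unique within a row).
planted = torch.stack([
    torch.randperm(num_experts, generator=gen)[:topk] for _ in range(num_tokens)
])

# Base scores, then a +10 one-hot boost so the planted experts always win top-k.
score = torch.randn((num_tokens, num_experts), generator=gen)
score = score + torch.nn.functional.one_hot(
    planted, num_classes=num_experts).sum(dim=1) * 10

recovered = torch.topk(score, topk, dim=-1).indices
assert all(set(r.tolist()) == set(p.tolist()) for r, p in zip(recovered, planted))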

tpu_inference/kernels/fused_moe/v1/kernel.py

Lines changed: 26 additions & 17 deletions
@@ -7,6 +7,7 @@
 from jax import lax
 from jax._src import dtypes
 from jax.experimental import pallas as pl
+from jax.experimental import shard_map
 from jax.experimental.pallas import tpu as pltpu

 P = jax.sharding.PartitionSpec
@@ -144,7 +145,7 @@ def _fused_ep_moe_kernel(
     a2a_acc_sem,
     *,
     top_k: int,
-    ep_name: str,
+    ep_axis_name: str,
     # Kernel tuning params.
     bt: int,  # Block size of local_num_tokens.
     bf: int,  # Block size of intermediate_size.
@@ -155,8 +156,8 @@ def _fused_ep_moe_kernel(
     bd1c: int,  # Compute size of block hidden_size.
     bd2c: int,  # Compute size of block hidden_size.
 ):
-    my_id = lax.axis_index(ep_name)
-    num_devices = lax.axis_size(ep_name)
+    my_id = lax.axis_index(ep_axis_name)
+    num_devices = lax.axis_size(ep_axis_name)
     local_num_tokens = tokens_hbm.shape[0]
     local_num_experts, intermediate_size, hidden_size = w2_hbm.shape
     # num_experts = local_num_experts * num_devices
@@ -186,8 +187,8 @@ def sync_barrier():
         barrier_sem = pltpu.get_barrier_semaphore()
         pltpu.semaphore_signal(
             barrier_sem,
-            device_id=right_id,
-            device_id_type=pltpu.DeviceIdType.LOGICAL,
+            device_id=(0, right_id),
+            device_id_type=pltpu.DeviceIdType.MESH,
         )
         pltpu.semaphore_wait(barrier_sem, 1)

@@ -276,7 +277,7 @@ def _all_reduce_metadata(
             dst_ref=d2e_count_vmem.at[row_id],
             send_sem=send_sem,
             recv_sem=recv_sem,
-            device_id=(right_id, ),
+            device_id=(0, right_id),
             device_id_type=pltpu.DeviceIdType.MESH,
         ).wait()
         row_id = (row_id + num_devices - 1) % num_devices
@@ -358,7 +359,10 @@ def start_a2a_scatter(bt_id, e_sem_id, local_e_id):
                 pl.ds(start, remote_sz)],
             send_sem=send_sems.at[e_sem_id],
             recv_sem=recv_sems.at[e_sem_id],
-            device_id=(recv_id, ),
+            device_id=(
+                0,
+                recv_id,
+            ),
         ).start()
         a2a_s_sends_x2_smem[e_sem_id] = send_sz

@@ -402,7 +406,7 @@ def start_a2a_gather(bt_id, e_sem_id, local_e_id):
             dst_ref=a2a_g_hbm.at[my_e_id, pl.ds(0, remote_sz)],
             send_sem=send_sems.at[e_sem_id],
             recv_sem=a2a_gather_sem,
-            device_id=(recv_id, ),
+            device_id=(0, recv_id),
         ).start()
         start += sz

@@ -831,6 +835,7 @@ def _():
         "bfc",
         "bd1c",
         "bd2c",
+        "ep_axis_name",
     ],
 )
 def fused_ep_moe(
@@ -850,12 +855,14 @@ def fused_ep_moe(
     bfc: int,
     bd1c: int,
     bd2c: int,
+    ep_axis_name: str = 'model',
 ):
-    if len(mesh.axis_names) != 1:
-        raise ValueError("Mesh must have only one axis")
+    # Assert all other axes have length of 1
+    assert len(mesh.shape) == 2, "Expect 2D mesh in tpu-inference"
+    assert 'data' in mesh.shape and mesh.shape['data'] == 1, \
+        "Expect data axis size of 1 in tpu-inference"

-    ep_name = mesh.axis_names[0]
-    ep_size = mesh.axis_sizes[0]
+    ep_size = mesh.shape[ep_axis_name]
     num_devices = ep_size

     num_tokens, actual_hidden_size = tokens.shape
@@ -907,7 +914,7 @@ def fused_ep_moe(
         functools.partial(
             _fused_ep_moe_kernel,
             top_k=top_k,
-            ep_name=ep_name,
+            ep_axis_name=ep_axis_name,
             bt=bt,
             bf=bf,
             bd1=bd1,
@@ -999,11 +1006,13 @@ def fused_ep_moe(
         ))

     @jax.jit
-    @jax.shard_map(
+    @functools.partial(
+        shard_map.shard_map,
         mesh=mesh,
-        in_specs=(P(ep_name), P(ep_name), P(ep_name), P(ep_name), P()),
-        out_specs=P(ep_name),
-        check_vma=False,
+        in_specs=(P(ep_axis_name), P(ep_axis_name), P(ep_axis_name),
+                  P(ep_axis_name), P()),
+        out_specs=P(ep_axis_name),
+        check_rep=False,
     )
     def kernel(tokens, w1, w2, gating_output, a2a_g_hbm_scratch):
         return fused_moe(
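
For orientation, here is a rough sketch (not taken from this commit) of the calling pattern the reworked entry point implies: a 2D ("data", "model") mesh with a singleton "data" axis, shard_map from jax.experimental applied via functools.partial, inputs split along ep_axis_name, and check_rep=False because the kernel issues its own remote copies. The function body, shapes, and specs here are illustrative placeholders, not the real fused_ep_moe signature.

import functools
import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental import shard_map
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()).reshape(1, -1), axis_names=("data", "model"))
ep_axis_name = "model"
assert mesh.shape["data"] == 1  # mirrors the assertion added to fused_ep_moe

@jax.jit
@functools.partial(
    shard_map.shard_map,
    mesh=mesh,
    in_specs=(P(ep_axis_name), P()),  # tokens sharded on the EP axis, gating replicated
    out_specs=P(ep_axis_name),
    check_rep=False,  # the kernel manages its own cross-device transfers
)
def per_device_moe(tokens, gating_output):
    # Stand-in body: the real kernel uses lax.axis_index(ep_axis_name) /
    # lax.axis_size(ep_axis_name) here, exactly as _fused_ep_moe_kernel now does.
    return tokens

tokens = jnp.zeros((8 * jax.device_count(), 16), dtype=jnp.bfloat16)
gating = jnp.zeros((8 * jax.device_count(), 32), dtype=jnp.bfloat16)
out = per_device_moe(tokens, gating)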
