Commit baed11c

Author: tongliu
fix bug on seq aux loss

Signed-off-by: tongliu <[email protected]>
1 parent 752c351 commit baed11c

File tree: 7 files changed, +28 −24 lines changed
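The bug: the fused aux-loss path normalized the loss by `num_tokens`, the local row count of `probs`, which is wrong for sequence/global aux loss, where the normalizer must be the total token count. The commit threads a separate `total_num_tokens` through the kernel, the C API, the pybind layer, and the autograd function. A minimal sketch of the normalization constant before and after, using the names from the diffs below:

```python
def c_coeff(num_experts: int, coeff: float, topk: int, normalizer: int) -> float:
    # The fixed kernel passes total_num_tokens as the normalizer;
    # the buggy version passed num_tokens (the local row count of probs).
    return (num_experts * coeff) / topk / normalizer / normalizer
```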

tests/pytorch/test_fused_router.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 import torch
 import math
 from typing import Optional, Dict
-from transformer_engine.pytorch.router_func import (
+from transformer_engine.pytorch.router import (
     fused_topk_softmax_sigmoid,
     fused_compute_scores_for_aux_loss,
     fused_aux_loss,
@@ -199,7 +199,7 @@ def run_comparison(
         expert_bias=expert_bias_clone,
     )

-    assert torch.allclose(probs, probs_fused, atol=atol, rtol=rtol)
+    assert torch.allclose(probs, probs_fused, atol=atol, rtol=rtol), f"probs are not close: {probs} != {probs_fused}"
     assert torch.allclose(routing_map, routing_map_fused, atol=atol, rtol=rtol)

     # Fake the loss
@@ -342,7 +342,7 @@ def test_fused_aux_loss(dtype, num_tokens, num_experts, topk):
     aux_loss_fused = fused_aux_loss(
         probs=probs_clone,
         tokens_per_expert=tokens_per_expert,
-        num_tokens=num_tokens,
+        total_num_tokens=num_tokens,
         num_experts=num_experts,
         topk=topk,
         coeff=coeff,

transformer_engine/common/fused_router/fused_aux_loss.cu

Lines changed: 8 additions & 7 deletions
@@ -11,7 +11,7 @@ namespace transformer_engine {

 template <typename DataType, typename IndexType>
 __global__ void fused_aux_loss_forward_kernel(const DataType* probs,
-                                              const IndexType* tokens_per_expert, int num_tokens,
+                                              const IndexType* tokens_per_expert, int total_num_tokens, int num_tokens,
                                               int num_experts, int topk, float coeff,
                                               DataType* aux_loss, float* Const_buf) {
   int warp_num = blockDim.x / kThreadsPerWarp;
@@ -61,7 +61,7 @@ __global__ void fused_aux_loss_forward_kernel(const DataType* probs,
   /**
    * Section: Compute the aux_loss
    */
-  float C_coeff = (num_experts * coeff) / topk / num_tokens / num_tokens;
+  float C_coeff = (num_experts * coeff) / topk / total_num_tokens / total_num_tokens;
   aux_loss[0] = DataType(double(intermediate_result) * C_coeff);
   Const_buf[0] = C_coeff;
 }
@@ -70,7 +70,7 @@ __global__ void fused_aux_loss_forward_kernel(const DataType* probs,

 template <typename DataType, typename IndexType>
 void fused_aux_loss_forward_kernel_launcher(const DataType* probs,
-                                            const IndexType* tokens_per_expert, int num_tokens,
+                                            const IndexType* tokens_per_expert, int total_num_tokens, int num_tokens,
                                             int num_experts, int topk, float coeff,
                                             DataType* aux_loss, float* Const_buf,
                                             cudaStream_t stream) {
@@ -81,10 +81,10 @@ void fused_aux_loss_forward_kernel_launcher(const DataType* probs,
   int block_size = 1024;
   fused_aux_loss_forward_kernel<DataType, IndexType>
       <<<grid_size, block_size, shared_memory_size, stream>>>(
-          probs, tokens_per_expert, num_tokens, num_experts, topk, coeff, aux_loss, Const_buf);
+          probs, tokens_per_expert, total_num_tokens, num_tokens, num_experts, topk, coeff, aux_loss, Const_buf);
 }

-void fused_aux_loss_forward(const Tensor& probs, const Tensor& tokens_per_expert, int num_tokens,
+void fused_aux_loss_forward(const Tensor& probs, const Tensor& tokens_per_expert, int total_num_tokens, int num_tokens,
                             int num_experts, int topk, float coeff, Tensor& aux_loss,
                             Tensor& Const_buf, cudaStream_t stream) {
   TE_ROUTER_PROBS_TYPE_SWITCH_ALL(
@@ -93,7 +93,7 @@ void fused_aux_loss_forward(const Tensor& probs, const Tensor& tokens_per_expert
       tokens_per_expert.data.dtype, IndexType,
       fused_aux_loss_forward_kernel_launcher<DataType, IndexType>(
           reinterpret_cast<DataType*>(probs.data.dptr),
-          reinterpret_cast<IndexType*>(tokens_per_expert.data.dptr), num_tokens, num_experts,
+          reinterpret_cast<IndexType*>(tokens_per_expert.data.dptr), total_num_tokens, num_tokens, num_experts,
           topk, coeff, reinterpret_cast<DataType*>(aux_loss.data.dptr),
           reinterpret_cast<float*>(Const_buf.data.dptr), stream);););
 }
@@ -148,12 +148,13 @@ void fused_aux_loss_backward(const Tensor& Const_buf, const Tensor& tokens_per_e
 }  // namespace transformer_engine

 void nvte_fused_aux_loss_forward(const NVTETensor probs, const NVTETensor tokens_per_expert,
+                                 int total_num_tokens,
                                  int num_tokens, int num_experts, int topk, float coeff,
                                  NVTETensor aux_loss, NVTETensor Const_buf, cudaStream_t stream) {
   NVTE_API_CALL(nvte_fused_aux_loss_forward);
   using namespace transformer_engine;
   fused_aux_loss_forward(*convertNVTETensorCheck(probs), *convertNVTETensorCheck(tokens_per_expert),
-                         num_tokens, num_experts, topk, coeff, *convertNVTETensorCheck(aux_loss),
+                         total_num_tokens, num_tokens, num_experts, topk, coeff, *convertNVTETensorCheck(aux_loss),
                          *convertNVTETensorCheck(Const_buf), stream);
 }
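For readers checking the kernel math: an unfused Python reference of the forward computation. Only `C_coeff` is taken verbatim from the kernel above; the reduction into `intermediate_result` is not shown in this diff, so the assumption here is that it is the dot product of the per-expert probability sums with `tokens_per_expert`, as in the standard load-balancing loss:

```python
import torch

def aux_loss_forward_reference(probs, tokens_per_expert, total_num_tokens,
                               num_experts, topk, coeff):
    """Unfused sketch of fused_aux_loss_forward. probs: [num_tokens, num_experts]."""
    # Assumed reduction: per-expert sum of probabilities times that expert's token count.
    intermediate_result = (probs.sum(dim=0).double() * tokens_per_expert.double()).sum()
    # Coefficient exactly as in the fixed kernel: normalize by total_num_tokens.
    c_coeff = (num_experts * coeff) / topk / total_num_tokens / total_num_tokens
    aux_loss = (intermediate_result * c_coeff).to(probs.dtype)
    return aux_loss, c_coeff  # c_coeff corresponds to Const_buf, cached for the backward pass
```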

transformer_engine/common/include/transformer_engine/fused_router.h

Lines changed: 3 additions & 2 deletions
@@ -96,7 +96,8 @@ void nvte_fused_scores_for_aux_loss_backward(const NVTETensor intermediate_outpu
  *
  *  \param[in]     probs               Probabilities from the forward pass.
  *  \param[in]     tokens_per_expert   Number of tokens per expert.
- *  \param[in]     num_tokens          Number of total tokens.
+ *  \param[in]     total_num_tokens    Number of total tokens. Will be used in seq/global aux loss.
+ *  \param[in]     num_tokens          Number of tokens.
  *  \param[in]     num_experts         Number of experts.
  *  \param[in]     topk                Topk value.
  *  \param[in]     coeff               Coefficient.
@@ -105,7 +106,7 @@ void nvte_fused_scores_for_aux_loss_backward(const NVTETensor intermediate_outpu
  *  \param[in]     stream              CUDA stream used for the operation.
  */
 void nvte_fused_aux_loss_forward(const NVTETensor probs, const NVTETensor tokens_per_expert,
-                                 int num_tokens, int num_experts, int topk, float coeff,
+                                 int total_num_tokens, int num_tokens, int num_experts, int topk, float coeff,
                                  NVTETensor aux_loss, NVTETensor Const_buf, cudaStream_t stream);

/*! \brief Backward pass for auxiliary loss.
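To make the two counts concrete: `num_tokens` is the row count of `probs` (the Python wrapper below now derives it itself), while `total_num_tokens` is whatever token count the chosen aux-loss variant normalizes over. A loudly hypothetical illustration; the variant-to-count mapping is an assumption, not something this commit states:

```python
def pick_total_num_tokens(probs, variant, seq_len, global_batch_num_tokens):
    # num_tokens is always the local row count of probs.
    num_tokens = probs.size(0)
    # Assumption: seq-level aux loss normalizes per sequence, global aux loss
    # over the whole batch; both names here are illustrative only.
    total_num_tokens = seq_len if variant == "seq" else global_batch_num_tokens
    return num_tokens, total_num_tokens
```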

transformer_engine/pytorch/csrc/extensions.h

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ at::Tensor fused_scores_for_aux_loss_bwd(int num_tokens, int num_experts,
                                          int topk, std::string score_function);

 std::tuple<at::Tensor, at::Tensor> fused_aux_loss_fwd(at::Tensor probs,
-                                                      at::Tensor tokens_per_expert, int num_tokens,
-                                                      int num_experts, int topk, float coeff);
+                                                      at::Tensor tokens_per_expert, int total_num_tokens,
+                                                      int num_tokens, int num_experts, int topk, float coeff);

 at::Tensor fused_aux_loss_bwd(at::Tensor Const_buf, at::Tensor tokens_per_expert, int num_tokens,
                               int num_experts, at::Tensor grad_aux_loss);

transformer_engine/pytorch/csrc/extensions/pybind.cpp

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("num_experts"), py::arg("intermediate_output"), py::arg("grad_scores"),
         py::arg("topk"), py::arg("score_function"), "Fused topk softmax bwd");
   m.def("fused_aux_loss_fwd", &transformer_engine::pytorch::fused_aux_loss_fwd, py::arg("probs"),
-        py::arg("tokens_per_expert"), py::arg("num_tokens"), py::arg("num_experts"),
+        py::arg("tokens_per_expert"), py::arg("total_num_tokens"), py::arg("num_tokens"), py::arg("num_experts"),
         py::arg("topk"), py::arg("coeff"), "Fused aux loss fwd");
   m.def("fused_aux_loss_bwd", &transformer_engine::pytorch::fused_aux_loss_bwd,
         py::arg("Const_buf"), py::arg("tokens_per_expert"), py::arg("num_tokens"),

transformer_engine/pytorch/csrc/extensions/router.cpp

Lines changed: 6 additions & 6 deletions
@@ -145,10 +145,10 @@ at::Tensor fused_scores_for_aux_loss_bwd(int num_tokens, int num_experts,
 }

 std::tuple<at::Tensor, at::Tensor> fused_aux_loss_fwd(at::Tensor probs,
-                                                      at::Tensor tokens_per_expert, int num_tokens,
-                                                      int num_experts, int topk, float coeff) {
+                                                      at::Tensor tokens_per_expert, int total_num_tokens,
+                                                      int num_tokens, int num_experts, int topk, float coeff) {
   TORCH_CHECK(topk > 0, "topk must be greater than 0");
-  TORCH_CHECK(num_tokens > 0, "num_tokens must be greater than 0");
+  TORCH_CHECK(total_num_tokens > 0, "total_num_tokens must be greater than 0");
   TORCH_CHECK(num_experts > 0, "num_experts must be greater than 0");

   // Create the output tensor
@@ -160,15 +160,15 @@ std::tuple<at::Tensor, at::Tensor> fused_aux_loss_fwd(at::Tensor probs,
   auto aux_loss_cu = makeTransformerEngineTensor(aux_loss);
   auto Const_buf_cu = makeTransformerEngineTensor(Const_buf);

-  nvte_fused_aux_loss_forward(probs_cu.data(), tokens_per_expert_cu.data(), num_tokens, num_experts,
+  nvte_fused_aux_loss_forward(probs_cu.data(), tokens_per_expert_cu.data(), total_num_tokens, num_tokens, num_experts,
                               topk, coeff, aux_loss_cu.data(), Const_buf_cu.data(),
                               at::cuda::getCurrentCUDAStream());

   return std::make_tuple(aux_loss, Const_buf);
 }

-at::Tensor fused_aux_loss_bwd(at::Tensor Const_buf, at::Tensor tokens_per_expert, int num_tokens,
-                              int num_experts, at::Tensor grad_aux_loss) {
+at::Tensor fused_aux_loss_bwd(at::Tensor Const_buf, at::Tensor tokens_per_expert,
+                              int num_tokens, int num_experts, at::Tensor grad_aux_loss) {
   // Create the output tensor
   at::Tensor grad_probs = at::empty({num_tokens, num_experts},
                                     at::dtype(grad_aux_loss.scalar_type()).device(at::kCUDA));
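A side note on why `fused_aux_loss_bwd` keeps only the local `num_tokens`: it fixes the shape of `grad_probs`, and the gradient needs `total_num_tokens` only through the cached coefficient. A hedged sketch derived from the forward formula above; the fused backward kernel's internals are not shown in this diff:

```python
import torch

def aux_loss_backward_reference(grad_aux_loss, Const_buf, tokens_per_expert,
                                num_tokens, num_experts):
    # From the forward formula, d(aux_loss)/d(probs[t, e]) = C_coeff * tokens_per_expert[e]
    # for every row t, so total_num_tokens enters only via Const_buf (= C_coeff).
    grad_row = grad_aux_loss * Const_buf * tokens_per_expert.double()  # [num_experts]
    return grad_row.expand(num_tokens, num_experts).to(grad_aux_loss.dtype)
```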

transformer_engine/pytorch/router_func.py renamed to transformer_engine/pytorch/router.py

Lines changed: 5 additions & 3 deletions
@@ -125,14 +125,16 @@ def forward(
         ctx,
         probs: torch.Tensor,
         tokens_per_expert: torch.Tensor,
-        num_tokens: int,
+        total_num_tokens: int,
         num_experts: int,
         topk: int,
         coeff: float,
     ):
+        num_tokens = probs.size(0)
         aux_loss, Const_buf = tex.fused_aux_loss_fwd(
             probs=probs,
             tokens_per_expert=tokens_per_expert,
+            total_num_tokens=total_num_tokens,
             num_tokens=num_tokens,
             num_experts=num_experts,
             topk=topk,
@@ -159,9 +161,9 @@ def backward(ctx, grad_aux_loss):
 def fused_aux_loss(
     probs: torch.Tensor,
     tokens_per_expert: torch.Tensor,
-    num_tokens: int,
+    total_num_tokens: int,
     num_experts: int,
     topk: int,
     coeff: float,
 ):
-    return FusedAuxLoss.apply(probs, tokens_per_expert, num_tokens, num_experts, topk, coeff)
+    return FusedAuxLoss.apply(probs, tokens_per_expert, total_num_tokens, num_experts, topk, coeff)
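End-to-end usage after this commit, mirroring the updated test above. The sizes and the way `tokens_per_expert` is filled are illustrative assumptions; in real code it comes from the routing map:

```python
import torch
from transformer_engine.pytorch.router import fused_aux_loss

num_tokens, num_experts, topk = 8, 4, 2  # illustrative sizes
probs = torch.rand(num_tokens, num_experts, device="cuda").softmax(dim=-1)
# Illustrative per-expert routed-token counts (normally derived from the routing map).
tokens_per_expert = torch.full((num_experts,), num_tokens * topk // num_experts,
                               dtype=torch.int32, device="cuda")

aux_loss = fused_aux_loss(
    probs=probs,
    tokens_per_expert=tokens_per_expert,
    total_num_tokens=num_tokens,  # single-rank case: local and total counts coincide
    num_experts=num_experts,
    topk=topk,
    coeff=1e-2,
)
```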
