namespace transformer_engine {

- template <typename DType, typename IndexType>
- __global__ void fused_aux_loss_forward_kernel(
- DType* probs,
- IndexType* tokens_per_expert,
- int num_tokens,
- int num_experts,
- int topk,
- float coeff,
- DType* aux_loss,
- float * Const_buf
- ){
- int warp_num = blockDim.x / kThreadsPerWarp;
- int warp_id = threadIdx.x / kThreadsPerWarp;
- int lane_id = threadIdx.x % kThreadsPerWarp;
- extern __shared__ DType aggregated_probs_per_expert[];
- // Clear the shmem
- for (int i = threadIdx.x; i < num_experts; i += blockDim.x) {
- aggregated_probs_per_expert[i] = 0;
- }
- __syncthreads();
-
- /**
+ template <typename DType, typename IndexType>
+ __global__ void fused_aux_loss_forward_kernel(DType* probs, IndexType* tokens_per_expert,
+ int num_tokens, int num_experts, int topk,
+ float coeff, DType* aux_loss, float* Const_buf) {
+ int warp_num = blockDim.x / kThreadsPerWarp;
+ int warp_id = threadIdx.x / kThreadsPerWarp;
+ int lane_id = threadIdx.x % kThreadsPerWarp;
+ extern __shared__ DType aggregated_probs_per_expert[];
+ // Clear the shmem
+ for (int i = threadIdx.x; i < num_experts; i += blockDim.x) {
+ aggregated_probs_per_expert[i] = 0;
+ }
+ __syncthreads();
+
+ /**
* Section: Reduce the probs to the aggregated_probs_per_expert
*/
- // Loop: for all positions in each row
- for (int i = lane_id; i < num_experts; i += kThreadsPerWarp) {
- DType tmp = 0;
- // Loop: for all rows that this warp is responsible for
- for (int j = warp_id; j < num_tokens; j += warp_num) {
- tmp += probs[j * num_experts + i];
- }
- atomicAdd(&aggregated_probs_per_expert[i], tmp);
+ // Loop: for all positions in each row
+ for (int i = lane_id; i < num_experts; i += kThreadsPerWarp) {
+ DType tmp = 0;
+ // Loop: for all rows that this warp is responsible for
+ for (int j = warp_id; j < num_tokens; j += warp_num) {
+ tmp += probs[j * num_experts + i];
}
- __syncthreads();
+ atomicAdd(&aggregated_probs_per_expert[i], tmp);
+ }
+ __syncthreads();

- /**
+ /**
* Section: aggregated_probs_per_expert * tokens_per_expert
* In-place update on shmem
*/
- for (int i = threadIdx.x; i < num_experts; i += blockDim.x) {
- aggregated_probs_per_expert[i] *= tokens_per_expert[i];
- }
- __syncthreads();
+ for (int i = threadIdx.x; i < num_experts; i += blockDim.x) {
+ aggregated_probs_per_expert[i] *= tokens_per_expert[i];
+ }
+ __syncthreads();

- if (warp_id == 0) {
- /**
+ if (warp_id == 0) {
+ /**
* Section: Reduce to get the sum of aggregated_probs_per_expert
*/
- DType intermediate_result = warp_reduce_on_shmem(aggregated_probs_per_expert, num_experts, sum, lane_id);
- __syncwarp();
+ DType intermediate_result =
+ warp_reduce_on_shmem(aggregated_probs_per_expert, num_experts, sum, lane_id);
+ __syncwarp();

- if (lane_id == 0) {
- /**
+ if (lane_id == 0) {
+ /**
* Section: Compute the aux_loss
*/
- float C_coeff = (num_experts * coeff) / topk / num_tokens / num_tokens;
- aux_loss[0] = intermediate_result * C_coeff;
- Const_buf[0] = C_coeff;
- }
+ float C_coeff = (num_experts * coeff) / topk / num_tokens / num_tokens;
+ aux_loss[0] = intermediate_result * C_coeff;
+ Const_buf[0] = C_coeff;
}
+ }
}

- template <typename DType, typename IndexType>
- void fused_aux_loss_forward_kernel_launcher(
- DType* probs,
- IndexType* tokens_per_expert,
- int num_tokens,
- int num_experts,
- int topk,
- float coeff,
- DType* aux_loss,
- float * Const_buf,
- cudaStream_t stream
- ){
- // Meta data for the kernel
- size_t shared_memory_size = sizeof(DType) * num_experts * 2;
- // Use Only 1 block/1024 threads to avoid the grid sync
- int grid_size = 1;
- int block_size = 1024;
- fused_aux_loss_forward_kernel<DType, IndexType><<<grid_size, block_size, shared_memory_size, stream>>>(
- probs,
- tokens_per_expert,
- num_tokens,
- num_experts,
- topk,
- coeff,
- aux_loss,
- Const_buf
- );
+ template <typename DType, typename IndexType>
+ void fused_aux_loss_forward_kernel_launcher(DType* probs, IndexType* tokens_per_expert,
+ int num_tokens, int num_experts, int topk, float coeff,
+ DType* aux_loss, float* Const_buf,
+ cudaStream_t stream) {
+ // Meta data for the kernel
+ size_t shared_memory_size = sizeof(DType) * num_experts * 2;
+ // Use Only 1 block/1024 threads to avoid the grid sync
+ int grid_size = 1;
+ int block_size = 1024;
+ fused_aux_loss_forward_kernel<DType, IndexType>
+ <<<grid_size, block_size, shared_memory_size, stream>>>(
+ probs, tokens_per_expert, num_tokens, num_experts, topk, coeff, aux_loss, Const_buf);
}

- void fused_aux_loss_forward(
- Tensor * probs,
- Tensor * tokens_per_expert,
- int num_tokens,
- int num_experts,
- int topk,
- float coeff,
- Tensor * aux_loss,
- Tensor * Const_buf,
- cudaStream_t stream
- ){
- TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
- probs->data.dtype, DType,
- TRANSFORMER_ENGINE_TYPE_SWITCH_INDEX(
- tokens_per_expert->data.dtype, IndexType,
- fused_aux_loss_forward_kernel_launcher<DType, IndexType>(
- reinterpret_cast<DType *>(probs->data.dptr),
- reinterpret_cast<IndexType *>(tokens_per_expert->data.dptr),
- num_tokens,
- num_experts,
- topk,
- coeff,
- reinterpret_cast<DType *>(aux_loss->data.dptr),
- reinterpret_cast<float *>(Const_buf->data.dptr),
- stream
- );
- );
- );
+ void fused_aux_loss_forward(Tensor* probs, Tensor* tokens_per_expert, int num_tokens,
+ int num_experts, int topk, float coeff, Tensor* aux_loss,
+ Tensor* Const_buf, cudaStream_t stream) {
+ TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
+ probs->data.dtype, DType,
+ TRANSFORMER_ENGINE_TYPE_SWITCH_INDEX(
+ tokens_per_expert->data.dtype, IndexType,
+ fused_aux_loss_forward_kernel_launcher<DType, IndexType>(
+ reinterpret_cast<DType*>(probs->data.dptr),
+ reinterpret_cast<IndexType*>(tokens_per_expert->data.dptr), num_tokens, num_experts,
+ topk, coeff, reinterpret_cast<DType*>(aux_loss->data.dptr),
+ reinterpret_cast<float*>(Const_buf->data.dptr), stream);););
}

- template <typename DType, typename IndexType>
- __global__ void fused_aux_loss_backward_kernel(
- float * Const_buf,
- IndexType* tokens_per_expert,
- int num_tokens,
- int num_experts,
- DType* grad_aux_loss,
- DType* grad_probs
- ){
- int global_warp_num = gridDim.x * blockDim.x / kThreadsPerWarp;
- int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kThreadsPerWarp;
- int lane_id = threadIdx.x % kThreadsPerWarp;
-
- // Loop: for all positions in each row
- for (int i = lane_id; i < num_experts; i += kThreadsPerWarp) {
- DType C_coeff = Const_buf[0];
- IndexType tokens_per_expert_i = tokens_per_expert[i];
- DType grad_aux_loss_value = grad_aux_loss[0];
- // Loop: for all rows
- for (int j = global_warp_id; j < num_tokens; j += global_warp_num) {
- grad_probs[j * num_experts + i] = C_coeff * tokens_per_expert_i * grad_aux_loss_value;
- }
+ template <typename DType, typename IndexType>
+ __global__ void fused_aux_loss_backward_kernel(float* Const_buf, IndexType* tokens_per_expert,
+ int num_tokens, int num_experts,
+ DType* grad_aux_loss, DType* grad_probs) {
+ int global_warp_num = gridDim.x * blockDim.x / kThreadsPerWarp;
+ int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kThreadsPerWarp;
+ int lane_id = threadIdx.x % kThreadsPerWarp;
+
+ // Loop: for all positions in each row
+ for (int i = lane_id; i < num_experts; i += kThreadsPerWarp) {
+ DType C_coeff = Const_buf[0];
+ IndexType tokens_per_expert_i = tokens_per_expert[i];
+ DType grad_aux_loss_value = grad_aux_loss[0];
+ // Loop: for all rows
+ for (int j = global_warp_id; j < num_tokens; j += global_warp_num) {
+ grad_probs[j * num_experts + i] = C_coeff * tokens_per_expert_i * grad_aux_loss_value;
}
+ }
}

- template <typename DType, typename IndexType>
- void fused_aux_loss_backward_kernel_launcher(
- float * Const_buf,
- IndexType* tokens_per_expert,
- int num_tokens,
- int num_experts,
- DType* grad_aux_loss,
- DType* grad_probs,
- cudaStream_t stream
- ){
- // Meta data for the kernel
- int block_size = 256;
- int grid_size = (num_tokens + block_size - 1) / block_size;
- fused_aux_loss_backward_kernel<DType, IndexType><<<grid_size, block_size, 0, stream>>>(
- Const_buf,
- tokens_per_expert,
- num_tokens,
- num_experts,
- grad_aux_loss,
- grad_probs
- );
+ template <typename DType, typename IndexType>
+ void fused_aux_loss_backward_kernel_launcher(float* Const_buf, IndexType* tokens_per_expert,
+ int num_tokens, int num_experts, DType* grad_aux_loss,
+ DType* grad_probs, cudaStream_t stream) {
+ // Meta data for the kernel
+ int block_size = 256;
+ int grid_size = (num_tokens + block_size - 1) / block_size;
+ fused_aux_loss_backward_kernel<DType, IndexType><<<grid_size, block_size, 0, stream>>>(
+ Const_buf, tokens_per_expert, num_tokens, num_experts, grad_aux_loss, grad_probs);
}

- void fused_aux_loss_backward(
- Tensor * Const_buf,
- Tensor * tokens_per_expert,
- int num_tokens,
- int num_experts,
- Tensor * grad_aux_loss,
- Tensor * grad_probs,
- cudaStream_t stream
- ){
- TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
- grad_aux_loss->data.dtype, DType,
- TRANSFORMER_ENGINE_TYPE_SWITCH_INDEX(
- tokens_per_expert->data.dtype, IndexType,
- fused_aux_loss_backward_kernel_launcher<DType, IndexType>(
- reinterpret_cast<float *>(Const_buf->data.dptr),
- reinterpret_cast<IndexType *>(tokens_per_expert->data.dptr),
- num_tokens,
- num_experts,
- reinterpret_cast<DType *>(grad_aux_loss->data.dptr),
- reinterpret_cast<DType *>(grad_probs->data.dptr),
- stream
- );
- );
- );
+ void fused_aux_loss_backward(Tensor* Const_buf, Tensor* tokens_per_expert, int num_tokens,
+ int num_experts, Tensor* grad_aux_loss, Tensor* grad_probs,
+ cudaStream_t stream) {
+ TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
+ grad_aux_loss->data.dtype, DType,
+ TRANSFORMER_ENGINE_TYPE_SWITCH_INDEX(
+ tokens_per_expert->data.dtype, IndexType,
+ fused_aux_loss_backward_kernel_launcher<DType, IndexType>(
+ reinterpret_cast<float*>(Const_buf->data.dptr),
+ reinterpret_cast<IndexType*>(tokens_per_expert->data.dptr), num_tokens, num_experts,
+ reinterpret_cast<DType*>(grad_aux_loss->data.dptr),
+ reinterpret_cast<DType*>(grad_probs->data.dptr), stream);););
}

- } // namespace transformer_engine
-
- void nvte_fused_aux_loss_forward(const NVTETensor probs, const NVTETensor tokens_per_expert, int num_tokens, int num_experts, int topk, float coeff, NVTETensor aux_loss, NVTETensor Const_buf, cudaStream_t stream){
- NVTE_API_CALL(nvte_fused_aux_loss_forward);
- using namespace transformer_engine;
- fused_aux_loss_forward(
- convertNVTETensorCheck(probs),
- convertNVTETensorCheck(tokens_per_expert),
- num_tokens,
- num_experts,
- topk,
- coeff,
- convertNVTETensorCheck(aux_loss),
- convertNVTETensorCheck(Const_buf),
- stream
- );
+ }  // namespace transformer_engine
+
+ void nvte_fused_aux_loss_forward(const NVTETensor probs, const NVTETensor tokens_per_expert,
+ int num_tokens, int num_experts, int topk, float coeff,
+ NVTETensor aux_loss, NVTETensor Const_buf, cudaStream_t stream) {
+ NVTE_API_CALL(nvte_fused_aux_loss_forward);
+ using namespace transformer_engine;
+ fused_aux_loss_forward(convertNVTETensorCheck(probs), convertNVTETensorCheck(tokens_per_expert),
+ num_tokens, num_experts, topk, coeff, convertNVTETensorCheck(aux_loss),
+ convertNVTETensorCheck(Const_buf), stream);
}

- void nvte_fused_aux_loss_backward(const NVTETensor Const_buf, const NVTETensor tokens_per_expert, int num_tokens, int num_experts, NVTETensor grad_aux_loss, NVTETensor grad_probs, cudaStream_t stream){
- NVTE_API_CALL(nvte_fused_aux_loss_backward);
- using namespace transformer_engine;
- fused_aux_loss_backward(
- convertNVTETensorCheck(Const_buf),
- convertNVTETensorCheck(tokens_per_expert),
- num_tokens,
- num_experts,
- convertNVTETensorCheck(grad_aux_loss),
- convertNVTETensorCheck(grad_probs),
- stream
- );
- }
+ void nvte_fused_aux_loss_backward(const NVTETensor Const_buf, const NVTETensor tokens_per_expert,
+ int num_tokens, int num_experts, NVTETensor grad_aux_loss,
+ NVTETensor grad_probs, cudaStream_t stream) {
+ NVTE_API_CALL(nvte_fused_aux_loss_backward);
+ using namespace transformer_engine;
+ fused_aux_loss_backward(convertNVTETensorCheck(Const_buf),
+ convertNVTETensorCheck(tokens_per_expert), num_tokens, num_experts,
+ convertNVTETensorCheck(grad_aux_loss), convertNVTETensorCheck(grad_probs),
+ stream);
+ }
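For readers following the math rather than the kernel mechanics: the forward kernel computes aux_loss = C_coeff * sum_e((sum_t probs[t][e]) * tokens_per_expert[e]), with C_coeff = num_experts * coeff / (topk * num_tokens^2), and stores C_coeff in Const_buf so the backward kernel can reuse it; the backward kernel then writes grad_probs[t][e] = C_coeff * tokens_per_expert[e] * grad_aux_loss for every position. The sketch below is a minimal CPU reference of that arithmetic only, not part of this diff and not the Transformer Engine API; the helper names aux_loss_forward_ref / aux_loss_backward_ref and the plain float/int buffers are illustrative assumptions.

#include <vector>

// CPU reference for the fused MoE aux-loss forward math shown above.
// probs is row-major [num_tokens x num_experts]; returns the scalar loss and
// writes the reusable constant into *C_coeff_out, mirroring Const_buf[0].
float aux_loss_forward_ref(const std::vector<float>& probs,
                           const std::vector<int>& tokens_per_expert, int num_tokens,
                           int num_experts, int topk, float coeff, float* C_coeff_out) {
  float acc = 0.0f;
  for (int e = 0; e < num_experts; ++e) {
    float col_sum = 0.0f;  // aggregated_probs_per_expert[e]
    for (int t = 0; t < num_tokens; ++t) col_sum += probs[t * num_experts + e];
    acc += col_sum * tokens_per_expert[e];
  }
  float C_coeff = (num_experts * coeff) / topk / num_tokens / num_tokens;
  *C_coeff_out = C_coeff;
  return acc * C_coeff;
}

// CPU reference for the backward math: every entry in column e receives the
// same gradient C_coeff * tokens_per_expert[e] * grad_aux_loss.
void aux_loss_backward_ref(float C_coeff, const std::vector<int>& tokens_per_expert,
                           int num_tokens, int num_experts, float grad_aux_loss,
                           std::vector<float>* grad_probs) {
  for (int t = 0; t < num_tokens; ++t) {
    for (int e = 0; e < num_experts; ++e) {
      (*grad_probs)[t * num_experts + e] = C_coeff * tokens_per_expert[e] * grad_aux_loss;
    }
  }
}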