From d10c25016736183a5d53d2150352931997a59982 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Tue, 7 Jan 2025 15:35:27 -0800
Subject: [PATCH] Add support for `int32_t` indices in TBE training (2B/N) (#3371)

Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/618

X-link: https://github.com/facebookresearch/FBGEMM/pull/462

- Fix `hash_size_cumsum` to be `int64_t` in `linearize_index_index_select_kernel` and `linearize_index_kernel`
- Enable more specializations for radix sort

Reviewed By: r-barnes

Differential Revision: D65923591
---
 .../fbgemm_gpu/split_embeddings_utils.cuh |  8 +++--
 .../radix_sort_pairs.cu                   |  8 +++--
 .../transpose_embedding_input.cu          | 32 +++++++++----------
 3 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh
index 8351e046c2..9e6130f408 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh
@@ -59,9 +59,13 @@ transpose_embedding_input(
       int end_bit = sizeof(KeyT) * 8, \
       cudaStream_t stream = 0)
 
+DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t);
+DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t);
 DECL_RADIX_SORT_PAIRS_FN(int64_t, float);
 DECL_RADIX_SORT_PAIRS_FN(int64_t, double);
-DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t);
-DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t);
+DECL_RADIX_SORT_PAIRS_FN(int32_t, int32_t);
+DECL_RADIX_SORT_PAIRS_FN(int32_t, int64_t);
+DECL_RADIX_SORT_PAIRS_FN(int32_t, float);
+DECL_RADIX_SORT_PAIRS_FN(int32_t, double);
 
 #undef DECL_RADIX_SORT_PAIRS_FN
diff --git a/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu b/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu
index 50d9757d25..93dab81a4f 100644
--- a/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu
+++ b/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu
@@ -77,7 +77,11 @@ using namespace fbgemm_gpu;
   }
 #endif
 
+DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t);
+DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t);
 DEF_RADIX_SORT_PAIRS_FN(int64_t, float);
 DEF_RADIX_SORT_PAIRS_FN(int64_t, double);
-DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t);
-DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t);
+DEF_RADIX_SORT_PAIRS_FN(int32_t, int32_t);
+DEF_RADIX_SORT_PAIRS_FN(int32_t, int64_t);
+DEF_RADIX_SORT_PAIRS_FN(int32_t, float);
+DEF_RADIX_SORT_PAIRS_FN(int32_t, double);
diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
index 83a06d78a0..ea2ea7d662 100644
--- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
+++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
@@ -63,7 +63,7 @@ inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) {
 
 template
 __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel(
-    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
         hash_size_cumsum,
     const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         indices,
@@ -79,7 +79,7 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel(
     // Use a raw pointer to avoid creating dummy PackedTensorAccessor
     const uint32_t* const __restrict__ vbe_b_t_map,
     FixedDivisor fd) {
-  const int32_t T = hash_size_cumsum.size(0) - 1;
+  const auto T = hash_size_cumsum.size(0) - 1;
   auto b_t = blockIdx.x * blockDim.x + threadIdx.x;
   int32_t b;
   int32_t t;
@@ -97,21 +97,20 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel(
   }
 
   const index_t hash_offset = valid ? hash_size_cumsum[t] : -1;
-  const index_t indices_start = valid ? offsets[b_t] : -1;
-  const int32_t L = valid ? offsets[b_t + 1] - indices_start : 0;
+  const auto indices_start = valid ? offsets[b_t] : -1;
+  const auto L = valid ? offsets[b_t + 1] - indices_start : 0;
   const int32_t lane_id = threadIdx.x % fbgemm_gpu::kWarpSize;
 
   // Compile-time conditional
   if (nobag) {
     for (int32_t j = 0; j < fbgemm_gpu::kWarpSize; ++j) {
-      const index_t indices_start_warp =
-          fbgemm_gpu::shfl_sync(indices_start, j);
-      const int32_t t_warp = fbgemm_gpu::shfl_sync(t, j);
-      const int32_t L_warp = fbgemm_gpu::shfl_sync(L, j);
+      const auto indices_start_warp = fbgemm_gpu::shfl_sync(indices_start, j);
+      const auto t_warp = fbgemm_gpu::shfl_sync(t, j);
+      const auto L_warp = fbgemm_gpu::shfl_sync(L, j);
       const index_t hash_offset_warp = fbgemm_gpu::shfl_sync(hash_offset, j);
       for (int32_t i = lane_id; i < L_warp; i += fbgemm_gpu::kWarpSize) {
         const index_t idx = __ldg(&indices[indices_start_warp + i]);
-        const int64_t l_t = (indices_start_warp + i) * T + t_warp;
+        const auto l_t = (indices_start_warp + i) * T + t_warp;
         infos[indices_start_warp + i] = l_t;
         linear_indices[indices_start_warp + i] = hash_offset_warp + idx;
       }
@@ -124,10 +123,9 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel(
           reinterpret_cast<uint32_t*>(&b)[0];
     }
     for (int32_t j = 0; j < fbgemm_gpu::kWarpSize; ++j) {
-      const index_t indices_start_warp =
-          fbgemm_gpu::shfl_sync(indices_start, j);
-      const uint32_t info_warp = fbgemm_gpu::shfl_sync(info, j);
-      const int32_t L_warp = fbgemm_gpu::shfl_sync(L, j);
+      const auto indices_start_warp = fbgemm_gpu::shfl_sync(indices_start, j);
+      const auto info_warp = fbgemm_gpu::shfl_sync(info, j);
+      const auto L_warp = fbgemm_gpu::shfl_sync(L, j);
       const index_t hash_offset_warp = fbgemm_gpu::shfl_sync(hash_offset, j);
       for (int32_t i = lane_id; i < L_warp; i += fbgemm_gpu::kWarpSize) {
         const index_t idx = __ldg(&indices[indices_start_warp + i]);
@@ -142,7 +140,7 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel(
 template
 __global__
 __launch_bounds__(kMaxThreads) void linearize_index_index_select_kernel(
-    const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+    const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
         hash_size_cumsum,
     const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
         indices,
@@ -153,7 +151,7 @@ __launch_bounds__(kMaxThreads) void linearize_index_index_select_kernel(
         linear_indices,
     FixedDivisor fd,
     int32_t fixed_L_per_warp) {
-  const int32_t T = hash_size_cumsum.size(0) - 1;
+  const auto T = hash_size_cumsum.size(0) - 1;
   auto b_t = blockIdx.x * blockDim.x + threadIdx.x;
   int32_t b;
   int32_t t;
@@ -258,7 +256,7 @@ transpose_embedding_input(
           kMaxThreads, \
           0, \
           at::cuda::getCurrentCUDAStream()>>>( \
-          MAKE_PTA_WITH_NAME(func_name, hash_size_cumsum, index_t, 1, 32), \
+          MAKE_PTA_WITH_NAME(func_name, hash_size_cumsum, int64_t, 1, 32), \
          MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), \
          MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32), \
          MAKE_PTA_WITH_NAME(func_name, infos, INFO_ACC_T, 1, 32), \
@@ -296,7 +294,7 @@ transpose_embedding_input(
             0,
             at::cuda::getCurrentCUDAStream()>>>(
             MAKE_PTA_WITH_NAME(
-                func_name, hash_size_cumsum, index_t, 1, 32),
+                func_name, hash_size_cumsum, int64_t, 1, 32),
             MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
             MAKE_PTA_WITH_NAME(
                 func_name, total_L_offsets.value(), index_t, 1, 32),
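
Note on the new radix-sort specializations: the DECL/DEF_RADIX_SORT_PAIRS_FN pairs above declare and define radix_sort_pairs overloads for the added (KeyT, ValueT) combinations, including int32_t keys. The sketch below is illustrative only: it shows the standard two-pass cub::DeviceRadixSort::SortPairs pattern that such a (KeyT = int32_t, ValueT = int32_t) specialization is assumed to forward to. The function name and buffers here are hypothetical and are not FBGEMM's API.

// Illustrative sketch, not FBGEMM code: two-pass CUB radix sort of
// (int32_t key, int32_t value) pairs. Buffer names are hypothetical.
#include <cstdint>
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void sort_pairs_int32_example(
    const int32_t* d_keys_in,   // e.g., linearized embedding indices
    int32_t* d_keys_out,
    const int32_t* d_values_in, // e.g., per-index info values
    int32_t* d_values_out,
    int num_items,
    int end_bit,                // sort only the low bits that can be set
    cudaStream_t stream) {
  // Pass 1: query the required temporary-storage size (no sorting happens).
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortPairs(
      d_temp_storage, temp_storage_bytes,
      d_keys_in, d_keys_out, d_values_in, d_values_out,
      num_items, /*begin_bit=*/0, end_bit, stream);

  // Pass 2: allocate scratch space and run the actual sort.
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceRadixSort::SortPairs(
      d_temp_storage, temp_storage_bytes,
      d_keys_in, d_keys_out, d_values_in, d_values_out,
      num_items, /*begin_bit=*/0, end_bit, stream);
  cudaFree(d_temp_storage);
}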