From c50f7b5964bec2fc70f549d1afb49b096f0b05cd Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 22 Feb 2024 03:46:44 +0000
Subject: [PATCH] out-of-place allgather

---
 apps/nccl/src/nccl.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
index 5b84fb095..f4cb31db2 100644
--- a/apps/nccl/src/nccl.cu
+++ b/apps/nccl/src/nccl.cu
@@ -485,6 +485,8 @@ __global__ void __launch_bounds__(1024, 1)
 template <typename T>
 cudaError_t allgather(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems,
                       cudaStream_t stream) {
+  cudaError_t err = cudaMemcpyAsync(resultBuff + nelems * rank, buff, nelems * sizeof(T), cudaMemcpyDeviceToDevice, stream);
+  if (err != cudaSuccess) return err;
   allgather5<<<24, 1024, 0, stream>>>(rank, worldSize, nRanksPerNode, nelems);
   return cudaGetLastError();
 }