Backport cudaMemcpyAsync bugfix for NCCL 2.18.3

easybuilders · Jul 10, 2024 · 6b4eaa0 · 6b4eaa0
1 parent 27e9782
commit 6b4eaa0
Show file tree

Hide file tree

Showing 2 changed files with 179 additions and 1 deletion.
diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3-GCCcore-12.3.0-CUDA-12.1.1.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3-GCCcore-12.3.0-CUDA-12.1.1.eb
@@ -11,11 +11,15 @@ toolchain = {'name': 'GCCcore', 'version': '12.3.0'}
 github_account = 'NVIDIA'
 source_urls = [GITHUB_SOURCE]
 sources = ['v%(version)s-1.tar.gz']
-patches = ['NCCL-2.16.2_fix-cpuid.patch']
+patches = [
+    'NCCL-2.16.2_fix-cpuid.patch',
+    'NCCL-2.18.3_fix-cudaMemcpyAsync.patch',
+]
 checksums = [
     ('6477d83c9edbb34a0ebce6d751a1b32962bc6415d75d04972b676c6894ceaef9',
      'b4f5d7d9eea2c12e32e7a06fe138b2cfc75969c6d5c473aa6f819a792db2fc96'),
     {'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
+    {'NCCL-2.18.3_fix-cudaMemcpyAsync.patch': '7dc8d0d1b78e4f8acefbc400860f47432ef67c225b50d73c732999c23483de90'},
 ]
 
 builddependencies = [('binutils', '2.40')]

diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3_fix-cudaMemcpyAsync.patch b/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3_fix-cudaMemcpyAsync.patch
@@ -0,0 +1,174 @@
+Backported from (original message attached below):
+https://github.com/NVIDIA/nccl/commit/4365458757e4107ecbf629b2fd6e0e19a5d237c2
+
+From: Kaiming Ouyang <[email protected]>
+Date: Wed, 20 Sep 2023 05:51:14 -0700
+Subject: [PATCH] Fix cudaMemcpyAsync bug
+
+We are trying to use the copy result of first cudaMemcpyAsync in the
+second cudaMemcpyAsync without sync in between. This patch fixes it
+by allocating a CPU side array to cache device side addr so that we
+can avoid this consecutive cuda mem copy.
+
+Fixes #957
+---
+ src/channel.cc        | 12 ++++++++++++
+ src/include/comm.h    |  2 ++
+ src/init.cc           |  2 +-
+ src/transport.cc      |  8 ++------
+ src/transport/nvls.cc | 10 ++++------
+ 5 files changed, 21 insertions(+), 13 deletions(-)
+
+diff --git a/src/channel.cc b/src/channel.cc
+index 3edcc2f..245dfd5 100644
+--- a/src/channel.cc
++++ b/src/channel.cc
+@@ -42,9 +42,11 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
+     /* channel->devPeers is not shared, so just free it when calling commFree() */
+     NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
+     ncclCommPushCudaFree(comm, channel->devPeers);
++    NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers));
+     for (int r = 0; r < nRanks; r++) {
+       uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
+       NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
++      channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr;
+     }
+   }
+
+@@ -52,6 +54,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
+   NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
+   ncclCommPushCudaFree(comm, channel->devRingUserRanks);
+
++  /* guarantee addr has been copied into channel->devPeers */
++  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+
+   return ncclSuccess;
+@@ -77,6 +81,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
+       uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
+       channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
+       NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
++      channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
+       ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
+     }
+   } else {
+@@ -86,10 +91,12 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
+       uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
+       channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
+       NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
++      channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
+       ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount);
+     }
+   }
+
++  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+
+   return ncclSuccess;
+@@ -114,6 +121,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
+     addr = (uintptr_t)parent->channels[channelId].collnetDevPeers;
+     channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers;
+     NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
++    channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
+     ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount);
+   } else {
+     NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1));
+@@ -121,9 +129,11 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
+     addr = (uintptr_t)channel->collnetDevPeers;
+     channel->peers[comm->nRanks] = channel->collnetPeers;
+     NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
++    channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
+     ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount);
+   }
+
++  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
+   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
+
+   return ncclSuccess;
+@@ -156,5 +166,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRa
+       }
+     }
+   }
++
++  free(channel->devPeersHostPtr);
+   return ncclSuccess;
+ }
+diff --git a/src/include/comm.h b/src/include/comm.h
+index e79bf54..8986f93 100644
+--- a/src/include/comm.h
++++ b/src/include/comm.h
+@@ -124,6 +124,8 @@ struct ncclSharedResources {
+ struct ncclChannel {
+   struct ncclChannelPeer** peers;
+   struct ncclDevChannelPeer** devPeers;
++  /* devPeer pointer array used for host side access */
++  struct ncclDevChannelPeer** devPeersHostPtr;
+   struct ncclRing ring;
+   int* devRingUserRanks;
+   struct ncclTree tree;
+diff --git a/src/init.cc b/src/init.cc
+index 1ea1d7e..309ce10 100644
+--- a/src/init.cc
++++ b/src/init.cc
+@@ -437,7 +437,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
+
+   NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+ exit:
+-  CUDACHECK(cudaStreamSynchronize(comm->sharedRes->deviceStream.cudaStream));
++  NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream));
+   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream));
+   return ret;
+ fail:
+diff --git a/src/transport.cc b/src/transport.cc
+index f4b8a2a..9817beb 100644
+--- a/src/transport.cc
++++ b/src/transport.cc
+@@ -147,11 +147,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
+             if (conn->connected == 0) {
+               NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[i] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
+               if (ret == ncclSuccess) {
+-                struct ncclDevChannelPeer* addr;
+                 conn->connected = 1;
+                 /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
+-                CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[sendPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
+-                CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
++                CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
+               } else if (ret == ncclInProgress) {
+                 allChannelsConnected = false;
+               }
+@@ -167,11 +165,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
+             if (conn->connected == 0) {
+               NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[i] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
+               if (ret == ncclSuccess) {
+-                struct ncclDevChannelPeer* addr;
+                 conn->connected = 1;
+                 /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
+-                CUDACHECKGOTO(cudaMemcpyAsync(&addr, &comm->channels[c].devPeers[recvPeer], sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), ret, fail);
+-                CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
++                CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
+               } else if (ret == ncclInProgress) {
+                 allChannelsConnected = false;
+               }
+diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
+index 633cb04..07be99d 100644
+--- a/src/transport/nvls.cc
++++ b/src/transport/nvls.cc
+@@ -359,12 +359,10 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
+         peer->send[0].conn.tail = (uint64_t*)(mem + buffSize + memSize / 2);
+         peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL;
+
+-        struct ncclDevChannelPeer* addr;
+-        CUDACHECKGOTO(cudaMemcpyAsync(&addr, comm->channels[c].devPeers + nvlsPeer, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+-        CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+-        CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+-        CUDACHECKGOTO(cudaMemcpyAsync(&addr->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+-        CUDACHECKGOTO(cudaMemcpyAsync(&addr->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
++        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
++        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
++        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
++        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, cleanup);
+
+         /*INFO(NCCL_INIT|NCCL_NVLS, "Peer %d Channel %d MC buff %p/%p UC Buff %p/%p",
+             nvlsPeer, c,
+--
+2.45.2