From 3521fb0280a2e561aac17c007eeca2f0a413aaa0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 14 Nov 2023 09:28:48 +0800 Subject: [PATCH] Clear minor warnings (#214) Clear warnings from the clang compiler. --- include/mscclpp/core.hpp | 5 -- python/test/_cpp/proxy_test.cpp | 2 +- test/allgather_test_cpp.cu | 46 +++++--------- test/allgather_test_host_offloading.cu | 82 +++++++++++-------------- test/mscclpp-test/allgather_test.cu | 27 ++++---- test/mscclpp-test/allreduce_test.cu | 85 ++++++++++++-------------- test/mscclpp-test/alltoall_test.cu | 8 +-- test/mscclpp-test/common.cc | 2 +- test/mscclpp-test/sendrecv_test.cu | 17 +++--- 9 files changed, 119 insertions(+), 155 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 09b93b248..03eb8cc69 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -551,11 +551,6 @@ class NonblockingFuture { /// @param future The shared future to move. NonblockingFuture(std::shared_future&& future) : future(std::move(future)) {} - /// Copy constructor. - /// - /// @param other The @ref NonblockingFuture to copy. - NonblockingFuture(const NonblockingFuture& other) = default; - /// Check if the value is ready to be retrieved. /// /// @return True if the value is ready, false otherwise. diff --git a/python/test/_cpp/proxy_test.cpp b/python/test/_cpp/proxy_test.cpp index e44f0f6f5..4a1a0f754 100644 --- a/python/test/_cpp/proxy_test.cpp +++ b/python/test/_cpp/proxy_test.cpp @@ -39,7 +39,7 @@ class MyProxyService { semaphores_(semaphores), proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { int cudaDevice; - cudaGetDevice(&cudaDevice); + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice)); deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice); } diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu index fb6f1a132..fcebd7590 100644 --- a/test/allgather_test_cpp.cu +++ b/test/allgather_test_cpp.cu @@ -18,18 +18,6 @@ static int nranksPerNode = 8; -// Propagate errors up - -#define MSCCLPPCHECK(call) \ - do { \ - mscclppResult_t res = call; \ - if (res != mscclppSuccess && res != mscclppInProgress) { \ - /* Print the back trace*/ \ - printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \ - return res; \ - } \ - } while (0) - // Check CUDA RT calls #define CUDACHECK(cmd) \ do { \ @@ -54,8 +42,7 @@ template using DeviceHandle = mscclpp::DeviceHandle; __constant__ DeviceHandle constProxyChans[16]; -__device__ void allgather0(DeviceHandle proxyChan, int rank, int world_size, - int remoteRank, size_t nelemsPerGPU) { +__device__ void allgather0(DeviceHandle proxyChan, int rank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall // this thread's role is a sender role @@ -70,8 +57,8 @@ __device__ void allgather0(DeviceHandle proxyChan, if ((threadIdx.x % 32) == 0) proxyChan.wait(); } -__device__ void localAllGather(DeviceHandle proxyChan, int rank, int world_size, - int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size) { +__device__ void localAllGather(DeviceHandle proxyChan, int rank, int nranksPerNode, + int remoteRank, uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode // and waits for data from GPU rank (i-1) % nranksPerNode @@ -91,9 +78,9 @@ __device__ void localAllGather(DeviceHandle proxyCh } } -__device__ void allgather1(DeviceHandle proxyChan, int rank, int 
world_size, - int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { - localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), +__device__ void allgather1(DeviceHandle proxyChan, int rank, int nranksPerNode, + int remoteRank, size_t nelemsPerGPU) { + localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); if (remoteRank / nranksPerNode == rank / nranksPerNode) if ((threadIdx.x % 32) == 0) proxyChan.flush(); @@ -116,7 +103,7 @@ __device__ void allgather2(DeviceHandle proxyChan, // Step 1 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } // cross-node exchange @@ -134,7 +121,7 @@ __device__ void allgather2(DeviceHandle proxyChan, // local allgather int otherNghr = (rank + nranksPerNode) % world_size; if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + localAllGather(proxyChan, rank, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); } @@ -152,7 +139,7 @@ __device__ void allgather2(DeviceHandle proxyChan, // Step 3 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, + localAllGather(proxyChan, rank, nranksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } @@ -170,9 +157,9 @@ __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelem DeviceHandle proxyChan = constProxyChans[warpId]; if (kernel == 0) - allgather0(proxyChan, rank, world_size, remoteRank, nelemsPerGPU); + allgather0(proxyChan, rank, nelemsPerGPU); else if (kernel == 1) - allgather1(proxyChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); + allgather1(proxyChan, rank, nranksPerNode, remoteRank, nelemsPerGPU); else if (kernel == 2) allgather2(proxyChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); } @@ -388,7 +375,6 @@ int main(int argc, const char* argv[]) { } ip_port = (char*)parsedArgs["ip_port"].c_str(); - int thisNode = rankToNode(rank); int cudaNum = rankToLocalRank(rank); CUDACHECK(cudaSetDevice(cudaNum)); @@ -452,19 +438,19 @@ int main(int argc, const char* argv[]) { if (rank == 0) printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter); cudaGraph_t graph; cudaGraphExec_t instance; - cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + CUDACHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); for (int i = 0; i < cudagraphiter; ++i) { kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum); } - cudaStreamEndCapture(stream, &graph); - cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + CUDACHECK(cudaStreamEndCapture(stream, &graph)); + CUDACHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0)); int cudagraphwarmup = 10; if (rank == 0) printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, cudagraphiter); for (int i = 0; i < 
cudagraphwarmup; ++i) { - cudaGraphLaunch(instance, stream); + CUDACHECK(cudaGraphLaunch(instance, stream)); } CUDACHECK(cudaStreamSynchronize(stream)); @@ -477,7 +463,7 @@ int main(int argc, const char* argv[]) { double t0, t1, ms, time_in_us; t0 = getTime(); for (int i = 0; i < cudagraphlaunch; ++i) { - cudaGraphLaunch(instance, stream); + CUDACHECK(cudaGraphLaunch(instance, stream)); } CUDACHECK(cudaStreamSynchronize(stream)); diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index a71b48546..76e6b6311 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -23,18 +23,6 @@ int nranksPerNode; int rank; int world_size; -// Propagate errors up - -// Check CUDA RT calls -#define CUCHECK(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (false) - // Measure current time in second. static double getTime(void) { struct timespec tspec; @@ -45,8 +33,8 @@ static double getTime(void) { return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; } -__global__ void kernel(int r, int nranks, mscclpp::FifoDeviceHandle fifo, - mscclpp::Host2DeviceSemaphore::DeviceHandle* handles, int handleIndex) { +__global__ void kernel(int r, mscclpp::FifoDeviceHandle fifo, mscclpp::Host2DeviceSemaphore::DeviceHandle* handles, + int handleIndex) { int tid = threadIdx.x; __syncthreads(); // uint64_t tail; @@ -75,8 +63,8 @@ void print_usage(const char* prog) { void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h, int** data_d) { - CUCHECK(cudaMalloc(data_d, dataSize)); - CUCHECK(cudaMemset(*data_d, 0, dataSize)); + MSCCLPP_CUDATHROW(cudaMalloc(data_d, dataSize)); + MSCCLPP_CUDATHROW(cudaMemset(*data_d, 0, dataSize)); *data_h = new int[nelemsPerGPU * world_size]; for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { @@ -87,29 +75,29 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz (*data_h)[i] = 0; } } - CUCHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); + MSCCLPP_CUDATHROW(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice)); } class MyProxyService { private: - int deviceNumaNode_; - mscclpp::Proxy proxy_; + int dataSize_; std::vector remoteMemories_; mscclpp::RegisteredMemory localMemory_; std::vector> hostSemaphores_; std::vector> deviceSemaphores1_; std::vector> deviceSemaphores2_; std::vector> connections_; - int dataSize_; + mscclpp::Proxy proxy_; + int deviceNumaNode_; public: MyProxyService(mscclpp::Communicator& comm, int* data_d, int dataSize) - : remoteMemories_(world_size), + : dataSize_(dataSize), + remoteMemories_(world_size), connections_(world_size), - dataSize_(dataSize), proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) { int cudaDevice; - CUCHECK(cudaGetDevice(&cudaDevice)); + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice)); deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice); int thisNode = rankToNode(rank); @@ -237,7 +225,7 @@ int main(int argc, char* argv[]) { MPI_Comm_free(&shmcomm); int cudaNum = rankToLocalRank(rank); - CUCHECK(cudaSetDevice(cudaNum)); + MSCCLPP_CUDATHROW(cudaSetDevice(cudaNum)); if (rank == 0) printf("Initializing MSCCL++\n"); auto bootstrap = std::make_shared(rank, world_size); @@ -268,30 +256,30 @@ int main(int argc, char* argv[]) { mscclpp::FifoDeviceHandle fifo 
= proxyService.fifo().deviceHandle(); if (rank == 0) printf("Testing the correctness of AllGather implementation\n"); cudaStream_t stream; - CUCHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); mscclpp::Host2DeviceSemaphore::DeviceHandle* deviceHandles1; mscclpp::Host2DeviceSemaphore::DeviceHandle* deviceHandles2; - CUCHECK(cudaMalloc(&deviceHandles1, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size)); + MSCCLPP_CUDATHROW(cudaMalloc(&deviceHandles1, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size)); for (int i = 0; i < world_size; ++i) { if (i == rank) continue; auto handle = proxyService.getDeviceHandle1(i); - CUCHECK(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle), - cudaMemcpyHostToDevice)); + MSCCLPP_CUDATHROW(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle), + cudaMemcpyHostToDevice)); } - CUCHECK(cudaMalloc(&deviceHandles2, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size)); + MSCCLPP_CUDATHROW(cudaMalloc(&deviceHandles2, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size)); for (int i = 0; i < world_size; ++i) { if (i == rank) continue; auto handle = proxyService.getDeviceHandle2(i); - CUCHECK(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle), - cudaMemcpyHostToDevice)); + MSCCLPP_CUDATHROW(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle), + cudaMemcpyHostToDevice)); } - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); - CUCHECK(cudaStreamSynchronize(stream)); + kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles1, 1); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - CUCHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); + MSCCLPP_CUDATHROW(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; @@ -307,14 +295,14 @@ int main(int argc, char* argv[]) { double t0, t1, ms, time_in_us; int iterwithoutcudagraph = 10; if (rank == 0) printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph); - CUCHECK(cudaStreamSynchronize(stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); bootstrap->barrier(); t0 = getTime(); for (int i = 0; i < iterwithoutcudagraph; ++i) { - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2); + kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles1, 1); + kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles2, 2); } - CUCHECK(cudaStreamSynchronize(stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); bootstrap->barrier(); t1 = getTime(); ms = (t1 - t0) * 1000.0; @@ -327,22 +315,22 @@ int main(int argc, char* argv[]) { if (rank == 0) printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter); cudaGraph_t graph; cudaGraphExec_t instance; - cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + MSCCLPP_CUDATHROW(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); for (int i = 0; i < cudagraphiter; ++i) { - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1); - kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2); + kernel<<<1, world_size, 
0, stream>>>(rank, fifo, deviceHandles1, 1); + kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles2, 2); } - cudaStreamEndCapture(stream, &graph); - cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); + MSCCLPP_CUDATHROW(cudaStreamEndCapture(stream, &graph)); + MSCCLPP_CUDATHROW(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0)); int cudagraphwarmup = 10; if (rank == 0) printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup, cudagraphiter); for (int i = 0; i < cudagraphwarmup; ++i) { - cudaGraphLaunch(instance, stream); + MSCCLPP_CUDATHROW(cudaGraphLaunch(instance, stream)); } - CUCHECK(cudaStreamSynchronize(stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); // measure runtime int cudagraphlaunch = 10; @@ -352,9 +340,9 @@ int main(int argc, char* argv[]) { bootstrap->barrier(); t0 = getTime(); for (int i = 0; i < cudagraphlaunch; ++i) { - cudaGraphLaunch(instance, stream); + MSCCLPP_CUDATHROW(cudaGraphLaunch(instance, stream)); } - CUCHECK(cudaStreamSynchronize(stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); t1 = getTime(); ms = (t1 - t0) * 1000.0; diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 713b91035..ca050826e 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -21,7 +21,7 @@ __constant__ DeviceHandle constRawProxyChan[16]; __constant__ DeviceHandle constSmChans[8]; -__global__ void allgather0(int rank, int worldSize, size_t nelemsPerGPU) { +__global__ void allgather0(int rank, size_t nelemsPerGPU) { int warpId = threadIdx.x / 32; // Each warp is responsible for one of the remote ranks @@ -41,9 +41,8 @@ __global__ void allgather0(int rank, int worldSize, size_t nelemsPerGPU) { if (threadIdx.x % 32 == 0) proxyChan.wait(); } -__device__ void localAllGather(DeviceHandle proxyChan, int rank, int worldSize, - int nRanksPerNode, int remoteRank, uint64_t offset, uint64_t size, - bool flushAfterSignal = true) { +__device__ void localAllGather(DeviceHandle proxyChan, int rank, int nRanksPerNode, + int remoteRank, uint64_t offset, uint64_t size, bool flushAfterSignal = true) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nRanksPerNode // and waits for data from GPU rank (i-1) % nRanksPerNode @@ -112,14 +111,14 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); } -__global__ void allgather1(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) { +__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) { int warpId = threadIdx.x / 32; int remoteRank = (warpId < rank) ? 
warpId : warpId + 1; // Each warp is responsible for one of the remote ranks DeviceHandle proxyChan = constProxyChans[warpId]; - localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } @@ -145,7 +144,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne // Step 1 // local allgather if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int), false); } // cross-node exchange @@ -170,7 +169,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne // local allgather int otherNghr = (rank + nRanksPerNode) % worldSize; if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int), false); } @@ -192,13 +191,13 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne // Step 3 // local allgather if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, + localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } } -__global__ void allgather3(int rank, int worldSize) { +__global__ void allgather3() { int warpId = threadIdx.x / 32; // Each warp is responsible for one of the remote ranks @@ -315,9 +314,9 @@ class AllGatherProxyService : public mscclpp::BaseProxyService { AllGatherProxyService::AllGatherProxyService(int worldSize, int rank, int cudaDevice) : worldSize_(worldSize), - sendBytes_(0), rank_(rank), cudaDevice_(cudaDevice), + sendBytes_(0), proxy_( std::make_shared([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { @@ -382,13 +381,13 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { nThreads = 32 * (worldSize - 1); } if (kernelNum == 0) { - allgather0<<>>(rank, worldSize, paramCount_); + allgather0<<>>(rank, paramCount_); } else if (kernelNum == 1) { - allgather1<<>>(rank, worldSize, nRanksPerNode, paramCount_); + allgather1<<>>(rank, nRanksPerNode, paramCount_); } else if (kernelNum == 2) { allgather2<<>>(rank, worldSize, nRanksPerNode, paramCount_); } else if (kernelNum == 3) { - allgather3<<>>(rank, worldSize); + allgather3<<>>(); } else if (kernelNum == 4) { allgather4<<>>(rank, worldSize, nRanksPerNode, paramCount_); } diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 316a630fe..4df3f09e9 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -48,7 +48,7 @@ __forceinline__ __device__ void vectorSum(int* dst, int* src, size_t nElem, int size_t nLastInts = nElem % 4; int4* dst4 = (int4*)dst; int4* src4 = (int4*)src; - for (int i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += 
blockDim.x * nBlocks) { dst4[i].w += src4[i].w; dst4[i].x += src4[i].x; dst4[i].y += src4[i].y; @@ -57,7 +57,7 @@ __forceinline__ __device__ void vectorSum(int* dst, int* src, size_t nElem, int if (nLastInts > 0) { int* dstLast = dst + nInt4 * 4; int* srcLast = src + nInt4 * 4; - for (int i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { dstLast[i] += srcLast[i]; } } @@ -68,7 +68,7 @@ __forceinline__ __device__ void vectorSum(int* dst, int* src, size_t nElem) { } __device__ void vectorSumSingleBlock(int* dst, int* src, size_t nElem) { - for (int i = threadIdx.x; i < nElem; i += blockDim.x) { + for (size_t i = threadIdx.x; i < nElem; i += blockDim.x) { dst[i] += src[i]; } } @@ -277,10 +277,10 @@ __device__ void allGather(int rank, int worldSize, int nRanksPerNode, size_t nel nelemsPerGPU / pipelineSize * sizeof(int)); } -__device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRanksPerNode, int startChunkIndex, - size_t offsetInChunk, size_t chunkSize, size_t nelems, int nBlocks) { +__device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int startChunkIndex, size_t offsetInChunk, + size_t chunkSize, size_t nelems, int nBlocks) { if (nRanksPerNode == 1) return; - if (blockIdx.x >= nBlocks) return; + if ((int)blockIdx.x >= nBlocks) return; const int nPeer = nRanksPerNode - 1; DeviceHandle* smChans = constSmOutOfPlaceGetChans; @@ -299,7 +299,7 @@ __device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRan reduceScatterDeviceSyncer.sync(nBlocks); const size_t nInt4 = nelems / 4; - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) { + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) { int4 sum = make_int4(0, 0, 0, 0); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { @@ -316,7 +316,7 @@ __device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRan } const size_t nLastInts = nelems % 4; - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) { + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) { int sum = 0; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { int val = smChans[peerIdx].read(indexOffset + nInt4 * 4 + idx); @@ -326,10 +326,10 @@ __device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRan } } -__device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRanksPerNode, size_t chunkSize, - size_t nelems, int nBlocks) { +__device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems, + int nBlocks) { if (nRanksPerNode == 1) return; - if (blockIdx.x >= nBlocks) return; + if ((int)blockIdx.x >= nBlocks) return; const int nPeer = nRanksPerNode - 1; DeviceHandle* smChans = constSmOutOfPlaceGetChans; @@ -344,7 +344,7 @@ __device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRa smChans[tid].signal(); } const int waitStart = nBlocks * blockDim.x - nPeer; - if (tid >= waitStart && tid < nBlocks * blockDim.x) { + if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) { smChans[tid - waitStart].wait(); } reduceScatterDeviceSyncer.sync(nBlocks); @@ -353,7 +353,7 @@ __device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRa for (int index = 0; index < nPeer; 
++index) { int4 val; int peerIdx = (index + localRankIndexInNode) % nPeer; - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) { + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) { val = smChans[peerIdx].read(indexOffset4 + idx); buff4[indexOffset4 + idx].w += val.w; buff4[indexOffset4 + idx].x += val.x; @@ -364,7 +364,7 @@ __device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRa const size_t nLastInts = nelems % 4; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) { + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) { int val = smChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx); buff[indexOffset + nInt4 * 4 + idx] += val; } @@ -391,18 +391,18 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer int peerNodeId = peerRank / nRanksPerNode; int nBlocksForReduceScatter = (int)(nBlocksForReduceScatterRatio * gridDim.x) / (nRanksPerNode - 1) * (nRanksPerNode - 1); - int isComm = (threadIdx.x == 0) && (blockIdx.x == nBlocksForReduceScatter); + int isComm = (threadIdx.x == 0) && ((int)blockIdx.x == nBlocksForReduceScatter); int peer = (peerRank < rank) ? peerRank : peerRank - 1; int nBlocksRemain = gridDim.x - nBlocksForReduceScatter; DeviceHandle& proxyChan = constDevFstRoundChans[peer]; if (peerNodeId == rank / nRanksPerNode) { - localReduceScatterSm(buff, scratch, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); + localReduceScatterSm(buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); return; } // step 1: local reduce int startChunkIndex = peerNodeId * nRanksPerNode; - localReduceScatterSm(buff, scratch, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, + localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, nBlocksForReduceScatter); deviceSyncer.sync(gridDim.x); @@ -412,12 +412,12 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer // opposite side proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int))); } - localReduceScatterSm(buff, scratch, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, + localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, 2 * chunkSize / pipelineSize, nBlocksForReduceScatter); if (isComm) { proxyChan.wait(); } - if (blockIdx.x >= nBlocksForReduceScatter) { + if ((int)blockIdx.x >= nBlocksForReduceScatter) { ibDeviceSyncer.sync(nBlocksRemain); // reduce data received from peer to related rank size_t offset = rank * chunkSize * sizeof(int); @@ -436,8 +436,7 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int); proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int)); } - localReduceScatterSm(buff, scratch, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, - nBlocksForReduceScatter); + localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, nBlocksForReduceScatter); if (isComm) { proxyChan.wait(); } @@ -509,7 +508,7 @@ __device__ void localRingAllGatherSm(int rank, int nRanksPerNode, uint64_t size, 
constSmInPlaceChans[tid].signal(); } int waitStart = nBlocks * blockDim.x - nPeer; - if (tid >= waitStart && tid < nBlocks * blockDim.x) { + if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) { constSmInPlaceChans[tid - waitStart].wait(); } allGatherDeviceSyncer.sync(nBlocks); @@ -631,8 +630,7 @@ __global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, siz } } -__global__ void __launch_bounds__(1024) - allreduce1(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) { +__global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int rank, int worldSize, size_t nelems) { int isComm = (threadIdx.x == 0) && (blockIdx.x == 0); int remoteSendRank = (rank + 1) % worldSize; int remoteRecvRank = (rank + worldSize - 1) % worldSize; @@ -849,23 +847,21 @@ __global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getP } __global__ void __launch_bounds__(1024) - allreduce3(int* buff, int* scratch, void* result, int rank, int nRanksPerNode, int worldSize, size_t nelems) { + allreduce3(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems) { reduceScatter(buff, scratch, rank, nRanksPerNode, worldSize, nelems); if (threadIdx.x == 0 && blockIdx.x == 0) { allGather(rank, worldSize, nRanksPerNode, nelems / worldSize); } } -__global__ void allreduce4(int* buff, int* scratch, void* result, int rank, int nRanksPerNode, int worldSize, - size_t nelems) { +__global__ void allreduce4(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems) { reduceScatterSm(buff, scratch, rank, nRanksPerNode, worldSize, nelems); deviceSyncer.sync(gridDim.x); allGatherSm(rank, worldSize, nRanksPerNode, nelems / worldSize); } -__global__ void allreduce5(int* buff, int* scratch, void* result, int rank, int nRanksPerNode, int worldSize, - size_t nelems) { - localReduceScatterSm2(buff, scratch, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x); +__global__ void allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize, size_t nelems) { + localReduceScatterSm2(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x); deviceSyncer.sync(gridDim.x); localRingAllGatherSm(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x); } @@ -984,20 +980,19 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) { allreduce0<<>>((int*)inputBuff, (int*)tmpBuff, rank, worldSize, paramCount_, scratchDataCount); else if (kernelNum == 1) - allreduce1<<>>((int*)inputBuff, (int*)tmpBuff, rank, worldSize, paramCount_, - scratchDataCount); + allreduce1<<>>((int*)inputBuff, (int*)tmpBuff, rank, worldSize, paramCount_); else if (kernelNum == 2) allreduce2<<>>((int*)inputBuff, tmpBuff, putPacketBuff, getPacketBuff, resultBuff, rank, args.nRanksPerNode, worldSize, paramCount_); else if (kernelNum == 3) - allreduce3<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, - args.nRanksPerNode, worldSize, paramCount_); + allreduce3<<>>((int*)inputBuff, (int*)tmpBuff, rank, args.nRanksPerNode, + worldSize, paramCount_); else if (kernelNum == 4) - allreduce4<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, - args.nRanksPerNode, worldSize, paramCount_); + allreduce4<<>>((int*)inputBuff, (int*)tmpBuff, rank, args.nRanksPerNode, + worldSize, paramCount_); else if (kernelNum == 5) - allreduce5<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, - args.nRanksPerNode, worldSize, paramCount_); + allreduce5<<>>((int*)inputBuff, rank, 
args.nRanksPerNode, worldSize, + paramCount_); else if (kernelNum == 6) { allreduce6<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, args.nRanksPerNode, worldSize, paramCount_); @@ -1042,19 +1037,19 @@ void AllReduceTestColl::setupCollTest(size_t size) { std::vector AllReduceTestColl::getKernelRestrictions() { return {// {kernelNum, kernelName, compatibleWithMultiNodes, countDivisorForMultiNodes, alignedBytes} - {0, "allreduce0", true, 1, .alignedBytes = 4 * worldSize_}, - {1, "allreduce1", true, 1, .alignedBytes = 4 * worldSize_}, - {2, "allreduce2", true, 1, .alignedBytes = 4 * worldSize_}, - {3, "allreduce3", true, 3, .alignedBytes = 4 * worldSize_}, + {0, "allreduce0", true, 1, 4 * worldSize_}, + {1, "allreduce1", true, 1, 4 * worldSize_}, + {2, "allreduce2", true, 1, 4 * worldSize_}, + {3, "allreduce3", true, 3, 4 * worldSize_}, { 4, "allreduce4", true, 3, - .alignedBytes = 16 * worldSize_ /*use ulong2 to transfer data*/, + 16 * worldSize_ /*use ulong2 to transfer data*/, }, - {5, "allreduce5", false, 1, .alignedBytes = 4 * worldSize_}, - {6, "allreduce6", false, 1, .alignedBytes = 4 * worldSize_}}; + {5, "allreduce5", false, 1, 4 * worldSize_}, + {6, "allreduce6", false, 1, 4 * worldSize_}}; } class AllReduceTestEngine : public BaseTestEngine { diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu index 89ab6be63..88f6fb4cc 100644 --- a/test/mscclpp-test/alltoall_test.cu +++ b/test/mscclpp-test/alltoall_test.cu @@ -14,7 +14,7 @@ void* localRecvBuff; void* localSendBuff; __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) { - int remoteRank = (blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1; + int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1; for (int i = 1; i < nRanksPerNode; i++) { DeviceHandle proxyChan = constProxyChans[blockIdx.x]; if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) { @@ -29,8 +29,8 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) { } } -__global__ void alltoall0(int rank, int worldSize, size_t nElements) { - int remoteRank = (blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1; +__global__ void alltoall0(int rank, size_t nElements) { + int remoteRank = ((int)blockIdx.x < rank) ? 
blockIdx.x : blockIdx.x + 1; DeviceHandle proxyChan = constProxyChans[blockIdx.x]; if (threadIdx.x == 0) { proxyChan.putWithSignal(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int), @@ -68,7 +68,7 @@ void AllToAllTestColl::runColl(const TestArgs& args, cudaStream_t stream) { CUDATHROW(cudaMemcpyAsync((int*)localRecvBuff + paramCount_ * rank, (int*)localSendBuff + paramCount_ * rank, paramCount_ * sizeof(int), cudaMemcpyDeviceToDevice, stream)); if (kernelNum == 0) { - alltoall0<<>>(rank, worldSize, paramCount_); + alltoall0<<>>(rank, paramCount_); } else if (kernelNum == 1) { alltoall1<<>>(rank, nRanksPerNode, paramCount_); } diff --git a/test/mscclpp-test/common.cc b/test/mscclpp-test/common.cc index c56540d3a..a8e533ab9 100644 --- a/test/mscclpp-test/common.cc +++ b/test/mscclpp-test/common.cc @@ -188,7 +188,7 @@ BaseTestEngine::BaseTestEngine(const TestArgs& args, const std::string& name) CUDATHROW(cudaStreamCreateWithFlags(&this->stream_, cudaStreamNonBlocking)); } -BaseTestEngine::~BaseTestEngine() { cudaStreamDestroy(stream_); } +BaseTestEngine::~BaseTestEngine() { (void)cudaStreamDestroy(stream_); } void BaseTestColl::setupCollTest(const TestArgs& args, size_t size) { this->worldSize_ = args.totalRanks; diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu index b6074bccf..46cc0658f 100644 --- a/test/mscclpp-test/sendrecv_test.cu +++ b/test/mscclpp-test/sendrecv_test.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode __device__ mscclpp::DeviceSyncer deviceSyncer; -__global__ void kernel(int rank, size_t dataSize, size_t dataPerBlock) { +__global__ void kernel(size_t dataSize, size_t dataPerBlock) { size_t startIndex = blockIdx.x * dataPerBlock; size_t blockDataSize = min(dataSize - startIndex, dataPerBlock); int globalIndex = blockIdx.x * blockDim.x + threadIdx.x; @@ -63,12 +64,12 @@ class SendRecvTestColl : public BaseTestColl { std::vector getKernelRestrictions() override; }; -void SendRecvTestColl::runColl(const TestArgs& args, cudaStream_t stream) { +void SendRecvTestColl::runColl(const TestArgs&, cudaStream_t stream) { size_t sendBytes = sendCount_ * typeSize_; int blockNum = getBlockNum(sendBytes); size_t bytesPerBlock = (sendBytes + blockNum - 1) / blockNum; if (kernelNum_ == 0) { - kernel<<>>(args.rank, sendBytes, bytesPerBlock); + kernel<<>>(sendBytes, bytesPerBlock); } } @@ -87,11 +88,11 @@ std::vector SendRecvTestColl::getKernelRestrictions() { void SendRecvTestColl::initData(const TestArgs& args, std::vector sendBuff, void* expectedBuff) { int rank = args.rank; if (sendBuff.size() != 1) std::unexpected(); - CUDATHROW(cudaMemset(sendBuff[0], 0, sendCount_ * typeSize_)); + MSCCLPP_CUDATHROW(cudaMemset(sendBuff[0], 0, sendCount_ * typeSize_)); // TODO: The type should not limited to int. 
  std::vector<int> dataHost(std::max(sendCount_, recvCount_), rank);
-  CUDATHROW(cudaMemcpy(sendBuff[0], dataHost.data(), sendCount_ * typeSize_, cudaMemcpyHostToDevice));
+  MSCCLPP_CUDATHROW(cudaMemcpy(sendBuff[0], dataHost.data(), sendCount_ * typeSize_, cudaMemcpyHostToDevice));

   int peerRank = (rank - 1 + args.totalRanks) % args.totalRanks;
   for (size_t i = 0; i < recvCount_; i++) {
@@ -110,7 +111,7 @@ void SendRecvTestColl::setupCollTest(size_t size) {
   expectedCount_ = base;

   mscclpp::DeviceSyncer syncer = {};
-  CUDATHROW(cudaMemcpyToSymbol(deviceSyncer, &syncer, sizeof(mscclpp::DeviceSyncer)));
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(deviceSyncer, &syncer, sizeof(mscclpp::DeviceSyncer)));
 }

 class SendRecvTestEngine : public BaseTestEngine {
@@ -189,8 +190,8 @@ void SendRecvTestEngine::setupConnections() {
   }
   std::transform(smChannels_.begin(), smChannels_.end(), smChannelHandles.begin(),
                  [](const mscclpp::SmChannel& smChannel) { return smChannel.deviceHandle(); });
-  cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
-                     sizeof(DeviceHandle<mscclpp::SmChannel>) * smChannelHandles.size());
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
+                                       sizeof(DeviceHandle<mscclpp::SmChannel>) * smChannelHandles.size()));
 }

 std::vector<void*> SendRecvTestEngine::getSendBuff() { return {devicePtrs_[0].get()}; }
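
The warning fixes in this patch reduce to a few recurring patterns: wrap every CUDA runtime call so its cudaError_t result is consumed (MSCCLPP_CUDATHROW / CUDACHECK) or discard it explicitly with a (void) cast where failure is tolerable, drop kernel parameters that are no longer used, and make signed/unsigned comparisons explicit by casting blockIdx.x to int or by using size_t loop indices. The sketch below shows these patterns in isolation; it is illustrative only, and CHECK_CUDA and exampleKernel are hypothetical names, not mscclpp APIs. Funneling every runtime call through one checking macro keeps error handling uniform across the tests.

// Minimal, self-contained sketch of the warning-clearing patterns above.
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Consume the cudaError_t instead of silently dropping it.
#define CHECK_CUDA(cmd)                                                     \
  do {                                                                      \
    cudaError_t err = (cmd);                                                \
    if (err != cudaSuccess) {                                               \
      std::fprintf(stderr, "%s:%d CUDA failure '%s'\n", __FILE__, __LINE__, \
                   cudaGetErrorString(err));                                \
      std::exit(EXIT_FAILURE);                                              \
    }                                                                       \
  } while (false)

// Unused parameters are removed from the signature; blockIdx.x (unsigned) is
// cast to int before comparing with a signed rank, and the loop index is a
// size_t because the bound is a size_t, avoiding sign-compare warnings.
__global__ void exampleKernel(int rank, int* data, size_t nElems) {
  if ((int)blockIdx.x == rank) return;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < nElems; i += blockDim.x * gridDim.x) {
    data[i] += 1;
  }
}

int main() {
  int* buf = nullptr;
  size_t n = 1 << 20;
  CHECK_CUDA(cudaMalloc(&buf, n * sizeof(int)));
  CHECK_CUDA(cudaMemset(buf, 0, n * sizeof(int)));
  exampleKernel<<<8, 256>>>(0, buf, n);
  CHECK_CUDA(cudaDeviceSynchronize());
  // Where a failure is acceptable (e.g., in a destructor), discard the result
  // explicitly, as common.cc does with (void)cudaStreamDestroy(stream_).
  (void)cudaFree(buf);
  return 0;
}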