Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROCm support #213

Merged
merged 49 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from 45 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
4337e3c
compiled
chhwang Nov 8, 2023
e510e90
updates
chhwang Nov 8, 2023
a79f840
fix
chhwang Nov 9, 2023
c386610
Merge branch 'main' into chhwang/rocm
chhwang Nov 9, 2023
886a0dc
bypass gpu check in codeql
chhwang Nov 9, 2023
aa387e9
cmake fix
chhwang Nov 9, 2023
f9cc607
fix lint workflow
chhwang Nov 9, 2023
82da429
fix python binding
chhwang Nov 9, 2023
b288c7d
Merge branch 'chhwang/rocm' of https://github.com/microsoft/mscclpp i…
chhwang Nov 9, 2023
549a8a6
Testing `hipExtMallocWithFlags`
chhwang Nov 10, 2023
e304976
Compile mscclpp-test & python bindings
chhwang Nov 11, 2023
965fa37
more fixes
chhwang Nov 11, 2023
397f18d
Fix semaphores
chhwang Nov 14, 2023
25e76e2
Merge branch 'main' into chhwang/rocm
chhwang Nov 14, 2023
3eaaf34
Fix merge
chhwang Nov 14, 2023
b3ca644
All looks good now
chhwang Nov 14, 2023
8de5ada
Merge branch 'main' into chhwang/rocm
chhwang Nov 14, 2023
9f0fcfe
Merge branch 'main' into chhwang/rocm
chhwang Nov 19, 2023
e398ea4
fix merge
chhwang Nov 19, 2023
654271a
revert semaphore.cc
chhwang Nov 19, 2023
8844483
change hard-coded warp size
chhwang Nov 19, 2023
3c49df4
Merge branch 'main' into chhwang/rocm
chhwang Nov 21, 2023
4fe210c
some cleanup
chhwang Nov 21, 2023
fd4adca
Fix multi-nodes test workflow
chhwang Nov 21, 2023
406a2c3
bring cuda-specific optimizations back
chhwang Nov 21, 2023
c6b127e
Lint
chhwang Nov 21, 2023
cac5cdd
Templatize Dockerfiles & update workflows
chhwang Nov 22, 2023
35b3dec
Merge branch 'main' into chhwang/docker
chhwang Nov 22, 2023
664f140
Merge branch 'main' into chhwang/rocm
chhwang Nov 22, 2023
82717d5
Drop CUDA 12.1 & use 12.2
chhwang Nov 22, 2023
6fa0520
update for multi-node test
Binyang2014 Nov 22, 2023
b8b4f50
Assume host code is always potentially mixed with device code
chhwang Nov 22, 2023
c6a7ea8
Fix
chhwang Nov 22, 2023
9a93dd9
fix workflows
chhwang Nov 22, 2023
abf9e9f
update pytest command
chhwang Nov 22, 2023
7ed2655
Fix Fifo binding
chhwang Nov 22, 2023
9d80330
Merge branch 'chhwang/docker' into chhwang/rocm
chhwang Nov 22, 2023
ffcbba0
Merge branch 'main' into chhwang/rocm
chhwang Nov 23, 2023
ff2eb08
Drop gfx940
Nov 23, 2023
f63810b
Revised interfaces
Nov 24, 2023
73844a1
Fix for cuda & lint
chhwang Nov 24, 2023
20f0bbd
typo fix
Nov 24, 2023
df5f616
tackle warnings from clang
Nov 24, 2023
aaf1480
Fix a potential sync bug
chhwang Nov 24, 2023
c2d8607
Lint
chhwang Nov 24, 2023
792aeb0
Add todos
Nov 24, 2023
9fc7ca7
Tackle comments
Nov 24, 2023
2022df4
tackle comments
Nov 24, 2023
9b2f413
Fix comments
Nov 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down
2 changes: 1 addition & 1 deletion .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:

- name: Build
run: |
cmake -DBYPASS_PEERMEM_CHECK=ON .
cmake -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
make -j

- name: Perform CodeQL Analysis
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:

- name: Run cpplint
run: |
CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*")
CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)')
clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES}

pylint:
Expand Down
119 changes: 82 additions & 37 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,80 +9,125 @@ set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")

cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CUDA CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
enable_language(CXX)

# Format targets
include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

# Options
option(ENABLE_TRACE "Enable tracing" OFF)
option(USE_NPKIT "Use NPKIT" ON)
option(BUILD_TESTS "Build tests" ON)
option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF)
option(USE_CUDA "Use NVIDIA/CUDA." OFF)
option(USE_ROCM "Use AMD/ROCm." OFF)
option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)

# Find CUDAToolkit. Set CUDA flags based on the detected CUDA version
find_package(CUDAToolkit REQUIRED)
if(CUDAToolkit_FOUND)
if(BYPASS_GPU_CHECK)
if(USE_CUDA)
message("Bypassing GPU check: using NVIDIA/CUDA.")
find_package(CUDAToolkit REQUIRED)
elseif(USE_ROCM)
message("Bypassing GPU check: using AMD/ROCm.")
# Temporary fix for ROCm 5.6
set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
find_package(hip REQUIRED)
else()
message(FATAL_ERROR "Bypassing GPU check: neither NVIDIA/CUDA nor AMD/ROCm is specified.")
endif()
else()
# Detect GPUs
include(CheckNvidiaGpu)
include(CheckAmdGpu)
if(NVIDIA_FOUND AND AMD_FOUND)
message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.")
set(USE_CUDA ON)
set(USE_ROCM OFF)
elseif(NVIDIA_FOUND)
message("Detected NVIDIA/CUDA.")
set(USE_CUDA ON)
set(USE_ROCM OFF)
elseif(AMD_FOUND)
message("Detected AMD/ROCm.")
set(USE_CUDA OFF)
set(USE_ROCM ON)
else()
message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.")
endif()
endif()

# Declare project
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
if(USE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
project(mscclpp LANGUAGES CXX CUDA)

# CUDA 11 or higher is required
if(CUDAToolkit_VERSION_MAJOR LESS 11)
message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}")
endif()

# Set CUDA architectures
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11)
set(CMAKE_CUDA_ARCHITECTURES 80)
endif()

# Hopper architecture
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90)
endif()
endif()
set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver)

# Find if nvidia_peermem is installed and loaded
if(NOT BYPASS_PEERMEM_CHECK)
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
if(NOT lsmod_result EQUAL 0)
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")

set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})

# Find if nvidia_peermem is installed and loaded
if(NOT BYPASS_PEERMEM_CHECK)
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
if(NOT lsmod_result EQUAL 0)
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")
endif()
endif()
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
project(mscclpp LANGUAGES CXX HIP)

set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942)

set(GPU_LIBRARIES hip::host)
set(GPU_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
endif()

# Format targets
include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake)

# Find ibverbs and libnuma
find_package(IBVerbs REQUIRED)
find_package(NUMA REQUIRED)

# Find optional packages
if(ALLOW_GDRCOPY)
find_package(GDRCopy)
endif()

add_library(mscclpp_obj OBJECT)
target_include_directories(mscclpp_obj
PRIVATE
${CUDAToolkit_INCLUDE_DIRS}
${GPU_INCLUDE_DIRS}
${IBVERBS_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS}
${GDRCOPY_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${CUDA_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} ${GDRCOPY_LIBRARIES})
${NUMA_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES})
set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
if(USE_CUDA)
target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_compile_definitions(mscclpp_obj PRIVATE USE_ROCM)
endif()
if(ENABLE_TRACE)
target_compile_definitions(mscclpp_obj PRIVATE ENABLE_TRACE)
endif()
if(USE_NPKIT)
target_compile_definitions(mscclpp_obj PRIVATE ENABLE_NPKIT)
endif()
if(ALLOW_GDRCOPY AND GDRCOPY_FOUND)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
target_link_libraries(mscclpp_obj PRIVATE MSCCLPP::gdrcopy)
endif()

# libmscclpp
add_library(mscclpp SHARED)
Expand All @@ -103,7 +148,7 @@ install(TARGETS mscclpp_static
ARCHIVE DESTINATION lib)

# Tests
if (BUILD_TESTS)
if(BUILD_TESTS)
enable_testing() # Called here to allow ctest from the build directory
add_subdirectory(test)
endif()
Expand Down
25 changes: 25 additions & 0 deletions cmake/CheckAmdGpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# Detects a usable AMD GPU by compiling and running a small HIP probe.
# On success, AMD_FOUND is set to "TRUE"; otherwise it stays "FALSE".

# Default to "not found"; flipped only if the probe below runs cleanly.
set(AMD_FOUND "FALSE")

# Make the default ROCm install location visible to find_package(hip).
set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
# Temporary fix for ROCm 5.6: point hip's dependent packages at their
# cmake config directories explicitly via environment variables.
set(ENV{amd_comgr_DIR} "/opt/rocm/lib/cmake/amd_comgr")
set(ENV{AMDDeviceLibs_DIR} "/opt/rocm/lib/cmake/AMDDeviceLibs")

find_package(hip QUIET)

# No HIP toolchain installed: leave AMD_FOUND as "FALSE" and bail out early.
if(NOT hip_FOUND)
return()
endif()

enable_language(HIP)

# Probe source that exits 0 only when the HIP runtime reports a device.
set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_amd_gpu.hip")

# Compile and execute the probe (requires CMake >= 3.25 for the SOURCES form).
try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC})

# Only a clean compile AND a zero exit code count as "GPU present".
if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0)
set(AMD_FOUND "TRUE")
endif()
36 changes: 36 additions & 0 deletions cmake/CheckNvidiaGpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# Detects a usable NVIDIA GPU by compiling and running a small CUDA probe.
# On success, NVIDIA_FOUND is set to "TRUE"; otherwise it stays "FALSE".

# Default to "not found"; flipped only if the probe below runs cleanly.
set(NVIDIA_FOUND "FALSE")

find_package(CUDAToolkit)

# No CUDA toolkit at all: bail out early with NVIDIA_FOUND still "FALSE".
if(NOT CUDAToolkit_FOUND)
  return()
endif()

# A single, modest architecture is enough for the probe binary.
set(CMAKE_CUDA_ARCHITECTURES "60")
if(NOT CMAKE_CUDA_COMPILER)
  # The CUDA Toolkit bin directory may not be in PATH; search it explicitly.
  find_program(CUDA_COMPILER
               NAMES nvcc
               PATHS ${CUDAToolkit_BIN_DIR})
  if(NOT CUDA_COMPILER)
    message(WARNING "Could not find nvcc in ${CUDAToolkit_BIN_DIR}")
    unset(CMAKE_CUDA_ARCHITECTURES)
    return()
  endif()
  set(CMAKE_CUDA_COMPILER "${CUDA_COMPILER}")
endif()
enable_language(CUDA)

# Compile and execute the probe; it exits 0 only when the CUDA runtime
# reports at least one visible device.
set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_nvidia_gpu.cu")
try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC})

if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0)
  set(NVIDIA_FOUND "TRUE")
else()
  # Roll back the CUDA settings so a later ROCm path starts clean.
  unset(CMAKE_CUDA_ARCHITECTURES)
  unset(CMAKE_CUDA_COMPILER)
endif()
15 changes: 15 additions & 0 deletions cmake/check_amd_gpu.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <hip/hip_runtime.h>

__global__ void kernel() {}

int main() {
int cnt;
hipError_t err = hipGetDeviceCount(&cnt);
if (err != hipSuccess || cnt == 0) {
return 1;
}
return 0;
}
15 changes: 15 additions & 0 deletions cmake/check_nvidia_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <cuda_runtime.h>

__global__ void kernel() {}

int main() {
int cnt;
cudaError_t err = cudaGetDeviceCount(&cnt);
if (err != cudaSuccess || cnt == 0) {
return 1;
}
return 0;
}
65 changes: 65 additions & 0 deletions include/mscclpp/atomic_device.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_ATOMIC_DEVICE_HPP_
#define MSCCLPP_ATOMIC_DEVICE_HPP_

// Thin portability layer over atomic load/store/fetch-add so callers can use
// one spelling on both backends: libcu++ cuda::atomic_ref on CUDA, and the
// GCC/Clang __atomic_* builtins on HIP.

#include "device.hpp"

#if defined(MSCCLPP_DEVICE_CUDA)
#include <cuda/atomic>
#endif // defined(MSCCLPP_DEVICE_CUDA)

namespace mscclpp {

#if defined(MSCCLPP_DEVICE_CUDA)

// Memory-order aliases mapping onto libcu++ cuda::memory_order values.
constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed;
constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire;
constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release;
constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel;
constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst;

// Atomically loads *ptr with the given memory order. Uses
// cuda::thread_scope_system, so the operation is ordered with respect to the
// whole system (all devices and the host).
// NOTE(review): this overload takes a non-const T* while the HIP overload
// below takes const T* — confirm whether the asymmetry is intentional.
template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) {
return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.load(memoryOrder);
}

// Atomically stores val into *ptr with the given memory order (system scope).
template <typename T>
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) {
cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.store(val, memoryOrder);
}

// Atomically adds val to *ptr and returns the value held before the addition
// (system scope).
template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) {
return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.fetch_add(val, memoryOrder);
}

#elif defined(MSCCLPP_DEVICE_HIP)

// Memory-order aliases mapping onto the compiler's __ATOMIC_* constants,
// consumed by the __atomic_* builtins below. Declared `auto` because the
// constants are plain ints, keeping call sites identical across backends.
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;

// Atomically loads *ptr with the given memory order.
template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) {
return __atomic_load_n(ptr, memoryOrder);
}

// Atomically stores val into *ptr with the given memory order.
template <typename T>
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) {
__atomic_store_n(ptr, val, memoryOrder);
}

// Atomically adds val to *ptr and returns the value held before the addition.
template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
return __atomic_fetch_add(ptr, val, memoryOrder);
}

#endif // defined(MSCCLPP_DEVICE_HIP)

} // namespace mscclpp

#endif // MSCCLPP_ATOMIC_DEVICE_HPP_
Loading