Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update nccl and gpu targets, add restore GPU build workflows #101

Merged
merged 4 commits into from
Jul 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
-DCOMPILE_CPU=on \
-DCOMPILE_CUDA=off \
-DCOMPILE_EXAMPLES=on \
-DCOMPILE_SERVER=on \
-DCOMPILE_SERVER=off \
-DCOMPILE_TESTS=on \
-DUSE_FBGEMM=on \
-DUSE_SENTENCEPIECE=on \
Expand Down
21 changes: 19 additions & 2 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,15 @@ jobs:
include:
# Ubuntu 20.04 supports CUDA 11+
# Unit tests and examples are not compiled to save disk space
- name: "Ubuntu 20.04 gcc-9"
- name: "Ubuntu 20.04 gcc-9 full"
os: ubuntu-20.04
cuda: "11.2"
gcc: 9
cpu: true
gpu: true
unit_tests: false
examples: false
- name: "Ubuntu 20.04 gcc-9 CPU"
os: ubuntu-20.04
cuda: "11.2"
gcc: 9
Expand All @@ -22,7 +30,16 @@ jobs:
unit_tests: false
examples: false
# Unit tests and examples are not compiled to save disk space
- name: "Ubuntu 22.04 gcc-11"
- name: "Ubuntu 22.04 gcc-11 full"
os: ubuntu-22.04
cuda: "11.7"
gcc: 11
clang: ""
cpu: true
gpu: true
unit_tests: false
examples: false
- name: "Ubuntu 22.04 gcc-11 CPU"
os: ubuntu-22.04
cuda: "11.7"
gcc: 11
Expand Down
81 changes: 52 additions & 29 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -436,48 +436,71 @@
# We want to compile as many targets as possible but different CUDA versions support different targets.
# Let's instead enable options based on what cuda version we have.
if((CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0") AND CUDA_VERSION VERSION_LESS "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF)
option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF)
option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON)
option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON)
endif()
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND CUDA_VERSION VERSION_LESS "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON)
option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF)
option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF)
option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON)
option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON)
option(COMPILE_TURING "Compile GPU version with SM75 support" ON)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON)
option(COMPILE_CUDA_SM80 "Compile GPU version with SM80 support" ON)
option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) # deprecated for CUDA 11
option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) # deprecated for CUDA 11
option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON)
option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON)
option(COMPILE_TURING "Compile GPU version with SM75 support" ON)
option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON)
LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.1" OR CUDA_VERSION VERSION_GREATER "11.1")
option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) # deprecated for CUDA 11
option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) # deprecated for CUDA 11
option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON)
option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON)
option(COMPILE_TURING "Compile GPU version with SM75 support" ON)
option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON)
option(COMPILE_AMPERE_RTX "Compile GPU version with SM86 support" ON)
LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
endif()

if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
if(COMPILE_CUDA_SM50)
if(COMPILE_KEPLER)
message(STATUS "Compiling code for Kepler GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_KEPLER)
if(COMPILE_MAXWELL)
message(STATUS "Compiling code for Maxwell GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52;) # Maxwell GPUs
endif(COMPILE_CUDA_SM50)
if(COMPILE_CUDA_SM60)
endif(COMPILE_MAXWELL)
if(COMPILE_PASCAL)
message(STATUS "Compiling code for Pascal GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;) # Pascal GPUs
endif(COMPILE_CUDA_SM60)
if(COMPILE_CUDA_SM70)
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs
endif(COMPILE_CUDA_SM70)
endif(COMPILE_PASCAL)
if(COMPILE_VOLTA)
message(STATUS "Compiling code for Volta GPUs")
LIST(APPEND COMPUTE -arch=sm_70; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs
endif(COMPILE_VOLTA)
if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0")
if(COMPILE_CUDA_SM75)
if(COMPILE_TURING)
message(STATUS "Compiling code for Turing GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_75,code=sm_75; -gencode=arch=compute_75,code=compute_75) # Turing GPUs
endif(COMPILE_CUDA_SM75)
endif(COMPILE_TURING)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
if(COMPILE_CUDA_SM80)
if(COMPILE_AMPERE)
message(STATUS "Compiling code for Ampere GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80) # Ampere GPUs
endif(COMPILE_CUDA_SM80)
endif(COMPILE_AMPERE)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.1" OR CUDA_VERSION VERSION_GREATER "11.1")
if(COMPILE_AMPERE_RTX)
message(STATUS "Compiling code for Ampere RTX GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_86,code=sm_86; -gencode=arch=compute_86,code=compute_86) # Ampere RTX GPUs
endif(COMPILE_AMPERE_RTX)
endif()

if(USE_STATIC_LIBS)
Expand Down Expand Up @@ -546,7 +569,7 @@
endif(CUDA_FOUND)

else(COMPILE_CUDA)
message(WARNING "COMPILE_CUDA=off : Building only CPU version")

Check warning on line 572 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU-only

COMPILE_CUDA=off : Building only CPU version
endif(COMPILE_CUDA)

# TODO: make compatible with older CUDA versions
Expand Down
39 changes: 24 additions & 15 deletions src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,34 +173,43 @@ if(CUDA_FOUND)
# disables compilation for sm_30 to avoid ptxas warning... that is general Kepler support. But K80s are supported for instance by sm_35

set(GENCODE "")
if(COMPILE_CUDA_SM35)
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
set(GENCODE "${GENCODE} -Wno-deprecated-gpu-targets")
endif()
if(COMPILE_KEPLER)
set(GENCODE "${GENCODE} -gencode=arch=compute_35,code=sm_35")
endif(COMPILE_CUDA_SM35)
if(COMPILE_CUDA_SM50)
endif(COMPILE_KEPLER)
if(COMPILE_MAXWELL)
set(GENCODE "${GENCODE} -gencode=arch=compute_50,code=sm_50")
endif(COMPILE_CUDA_SM50)
if(COMPILE_CUDA_SM60)
endif(COMPILE_MAXWELL)
if(COMPILE_PASCAL)
set(GENCODE "${GENCODE} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
endif(COMPILE_CUDA_SM60)
if(COMPILE_CUDA_SM70)
set(GENCODE "${GENCODE} -gencode=arch=compute_70,code=sm_70")
endif(COMPILE_CUDA_SM70)
if(COMPILE_CUDA_SM75)
endif(COMPILE_PASCAL)
if(COMPILE_VOLTA)
set(GENCODE "${GENCODE} -arch=sm_70 -gencode=arch=compute_70,code=sm_70")
endif(COMPILE_VOLTA)
if(COMPILE_TURING)
set(GENCODE "${GENCODE} -gencode=arch=compute_75,code=sm_75; -gencode=arch=compute_75,code=compute_75")
endif(COMPILE_CUDA_SM75)
if(COMPILE_CUDA_SM80)
endif(COMPILE_TURING)
if(COMPILE_AMPERE)
set(GENCODE "${GENCODE} -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80")
endif(COMPILE_CUDA_SM80)
endif(COMPILE_AMPERE)

# install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Apparently this does not get properly propagated otherwise and builds with only a single thread/process.
ExternalProject_Add(nccl_install
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
CONFIGURE_COMMAND ""
BUILD_COMMAND
${CMAKE_MAKE_PROGRAM} -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build
# Note: $(MAKE) here causes CMake cache generation via Ninja failing (Windows build with
# CMake only) because Ninja complains about unescaped $ signs. This happens only if
# COMPILE_CUDA=on and because of that, USE_NCCL=off is used in CMakeSettings.json for now.
# @TODO: find proper escaping that works for MSVC builds.
"\$(MAKE)" -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build
BUILDDIR=${CMAKE_BINARY_DIR}/local CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}
CUDA8_GENCODE=${GENCODE} CXX=${CMAKE_CXX_COMPILER}
CUDA8_GENCODE=${GENCODE} CXX=${CMAKE_CXX_COMPILER} CXX_FLAGS=${NCCL_FLAGS}
INSTALL_COMMAND "")

set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/local/lib/libnccl_static.a)
Expand Down
2 changes: 1 addition & 1 deletion src/tensors/gpu/add.inc
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plu
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<1> >, marian::functional::Assignee<2> >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<1> >, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Add<marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase> >(marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Aggregate<marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, IntrusivePtr<marian::TensorBase> >(marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>);
Loading
Loading