diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 84c6f88c2..660d44b7d 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -36,7 +36,7 @@ jobs: -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ -DCOMPILE_EXAMPLES=on \ - -DCOMPILE_SERVER=on \ + -DCOMPILE_SERVER=off \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 0b4821812..2d2c373f3 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -13,7 +13,15 @@ jobs: include: # Ubuntu 20.04 supports CUDA 11+ # Unit tests and examples are not compiled to save disk space - - name: "Ubuntu 20.04 gcc-9" + - name: "Ubuntu 20.04 gcc-9 full" + os: ubuntu-20.04 + cuda: "11.2" + gcc: 9 + cpu: true + gpu: true + unit_tests: false + examples: false + - name: "Ubuntu 20.04 gcc-9 CPU" os: ubuntu-20.04 cuda: "11.2" gcc: 9 @@ -22,7 +30,16 @@ jobs: unit_tests: false examples: false # Unit tests and examples are not compiled to save disk space - - name: "Ubuntu 22.04 gcc-11" + - name: "Ubuntu 22.04 gcc-11 full" + os: ubuntu-22.04 + cuda: "11.7" + gcc: 11 + clang: "" + cpu: true + gpu: true + unit_tests: false + examples: false + - name: "Ubuntu 22.04 gcc-11 CPU" os: ubuntu-22.04 cuda: "11.7" gcc: 11 diff --git a/CMakeLists.txt b/CMakeLists.txt index ac29b8e66..789bd2ea0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -436,48 +436,71 @@ if(CUDA_FOUND) # We want to compile as many targets as possible but different CUDA versions support different targets. # Let's instead enable options based on what cuda version we have. 
if((CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0") AND CUDA_VERSION VERSION_LESS "11.0") - option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON) - option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON) - option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON) - option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON) + option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) + option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) + option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON) + option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON) endif() if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND CUDA_VERSION VERSION_LESS "11.0") - option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON) - option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON) - option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON) - option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON) - option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON) + option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) + option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) + option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON) + option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON) + option(COMPILE_TURING "Compile GPU version with SM75 support" ON) endif() if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0") - option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON) - option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON) - option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON) - option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON) - option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON) - option(COMPILE_CUDA_SM80 "Compile GPU version with SM80 support" 
ON) + option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) # deprecated for CUDA 11 + option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) # deprecated for CUDA 11 + option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON) + option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON) + option(COMPILE_TURING "Compile GPU version with SM75 support" ON) + option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON) + LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets) + endif() + if(CUDA_VERSION VERSION_EQUAL "11.1" OR CUDA_VERSION VERSION_GREATER "11.1") + option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) # deprecated for CUDA 11 + option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) # deprecated for CUDA 11 + option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON) + option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON) + option(COMPILE_TURING "Compile GPU version with SM75 support" ON) + option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON) + option(COMPILE_AMPERE_RTX "Compile GPU version with SM86 support" ON) + LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets) endif() - if(COMPILE_CUDA_SM35) - LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above - endif(COMPILE_CUDA_SM35) - if(COMPILE_CUDA_SM50) + if(COMPILE_KEPLER) + message(STATUS "Compiling code for Kepler GPUs") + LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above + endif(COMPILE_KEPLER) + if(COMPILE_MAXWELL) + message(STATUS "Compiling code for Maxwell GPUs") LIST(APPEND COMPUTE -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52;) # Maxwell GPUs - endif(COMPILE_CUDA_SM50) - if(COMPILE_CUDA_SM60) + endif(COMPILE_MAXWELL) + if(COMPILE_PASCAL) + message(STATUS "Compiling code for Pascal GPUs") LIST(APPEND COMPUTE -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;) # Pascal GPUs - 
endif(COMPILE_CUDA_SM60) - if(COMPILE_CUDA_SM70) - LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs - endif(COMPILE_CUDA_SM70) + endif(COMPILE_PASCAL) + if(COMPILE_VOLTA) + message(STATUS "Compiling code for Volta GPUs") + LIST(APPEND COMPUTE -arch=sm_70; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs + endif(COMPILE_VOLTA) if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") - if(COMPILE_CUDA_SM75) + if(COMPILE_TURING) + message(STATUS "Compiling code for Turing GPUs") LIST(APPEND COMPUTE -gencode=arch=compute_75,code=sm_75; -gencode=arch=compute_75,code=compute_75) # Turing GPUs - endif(COMPILE_CUDA_SM75) + endif(COMPILE_TURING) endif() if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0") - if(COMPILE_CUDA_SM80) + if(COMPILE_AMPERE) + message(STATUS "Compiling code for Ampere GPUs") LIST(APPEND COMPUTE -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80) # Ampere GPUs - endif(COMPILE_CUDA_SM80) + endif(COMPILE_AMPERE) + endif() + if(CUDA_VERSION VERSION_EQUAL "11.1" OR CUDA_VERSION VERSION_GREATER "11.1") + if(COMPILE_AMPERE_RTX) + message(STATUS "Compiling code for Ampere RTX GPUs") + LIST(APPEND COMPUTE -gencode=arch=compute_86,code=sm_86; -gencode=arch=compute_86,code=compute_86) # Ampere RTX GPUs + endif(COMPILE_AMPERE_RTX) endif() if(USE_STATIC_LIBS) diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index f2062c381..81bb571b7 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -173,34 +173,43 @@ if(CUDA_FOUND) # disables compilation for sm_30 to avoid ptxas warning... that is general Kepler support. 
But K80s are supported for instance by sm_35 set(GENCODE "") - if(COMPILE_CUDA_SM35) + if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0") + set(GENCODE "${GENCODE} -Wno-deprecated-gpu-targets") + endif() + if(COMPILE_KEPLER) set(GENCODE "${GENCODE} -gencode=arch=compute_35,code=sm_35") - endif(COMPILE_CUDA_SM35) - if(COMPILE_CUDA_SM50) + endif(COMPILE_KEPLER) + if(COMPILE_MAXWELL) set(GENCODE "${GENCODE} -gencode=arch=compute_50,code=sm_50") - endif(COMPILE_CUDA_SM50) - if(COMPILE_CUDA_SM60) + endif(COMPILE_MAXWELL) + if(COMPILE_PASCAL) set(GENCODE "${GENCODE} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61") - endif(COMPILE_CUDA_SM60) - if(COMPILE_CUDA_SM70) - set(GENCODE "${GENCODE} -gencode=arch=compute_70,code=sm_70") - endif(COMPILE_CUDA_SM70) - if(COMPILE_CUDA_SM75) + endif(COMPILE_PASCAL) + if(COMPILE_VOLTA) + set(GENCODE "${GENCODE} -arch=sm_70 -gencode=arch=compute_70,code=sm_70") + endif(COMPILE_VOLTA) + if(COMPILE_TURING) set(GENCODE "${GENCODE} -gencode=arch=compute_75,code=sm_75; -gencode=arch=compute_75,code=compute_75") - endif(COMPILE_CUDA_SM75) - if(COMPILE_CUDA_SM80) + endif(COMPILE_TURING) + if(COMPILE_AMPERE) set(GENCODE "${GENCODE} -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80") - endif(COMPILE_CUDA_SM80) + endif(COMPILE_AMPERE) # install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation + # Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16. + # Apparently this does not get properly propagated otherwise and builds with only a single thread/process.
ExternalProject_Add(nccl_install SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl CONFIGURE_COMMAND "" BUILD_COMMAND - ${CMAKE_MAKE_PROGRAM} -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build + # Note: $(MAKE) here causes CMake cache generation via Ninja failing (Windows build with + # CMake only) because Ninja complains about unescaped $ signs. This happens only if + # COMPILE_CUDA=on and because of that, USE_NCCL=off is used in CMakeSettings.json for now. + # @TODO: find proper escaping that works for MSVC builds. + "\$(MAKE)" -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build BUILDDIR=${CMAKE_BINARY_DIR}/local CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} - CUDA8_GENCODE=${GENCODE} CXX=${CMAKE_CXX_COMPILER} + CUDA8_GENCODE=${GENCODE} CXX=${CMAKE_CXX_COMPILER} CXX_FLAGS=${NCCL_FLAGS} INSTALL_COMMAND "") set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/local/lib/libnccl_static.a) diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 903ee3ba6..618ba5c24 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -36,4 +36,4 @@ template void Add, BinaryFunctor >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Add, marian::functional::Assignee<2> > >, IntrusivePtr, IntrusivePtr >(marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Aggregate >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, IntrusivePtr >(marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); -template void marian::gpu::Add,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr 
>(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); +template void marian::gpu::Add,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index 29a3a5d61..52de513f6 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -32,11 +32,13 @@ template void AggregateAll, template void AggregateAll, Assignee<2>>, Assignee<3>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, Assignee<3>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void AggregateAll, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void AggregateAll, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); template void AggregateAll, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); template void marian::AggregateAll >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, 
marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); -template void marian::AggregateAll,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); +template void marian::AggregateAll,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); template void marian::AggregateAll >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); + #if COMPILE_FP16 template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void AggregateAll<__half, float, BinaryFunctor>>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -73,6 +75,6 @@ template void 
AggregateAll<__half, float, Assignee<1>, BinaryFunctor, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); -template void marian::AggregateAll<__half,float,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); +template void marian::AggregateAll<__half,float,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > 
>(std::shared_ptr, marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); #endif