From 1393cb0a592cae42dde7b15d1b4c80dec8e2e9c4 Mon Sep 17 00:00:00 2001 From: kylosus <33132401+kylosus@users.noreply.github.com> Date: Mon, 1 Jan 2024 18:20:39 +0300 Subject: [PATCH] CMakeLists.txt Improvements for CUDA (#1337) This PR bumps `cmake` version to `3.17` and replaces the deprecated `find_package(CUDA)` with [FindCUDAToolkit](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html), with a number of improvements to the compilation process: - CUDA Include and Library directories are now handled automatically by `cmake` - CUDA architecture handling is reworked: No more regex in `CMakeLists.txt` or manual `-gencode` string generation in python code. - CUDA source files are now included directly in the targets: cmake handles proper compilation and linking of device code automatically. - Similar modifications to `OpenMP` and `Threads` targets --- dace/codegen/CMakeLists.txt | 86 ++++++++++++---------------- dace/codegen/targets/cuda.py | 8 +-- dace/codegen/tools/get_cuda_arch.cpp | 15 +++-- dace/config_schema.yml | 4 +- 4 files changed, 54 insertions(+), 59 deletions(-) diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt index d77fea65e4..23ee0e40ee 100644 --- a/dace/codegen/CMakeLists.txt +++ b/dace/codegen/CMakeLists.txt @@ -1,5 +1,5 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
-cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.17) project(dace_program) # General options @@ -9,6 +9,9 @@ set(DACE_FILES "" CACHE STRING "List of host code files relative to the root of set(DACE_LIBS "" CACHE STRING "Extra libraries") set(HLSLIB_PART_NAME "${DACE_XILINX_PART_NAME}") +# CUDA +set(DACE_CUDA_ARCHITECTURES_DEFAULT "" CACHE STRING "Default CUDA architectures in case native not found") + # FPGA specific set(DACE_FPGA_AUTOBUILD_BITSTREAM OFF CACHE STRING "Automatically build bitstreams if they are not present.") @@ -60,7 +63,7 @@ foreach(DACE_FILE ${DACE_FILES}) set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE}) else() set(DACE_ENABLE_CUDA ON) - set(DACE_CUDA_FILES ${DACE_CUDA_FILES} ${DACE_FILE}) + set(DACE_CPP_FILES ${DACE_CPP_FILES} ${DACE_FILE}) endif() elseif(${DACE_FILE_TARGET} STREQUAL "xilinx") set(DACE_ENABLE_XILINX ON) @@ -103,24 +106,42 @@ include_directories(${DACE_RUNTIME_DIR}/include) # Global DaCe external dependencies find_package(Threads REQUIRED) find_package(OpenMP REQUIRED COMPONENTS CXX) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + +list(APPEND DACE_LIBS Threads::Threads) +list(APPEND DACE_LIBS OpenMP::OpenMP_CXX) + add_definitions(-DDACE_BINARY_DIR=\"${CMAKE_BINARY_DIR}\") -set(DACE_LIBS ${DACE_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${OpenMP_CXX_LIBRARIES}) + if(DACE_ENABLE_MPI) find_package(MPI REQUIRED) - include_directories(${MPI_CXX_INCLUDE_PATH}) - set(DACE_LIBS ${DACE_LIBS} ${MPI_CXX_LIBRARIES}) + list(APPEND DACE_LIBS MPI::MPI_CXX) endif() + if(DACE_ENABLE_CUDA) - find_package(CUDA REQUIRED) - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - include_directories(${CUDA_INCLUDE_DIRS}) - if (MSVC_IDE) - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) - else() - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) + set(CUDAToolkit_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + + find_package(CUDAToolkit REQUIRED) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + + # CMake 3.24: 
set_property(TARGET tgt PROPERTY CUDA_ARCHITECTURES native) + if (NOT DEFINED LOCAL_CUDA_ARCHITECTURES) + execute_process(COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" "--run" + "${CMAKE_SOURCE_DIR}/tools/get_cuda_arch.cpp" + OUTPUT_VARIABLE _local_arch RESULT_VARIABLE _arch_res) + + if(_arch_res EQUAL 0) + set(LOCAL_CUDA_ARCHITECTURES "${_local_arch}" CACHE STRING "Detected local GPUs for compilation") + message(STATUS "Local CUDA architectures detected: ${LOCAL_CUDA_ARCHITECTURES}") + else() + set(LOCAL_CUDA_ARCHITECTURES "${DACE_CUDA_ARCHITECTURES_DEFAULT}" CACHE STRING "Detected local GPUs for compilation") + message(STATUS "No local CUDA-capable GPUs found. Using default: ${DACE_CUDA_ARCHITECTURES_DEFAULT}") + endif() endif() - set(DACE_LIBS ${DACE_LIBS} ${CUDA_LIBRARIES}) + + set(CMAKE_CUDA_ARCHITECTURES "${LOCAL_CUDA_ARCHITECTURES}") + enable_language(CUDA) + list(APPEND DACE_LIBS CUDA::cudart) add_definitions(-DWITH_CUDA) if (MSVC_IDE) @@ -242,38 +263,6 @@ if (DACE_ENABLE_RTL AND DACE_ENABLE_XILINX) include ("${DACE_RTLLIB_DIR}/cmake/rtl_target.cmake") endif() -# Create CUDA object files -if(DACE_ENABLE_CUDA) - # Get local CUDA architectures - if (NOT DEFINED LOCAL_CUDA_ARCHITECTURES) - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin" "${CMAKE_CXX_COMPILER}" "--run" - "${CMAKE_SOURCE_DIR}/tools/get_cuda_arch.cpp" - OUTPUT_VARIABLE _arch_out RESULT_VARIABLE _arch_res - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(_arch_res EQUAL 0) - string(REGEX REPLACE "\n" ";" _arch_out "${_arch_out}") - list(GET _arch_out -1 _local_arch) - string(REGEX REPLACE " " ";" _local_arch "${_local_arch}") - set(LOCAL_CUDA_ARCHITECTURES "${_local_arch}" CACHE STRING "Detected local GPUs for compilation") - message(STATUS "Local CUDA architectures detected: ${LOCAL_CUDA_ARCHITECTURES}") - else() - set(LOCAL_CUDA_ARCHITECTURES "" CACHE STRING "Detected local GPUs for compilation") - message(STATUS "No local CUDA-capable GPUs found") - endif() - endif() - - # Add flags 
to compile for local CUDA architectures - foreach(var ${LOCAL_CUDA_ARCHITECTURES}) - list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${var},code=sm_${var}) - endforeach() - - cuda_include_directories(${DACE_RUNTIME_DIR}/include) - cuda_compile(DACE_CUDA_OBJECTS ${DACE_CUDA_FILES}) - set(DACE_OBJECTS ${DACE_OBJECTS} ${DACE_CUDA_OBJECTS}) -endif() # DACE_ENABLE_CUDA - - # Create HIP object files if(DACE_ENABLE_HIP) # Get local AMD architectures @@ -580,7 +569,7 @@ include("targets/mlir/mlir.cmake") # Create DaCe library file add_library(${DACE_PROGRAM_NAME} SHARED ${DACE_CPP_FILES} ${DACE_OBJECTS}) -target_link_libraries(${DACE_PROGRAM_NAME} ${DACE_LIBS}) +target_link_libraries(${DACE_PROGRAM_NAME} PUBLIC ${DACE_LIBS}) # Add additional required files if(DACE_ENABLE_INTELFPGA) @@ -599,6 +588,7 @@ if(DACE_ENABLE_INTELFPGA) DEPENDS ${DACE_PROGRAM_NAME}_hardware.aocx) endif() endif() + if(DACE_ENABLE_XILINX) if(DACE_XILINX_MODE STREQUAL "software_emulation" AND DACE_FPGA_AUTOBUILD_BITSTREAM) add_custom_target(autobuild_bitstream ALL @@ -619,7 +609,7 @@ endif() # Create DaCe loader stub add_library(dacestub_${DACE_PROGRAM_NAME} SHARED "${CMAKE_SOURCE_DIR}/tools/dacestub.cpp") -target_link_libraries(dacestub_${DACE_PROGRAM_NAME} ${CMAKE_THREAD_LIBS_INIT} ${OpenMP_CXX_LIBRARIES}) +target_link_libraries(dacestub_${DACE_PROGRAM_NAME} Threads::Threads OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS}) # Windows-specific fixes if (MSVC_IDE) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 5060339e18..4e008e13ac 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -484,11 +484,11 @@ def cmake_options(): cuda_arch = Config.get('compiler', 'cuda', 'cuda_arch').split(',') cuda_arch = [ca for ca in cuda_arch if ca is not None and len(ca) > 0] - flags = Config.get("compiler", "cuda", "args") - flags += ' ' + ' '.join('-gencode arch=compute_{arch},code=sm_{arch}'.format(arch=arch) - for arch in cuda_arch) + cuda_arch = ';'.join(cuda_arch) + 
options.append(f'-DDACE_CUDA_ARCHITECTURES_DEFAULT="{cuda_arch}"') - options.append("-DCUDA_NVCC_FLAGS=\"{}\"".format(flags)) + flags = Config.get("compiler", "cuda", "args") + options.append("-DCMAKE_CUDA_FLAGS=\"{}\"".format(flags)) if backend == 'hip': hip_arch = Config.get('compiler', 'cuda', 'hip_arch').split(',') diff --git a/dace/codegen/tools/get_cuda_arch.cpp b/dace/codegen/tools/get_cuda_arch.cpp index a8abf25de1..7dee1c2a25 100644 --- a/dace/codegen/tools/get_cuda_arch.cpp +++ b/dace/codegen/tools/get_cuda_arch.cpp @@ -1,12 +1,14 @@ // Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. #include <cuda_runtime.h> +#include <algorithm> #include <iostream> +#include <iterator> #include <set> #include <sstream> #include <string> -int main(int argc, char **argv) { +int main() { int count; if (cudaGetDeviceCount(&count) != cudaSuccess) return 1; @@ -22,10 +24,13 @@ int main(int argc, char **argv) { architectures.insert(ss.str()); } - // Print out architectures - for (std::set<std::string>::iterator iter = architectures.begin(); - iter != architectures.end(); ++iter) - std::cout << *iter << " "; + if (architectures.empty()) { + return 1; + } + + std::copy(architectures.begin(), std::prev(architectures.end(), 1), + std::ostream_iterator<std::string>(std::cout, ";")); + std::cout << *architectures.rbegin(); return 0; } diff --git a/dace/config_schema.yml b/dace/config_schema.yml index e6e2d568cc..737862cacc 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -289,8 +289,8 @@ required: type: str title: nvcc Arguments description: Compiler argument flags for CUDA - default: '-std=c++14 -Xcompiler -fPIC -O3 -Xcompiler -march=native --use_fast_math -Xcompiler -Wno-unused-parameter' - default_Windows: '-std=c++14 -O3 --use_fast_math' + default: '-Xcompiler -march=native --use_fast_math -Xcompiler -Wno-unused-parameter' + default_Windows: '-O3 --use_fast_math' hip_args: type: str