diff --git a/CMakeLists.txt b/CMakeLists.txt index 9eafd77e..487fa247 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ project( dwarf-p-cloudsc LANGUAGES C Fortran ) include( cmake/compat.cmake ) if( CMAKE_Fortran_COMPILER_ID MATCHES "GNU") - ecbuild_add_fortran_flags("-ffree-line-length-none") + # ecbuild_add_fortran_flags("-ffree-line-length-none") if( CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL "10.0") ecbuild_add_fortran_flags("-fallow-argument-mismatch") endif() diff --git a/README.md b/README.md index a9e9d59b..c43692ee 100644 --- a/README.md +++ b/README.md @@ -60,29 +60,6 @@ Balthasar Reuter (balthasar.reuter@ecmwf.int) move parameter structures to constant memory. To enable this variant, a suitable CUDA installation is required and the `--with-cuda` flag needs to be passed at the build stage. -- **dwarf-cloudsc-gpu-scc-cuf-k-caching**: GPU-enabled and further - optimized version of CLOUDSC that uses the SCC loop layout in - combination with loop fusion and temporary local array demotion, implemented - using CUDA-Fortran (CUF). To enable this variant, - a suitable CUDA installation is required and the `--with-cuda` flag - needs to be passed at the build stage. -- **CUDA C prototypes**: To enable these variants, a suitable - CUDA installation is required and the `--with-cuda` flag needs - to be pased at the build stage. - - **dwarf-cloudsc-cuda**: GPU-enabled, CUDA C version of CLOUDSC. - - **dwarf-cloudsc-cuda-hoist**: GPU-enabled, optimized CUDA C version - of CLOUDSC including host side hoisted temporary local variables. - - **dwarf-cloudsc-cuda-k-caching**: GPU-enabled, further optimized CUDA - C version of CLOUDSC including loop fusion and temporary local - array demotion. -- **dwarf-cloudsc-gpu-scc-field**: GPU-enabled and optimized version of - CLOUDSC that uses the SCC loop layout, and a dedicated Fortran FIELD - API to manage device offload and copyback. 
The intent is to demonstrate - the explicit use of pinned host memory to speed-up data transfers, as - provided by the shipped prototype implmentation, and investigate the - effect of different data storage allocation layouts. To enable this - variant, a suitable CUDA installation is required and the - `--with-cuda` flag needs to be passed at the build stage. ## Download and Installation @@ -231,6 +208,17 @@ cd build ./bin/dwarf-cloudsc-fortran 4 16384 32 # The cleaned-up Fortran ./bin/dwarf-cloudsc-c 4 16384 32 # The standalone C version ``` +### Building on NEC SX-Aurora TSUBASA +To build on a NEC SX-Aurora TSUBASA system, run the following commands: + +```sh +./cloudsc-bundle create +HDF5_ROOT=HDF5-installation-PATH ./cloudsc-bundle build --arch arch/ecmwf/aurora/nec/4.0.0/ [--single-precision] [--with-mpi] --hdf5 ON --cloudsc-fortran ON --cloudsc-prototype1 OFF --verbose --log DEBUG +``` + +Currently available `NEC compiler/version` selections are: + +* `nec/4.0.0 (nfort, ncc, nc++)` ### Running on ECMWF's Atos BullSequana XH2000 @@ -272,27 +260,6 @@ srun bash -c "CUDA_VISIBLE_DEVICES=\$SLURM_LOCALID bin/dwarf-cloudsc-gpu-scc-hoi In principle, the same should work for multi-node execution (`-N 2`, `-N 4` etc.) once interconnect issues are resolved. -### GPU runs: Timing device kernels and data transfers - -For GPU-enabled runs two internal timer results are reported: - -* The isolated compute time of the main compute kernel on device (where `#BLKS == 1`) -* The overall time of the execution loop including data offload and copyback - -It is important to note that due to the nature of the kernel, data -transfer overheads will dominate timings, and that most supported GPU -variants aim to optimise compute kernel timings only. However, a -dedicated variant `dwarf-cloudsc-gpu-scc-field` has been added to -explore host-side memory pinning, which improves data transfer times -and alternative data layout strategies. 
By default, this will allocate -each array variable individually in pinned memory. A runtime flag -`CLOUDSC_PACKED_STORAGE=ON` can be used to enable "packed" storage, -where multiple arrays are stored in a single base allocation, eg. - -```sh -NV_ACC_CUDA_HEAPSIZE=8G CLOUDSC_PACKED_STORAGE=ON ./bin/dwarf-cloudsc-gpu-scc-field 1 80000 128 -``` - ## Loki transformations for CLOUDSC [Loki](https://github.com/ecmwf-ifs/loki) is an in-house developed diff --git a/arch/ecmwf/aurora/nec/4.0.0/env.sh b/arch/ecmwf/aurora/nec/4.0.0/env.sh new file mode 100644 index 00000000..aa9c3fbf --- /dev/null +++ b/arch/ecmwf/aurora/nec/4.0.0/env.sh @@ -0,0 +1,33 @@ +# Source me to get the correct configure/build/run environment + +# Store tracing and disable (module is *way* too verbose) +{ tracing_=${-//[^x]/}; set +x; } 2>/dev/null + +module_load() { + echo "+ module load $1" + module load $1 +} +module_unload() { + echo "+ module unload $1" + module unload $1 +} + +export FC=nfort +export CC=ncc +export CXX=nc++ + +set -x + +# Increase stack size to maximum +ulimit -S -s unlimited + +# Enable floating point error trapping at run time +export VE_FPE_ENABLE=DIV,INV,FOF,FUF,INE + + +export PATH="/local/hdd/nabr/openmpi/nvhpc-nompi/20.9/bin:$PATH" + +# Restore tracing to stored setting +if [[ -n "$tracing_" ]]; then set -x; else set +x; fi + +export ECBUILD_TOOLCHAIN="./toolchain.cmake" diff --git a/arch/ecmwf/aurora/nec/4.0.0/toolchain.cmake b/arch/ecmwf/aurora/nec/4.0.0/toolchain.cmake new file mode 100644 index 00000000..535635d9 --- /dev/null +++ b/arch/ecmwf/aurora/nec/4.0.0/toolchain.cmake @@ -0,0 +1,89 @@ + +#################################################################### +# COMPILER +#################################################################### + +include( /opt/nec/ve/share/cmake/toolchainVE.cmake ) + + +set( ECBUILD_FIND_MPI ON ) + +#################################################################### +# Environment Variables 
+####################################################################
+set(NMPI_ROOT /opt/nec/ve/mpi/2.23.0)  # NEC MPI install prefix; every MPI_* location below is built from it
+
+####################################################################
+# OpenMP FLAGS
+####################################################################
+# The same OpenMP switch is used for C, C++ and Fortran.
+set( OpenMP_C_FLAGS "-fopenmp " )
+set( OpenMP_CXX_FLAGS "-fopenmp " )
+set( OpenMP_Fortran_FLAGS "-fopenmp " )
+
+####################################################################
+# OpenACC FLAGS
+####################################################################
+# NOTE(review): -ta=tesla / -Mvect / -Mconcur / -Minfo look like PGI/NVHPC options carried over from a GPU toolchain - confirm they are meaningful for this compiler.
+set( OpenACC_Fortran_FLAGS "-acc -ta=tesla:lineinfo,deepcopy,maxregcount:100,fastmath" )
+set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Mvect=levels:6" )
+set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Mconcur=levels:6" )
+set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Minfo" )
+
+####################################################################
+# NEC MPI Compiler
+####################################################################
+# Compiler wrappers and libraries are single files (FILEPATH); include and module locations are directories (PATH).
+set(MPI_C_COMPILER ${NMPI_ROOT}/bin/mpincc CACHE FILEPATH "")
+set(MPI_C_INCLUDE_PATH ${NMPI_ROOT}/include CACHE PATH "")
+set(MPI_C_LIBRARIES ${NMPI_ROOT}/lib64/ve/libmpi.a CACHE FILEPATH "")
+set(MPI_C_COMPILE_FLAGS "-D_MPIPP_INCLUDE" CACHE STRING "")
+
+set(MPI_CXX_COMPILER ${NMPI_ROOT}/bin/mpinc++ CACHE FILEPATH "")
+set(MPI_CXX_INCLUDE_PATH ${NMPI_ROOT}/include CACHE PATH "")
+set(MPI_CXX_LIBRARIES ${NMPI_ROOT}/lib64/ve/libmpi++.a CACHE FILEPATH "")
+
+set(MPI_Fortran_COMPILER ${NMPI_ROOT}/bin/mpifort CACHE FILEPATH "")  # NOTE(review): C/C++ wrappers are mpincc/mpinc++; confirm the Fortran wrapper is named mpifort and not mpinfort
+set(MPI_Fortran_INCLUDE_PATH ${NMPI_ROOT}/include CACHE PATH "")
+set(MPI_Fortran_ADDITIONAL_INCLUDE_DIR ${NMPI_ROOT}/lib/ve/module CACHE PATH "")
+set(MPI_Fortran_LIBRARIES ${NMPI_ROOT}/lib64/ve/libmpi.a CACHE FILEPATH "")
+set(MPI_Fortran_COMPILE_FLAGS "-D_MPIPP_INCLUDE" CACHE STRING "")
+####################################################################
+# COMMON FLAGS
+####################################################################
+# Fortran options are accumulated one switch per statement.
+set(ECBUILD_Fortran_FLAGS "-fpic")
+string(APPEND ECBUILD_Fortran_FLAGS " -mstack-arrays")
+string(APPEND ECBUILD_Fortran_FLAGS " -fdiag-vector=3")
+string(APPEND ECBUILD_Fortran_FLAGS " -fcse-after-vectorization")
+string(APPEND ECBUILD_Fortran_FLAGS " -floop-collapse ")
+string(APPEND ECBUILD_Fortran_FLAGS " -floop-fusion ")
+string(APPEND ECBUILD_Fortran_FLAGS " -floop-interchange ")
+string(APPEND ECBUILD_Fortran_FLAGS " -floop-unroll-complete=200 ")
+string(APPEND ECBUILD_Fortran_FLAGS " -ftrace")
+# (switched off) -fmove-loop-invariants-if
+# (switched off) -freplace-loop-equation
+string(APPEND ECBUILD_Fortran_FLAGS " -msched-interblock ")
+# (switched off) -mvector-floating-divide-instruction
+string(APPEND ECBUILD_Fortran_FLAGS " -mvector-power-to-explog ")
+string(APPEND ECBUILD_Fortran_FLAGS " -mvector-sqrt-instruction ")
+string(APPEND ECBUILD_Fortran_FLAGS " -mvector-threshold=3 ")
+string(APPEND ECBUILD_Fortran_FLAGS " -finline-functions ")
+string(APPEND ECBUILD_Fortran_FLAGS " -finline-max-depth=5 ")
+string(APPEND ECBUILD_Fortran_FLAGS " -finline-max-function-size=200 ")
+string(APPEND ECBUILD_Fortran_FLAGS " -mvector-merge-conditional ")
+string(APPEND ECBUILD_Fortran_FLAGS " -fivdep ")
+# (switched off) -floop-strip-mine
+string(APPEND ECBUILD_Fortran_FLAGS " -muse-mmap ")
+# (switched off) -mvector-packed
+string(APPEND ECBUILD_Fortran_FLAGS " -report-all")
+
+set(ECBUILD_Fortran_FLAGS_BIT "-O4 -mvector-fma")
+
+set( ECBUILD_C_FLAGS "-O2 " ) + +set( ECBUILD_CXX_FLAGS "-O2" ) + +# Fix for C++ template headers needed for Serialbox +set( GNU_HEADER_INCLUDE "-I/usr/local/apps/gcc/7.3.0/lib/gcc/x86_64-linux-gnu/7.3.0/include-fixed" ) +set( ECBUILD_CXX_FLAGS "${ECBUILD_CXX_FLAGS} ${GNU_HEADER_INCLUDE}" ) diff --git a/arch/toolchains/ecmwf-hpc2020-nvhpc.cmake b/arch/toolchains/ecmwf-hpc2020-nvhpc.cmake index f191e9a7..ce8de9da 100644 --- a/arch/toolchains/ecmwf-hpc2020-nvhpc.cmake +++ b/arch/toolchains/ecmwf-hpc2020-nvhpc.cmake @@ -37,14 +37,6 @@ set( OpenACC_Fortran_FLAGS "-acc=gpu -mp=gpu -gpu=cc80,lineinfo,fastmath" CACHE # Enable this to get more detailed compiler output # set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Minfo" ) -#################################################################### -# CUDA FLAGS -#################################################################### - -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 80) -endif() - #################################################################### # COMMON FLAGS #################################################################### diff --git a/arch/toolchains/ecmwf-nec-aurora.cmake b/arch/toolchains/ecmwf-nec-aurora.cmake new file mode 100644 index 00000000..535635d9 --- /dev/null +++ b/arch/toolchains/ecmwf-nec-aurora.cmake @@ -0,0 +1,89 @@ + +#################################################################### +# COMPILER +#################################################################### + +include( /opt/nec/ve/share/cmake/toolchainVE.cmake ) + + +set( ECBUILD_FIND_MPI ON ) + +#################################################################### +# Environment Variables +#################################################################### +set(NMPI_ROOT /opt/nec/ve/mpi/2.23.0) + +#################################################################### +# OpenMP FLAGS +#################################################################### + +set( OpenMP_C_FLAGS "-fopenmp " ) +set( 
OpenMP_CXX_FLAGS "-fopenmp " ) +set( OpenMP_Fortran_FLAGS "-fopenmp " ) + +#################################################################### +# OpenAcc FLAGS +#################################################################### + +set( OpenACC_Fortran_FLAGS "-acc -ta=tesla:lineinfo,deepcopy,maxregcount:100,fastmath" ) +set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Mvect=levels:6" ) +set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Mconcur=levels:6" ) +set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Minfo" ) + +#################################################################### +# NEC MPI Compiler +#################################################################### + +set(MPI_C_COMPILER ${NMPI_ROOT}/bin/mpincc CACHE FILEPATH "") +set(MPI_C_INCLUDE_PATH ${NMPI_ROOT}/include CACHE FILEPATH "") +set(MPI_C_LIBRARIES ${NMPI_ROOT}/lib64/ve/libmpi.a CACHE FILEPATH "") +set(MPI_C_COMPILE_FLAGS "-D_MPIPP_INCLUDE" CACHE STRING "") + +set(MPI_CXX_COMPILER ${NMPI_ROOT}/bin/mpinc++ CACHE FILEPATH "") +set(MPI_CXX_INCLUDE_PATH ${NMPI_ROOT}/include CACHE FILEPATH "") +set(MPI_CXX_LIBRARIES ${NMPI_ROOT}/lib64/ve/libmpi++.a CACHE FILEPATH "") + +set(MPI_Fortran_COMPILER ${NMPI_ROOT}/bin/mpifort CACHE FILEPATH "") +set(MPI_Fortran_INCLUDE_PATH ${NMPI_ROOT}/include CACHE FILEPATH "") +set(MPI_Fortran_ADDITIONAL_INCLUDE_DIR ${NMPI_ROOT}/lib/ve/module CACHE FILEPATH "") +set(MPI_Fortran_LIBRARIES ${NMPI_ROOT}/lib64/ve/libmpi.a CACHE FILEPATH "") +set(MPI_Fortran_COMPILE_FLAGS "-D_MPIPP_INCLUDE" CACHE STRING "") +#################################################################### +# COMMON FLAGS +#################################################################### + +set(ECBUILD_Fortran_FLAGS "-fpic") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mstack-arrays") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -fdiag-vector=3") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -fcse-after-vectorization") +set(ECBUILD_Fortran_FLAGS 
"${ECBUILD_Fortran_FLAGS} -floop-collapse ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -floop-fusion ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -floop-interchange ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -floop-unroll-complete=200 ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -ftrace") +###set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -fmove-loop-invariants-if ") +###set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -freplace-loop-equation ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -msched-interblock ") +###set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mvector-floating-divide-instruction ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mvector-power-to-explog ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mvector-sqrt-instruction ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mvector-threshold=3 ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -finline-functions ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -finline-max-depth=5 ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -finline-max-function-size=200 ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mvector-merge-conditional ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -fivdep ") +###set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -floop-strip-mine ") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -muse-mmap ") +##set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -mvector-packed") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -report-all") + +set( ECBUILD_Fortran_FLAGS_BIT "-O4 -mvector-fma" ) + +set( ECBUILD_C_FLAGS "-O2 " ) + +set( ECBUILD_CXX_FLAGS "-O2" ) + +# Fix for C++ template headers needed for Serialbox +set( GNU_HEADER_INCLUDE "-I/usr/local/apps/gcc/7.3.0/lib/gcc/x86_64-linux-gnu/7.3.0/include-fixed" ) +set( ECBUILD_CXX_FLAGS "${ECBUILD_CXX_FLAGS} ${GNU_HEADER_INCLUDE}" ) diff --git a/arch/toolchains/ecmwf-volta-pgi-gpu.cmake 
b/arch/toolchains/ecmwf-volta-pgi-gpu.cmake index df80087a..96359878 100644 --- a/arch/toolchains/ecmwf-volta-pgi-gpu.cmake +++ b/arch/toolchains/ecmwf-volta-pgi-gpu.cmake @@ -50,8 +50,6 @@ set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Ktrap=fp") set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Kieee") set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Mdaz") -set(ECBUILD_Fortran_LINK_FLAGS "-gpu=pinned") - set( ECBUILD_Fortran_FLAGS_BIT "-O2 -gopt" ) set( ECBUILD_C_FLAGS "-O2 -gopt -traceback" ) diff --git a/src/cloudsc_fortran/CMakeLists.txt b/src/cloudsc_fortran/CMakeLists.txt index 3719bcfc..80728fb6 100644 --- a/src/cloudsc_fortran/CMakeLists.txt +++ b/src/cloudsc_fortran/CMakeLists.txt @@ -21,11 +21,11 @@ if( HAVE_CLOUDSC_FORTRAN ) dwarf_cloudsc.F90 cloudsc_driver_mod.F90 cloudsc.F90 - LIBS - cloudsc-common-lib DEFINITIONS ${CLOUDSC_DEFINITIONS} ) + target_link_libraries( dwarf-cloudsc-fortran PRIVATE cloudsc-common-lib ) + # Create symlink for the input data if( HAVE_SERIALBOX ) execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink @@ -33,6 +33,8 @@ if( HAVE_CLOUDSC_FORTRAN ) endif() if( HAVE_HDF5 ) + target_include_directories( dwarf-cloudsc-fortran PRIVATE ${HDF5_Fortran_INCLUDE_DIRS} ) + target_link_libraries( dwarf-cloudsc-fortran PRIVATE ${HDF5_LIBRARIES} ) execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/../../config-files/input.h5 ${CMAKE_CURRENT_BINARY_DIR}/../../../input.h5 ) execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink