diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1b6c763515..63622ebb08 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -50,6 +50,10 @@ # ON_QUARTZ: # Should the Quartz pipeline run? Set to "ON" or "OFF" to enable/disable. # +# SPACK_PREFIX: prefix used for shared spack installation. +# Usually this is a spack version number that matches the version set in the uberenv_config.json file. +# Spack installs go in /usr/workspace/sundials/spack_installs/${SPACK_PREFIX}/$(hostname). +# # SHARED_SPACK: # If "ON", then a shared spack install that has been pre-configured is utilized. # If "OFF", then a new spack instance is created for every build (meaning all TPLs have to be installed). @@ -75,6 +79,8 @@ variables: VERBOSE_TEST: "OFF" ON_LASSEN: "ON" ON_QUARTZ: "ON" + ON_CORONA: "ON" + SPACK_PREFIX: "v0.19.1" SHARED_SPACK: "UPSTREAM" BENCHMARK: "OFF" BENCHMARK_NNODES: 4 @@ -89,7 +95,7 @@ stages: - q_build_and_test - l_build_and_test - l_build_and_bench - # - c_build_and_test + - c_build_and_test # These are also templates (.name) that define project specific build commands. 
# If an allocation exist with the name defined in this pipeline, the job will @@ -101,9 +107,10 @@ stages: --job-name=${ALLOC_NAME} .gitlab/build_and_test.sh # Corona -.build_toss_3_x86_64_ib_corona_script: +.build_toss_4_x86_64_ib_corona_script: script: - - srun -p mi60 --interactive -t ${DEFAULT_TIME} -N 1 .gitlab/build_and_test.sh + - echo ${ALLOC_NAME} + - flux alloc -N 1 -t ${DEFAULT_TIME} .gitlab/build_and_test.sh # CORAL systems use spectrum LSF instead of SLURM .build_blueos_3_ppc64le_ib_script: @@ -123,3 +130,5 @@ include: - local: .gitlab/quartz-jobs.yml - local: .gitlab/lassen-templates.yml - local: .gitlab/lassen-jobs.yml + - local: .gitlab/corona-templates.yml + - local: .gitlab/corona-jobs.yml diff --git a/.gitlab/build_and_test.sh b/.gitlab/build_and_test.sh index 51d5bc63bf..1bb0aec5bf 100755 --- a/.gitlab/build_and_test.sh +++ b/.gitlab/build_and_test.sh @@ -20,6 +20,7 @@ job_unique_id=${CI_JOB_ID:-""} sys_type=${SYS_TYPE:-""} py_env_path=${PYTHON_ENVIRONMENT_PATH:-""} +spack_prefix=${SHARED_SPACK_PREFIX:-${SPACK_PREFIX:-"v0.19.1"}} shared_spack=${SHARED_SPACK:-"UPSTREAM"} # Dependencies @@ -46,8 +47,16 @@ hostname=${hostname%%[0-9]*} BUILD_JOBS=${BUILD_JOBS:-"1"} # load newer python to try the clingo concretizer -echo "module load python/3.8.2" -module load python/3.8.2 +# machine specific loads +if [[ "${hostname}" == "corona" ]]; then + echo "module load python/3.9.12" + module load python/3.9.12 + echo "module load rocm/5.4.1" + module load rocm/5.4.1 +else + echo "module load python/3.8.2" + module load python/3.8.2 +fi if [[ "${option}" != "--build-only" && "${option}" != "--test-only" ]] then @@ -89,7 +98,7 @@ then if [[ -d /usr/workspace/sundials ]] then - upstream="/usr/workspace/sundials/spack_installs/${hostname}" + upstream="/usr/workspace/sundials/spack_installs/${spack_prefix}/${hostname}" mkdir -p "${upstream}" upstream_opt="--upstream=${upstream}" fi @@ -174,7 +183,7 @@ then mkdir -p "${build_dir}" && cd "${build_dir}" date - + $cmake_exe --version 
# configure diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml new file mode 100644 index 0000000000..450a9197ba --- /dev/null +++ b/.gitlab/corona-jobs.yml @@ -0,0 +1,37 @@ +# ------------------------------------------------------------------------------ +# SUNDIALS Copyright Start +# Copyright (c) 2002-2021, Lawrence Livermore National Security +# and Southern Methodist University. +# All rights reserved. +# +# See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-3-Clause +# SUNDIALS Copyright End +# ------------------------------------------------------------------------------ + +# ------------------------------------------------------------------------------ +# HIP +# ------------------------------------------------------------------------------ + +# Builds with Hip +corona_clang_hip: + parallel: + matrix: + - COMPILER_SPEC: rocmcc@5.4.1 + AMDGPU_TARGET: [gfx906] + variables: + SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double amdgpu_target=${AMDGPU_TARGET} scheduler=flux +rocm+mpi" + extends: .corona_build_and_test + +# ------------------------------------------------------------------------------ +# HIP + TPLs +# ------------------------------------------------------------------------------ +corona_clang_hip_tpls: + parallel: + matrix: + - COMPILER_SPEC: rocmcc@5.4.1 + AMDGPU_TARGET: [gfx906] + variables: + SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double ~int64 amdgpu_target=${AMDGPU_TARGET} scheduler=flux +rocm+mpi+magma+raja ^magma+rocm amdgpu_target=${AMDGPU_TARGET} ^raja+rocm~openmp~examples~exercises amdgpu_target=${AMDGPU_TARGET}" + extends: .corona_build_and_test diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml new file mode 100644 index 0000000000..a901f4fa48 --- /dev/null +++ b/.gitlab/corona-templates.yml @@ -0,0 +1,35 @@ +# ------------------------------------------------------------------------------ +# SUNDIALS Copyright Start +# Copyright (c) 
2002-2021, Lawrence Livermore National Security +# and Southern Methodist University. +# All rights reserved. +# +# See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-3-Clause +# SUNDIALS Copyright End +# ------------------------------------------------------------------------------ + +# ------------------------------------------------------------------------------ +# Tags and rules to run tests on Corona +# ------------------------------------------------------------------------------ + +# Generic Corona build job, extending build script for Toss 4 x86_64 Systems +.corona_build_and_test: + tags: + - shell + - corona + extends: [.build_toss_4_x86_64_ib_corona_script] + stage: c_build_and_test + needs: [] + artifacts: + paths: + - spack-*.txt + - build_*/* + when: always + rules: + # Don't run if... + - if: '$CI_COMMIT_BRANCH =~ /_cnone/ || $ON_CORONA == "OFF" || $BENCHMARK == "ON"' + when: never + # Default is to run if previous stage succeeded + - when: on_success diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 7dc51e99d4..388b95ec08 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -78,7 +78,7 @@ lassen_gcc_cuda_tpls: - COMPILER_SPEC: gcc@8.3.1 CUDA_SPEC: [cuda@11.5.0] variables: - SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double ~int64 +mpi+openmp+cuda+raja+magma+superlu-dist cuda_arch=70 ^superlu-dist+cuda cuda_arch=70 ^magma+cuda cuda_arch=70 ^raja+cuda~openmp~examples~exercises cuda_arch=70 ^${CUDA_SPEC}" + SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 precision=double ~int64 +mpi+openmp+cuda+raja+magma+superlu-dist+petsc+hypre+ginkgo cuda_arch=70 ^ginkgo+cuda cuda_arch=70 ^hypre ^petsc+cuda cuda_arch=70 ^superlu-dist+cuda cuda_arch=70 ^magma+cuda cuda_arch=70 ^raja+cuda~openmp~examples~exercises cuda_arch=70 ^${CUDA_SPEC}" extends: .lassen_build_and_test # ------------------------------------------------------------------------------ diff --git a/.gitlab/quartz-jobs.yml 
b/.gitlab/quartz-jobs.yml index 6e4b87a236..93599c3188 100644 --- a/.gitlab/quartz-jobs.yml +++ b/.gitlab/quartz-jobs.yml @@ -55,11 +55,11 @@ quartz_clang_tpls: parallel: matrix: - - COMPILER_SPEC: clang@12.0.0 + - COMPILER_SPEC: clang@12.0.1 INDEX_SPEC: [~int64] PRECISION_SPEC: [double] variables: - SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu" + SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu +petsc ^suite-sparse@5.13.0 ^openblas" extends: .quartz_build_and_test quartz_gcc_tpls: @@ -69,7 +69,7 @@ quartz_gcc_tpls: INDEX_SPEC: [~int64] PRECISION_SPEC: [double] variables: - SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu" + SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre +superlu-dist +lapack +klu +petsc ^suite-sparse@5.13.0" extends: .quartz_build_and_test quartz_intel_tpls: @@ -79,5 +79,5 @@ quartz_intel_tpls: INDEX_SPEC: [~int64] PRECISION_SPEC: [double] variables: - SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre ~superlu-dist +lapack +klu" + SPEC: "%${COMPILER_SPEC} cstd=99 cxxstd=14 ${INDEX_SPEC} precision=${PRECISION_SPEC} +mpi +openmp +hypre ~superlu-dist +lapack +klu ^suite-sparse@5.13.0" extends: .quartz_build_and_test diff --git a/.gitlab/radiuss-spack-configs b/.gitlab/radiuss-spack-configs index 8194d152ac..6f22afc1ec 160000 --- a/.gitlab/radiuss-spack-configs +++ b/.gitlab/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 8194d152acfdfcc8ad5a27051e9988f1e20e8779 +Subproject commit 6f22afc1ece86c479d2d2a64e14736ef00b632d6 diff --git a/.gitlab/spack_packages/camp/package.py b/.gitlab/spack_packages/camp/package.py deleted file mode 100644 index aa801d21dc..0000000000 --- 
a/.gitlab/spack_packages/camp/package.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) - -from spack import * - - -class Camp(CMakePackage, CudaPackage, ROCmPackage): - """ - Compiler agnostic metaprogramming library providing concepts, - type operations and tuples for C++ and cuda - """ - - homepage = "https://github.com/LLNL/camp" - git = "https://github.com/LLNL/camp.git" - url = "https://github.com/LLNL/camp/archive/v0.1.0.tar.gz" - - maintainers = ['trws'] - - version('main', branch='main', submodules='True') - version('2022.03.2', sha256='e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721') - version('2022.03.0', sha256='e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721') - version('0.3.0', sha256='129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb') - version('0.2.3', sha256='58a0f3bd5eadb588d7dc83f3d050aff8c8db639fc89e8d6553f9ce34fc2421a7') - version('0.2.2', sha256='194d38b57e50e3494482a7f94940b27f37a2bee8291f2574d64db342b981d819') - version('0.1.0', sha256='fd4f0f2a60b82a12a1d9f943f8893dc6fe770db493f8fae5ef6f7d0c439bebcc') - - # TODO: figure out gtest dependency and then set this default True. 
- variant('tests', default=False, description='Build tests') - - depends_on('cub', when='+cuda') - - depends_on('blt') - - def cmake_args(self): - spec = self.spec - - options = [] - - options.append("-DBLT_SOURCE_DIR={0}".format(spec['blt'].prefix)) - - if '+cuda' in spec: - options.extend([ - '-DENABLE_CUDA=ON', - '-DCUDA_TOOLKIT_ROOT_DIR=%s' % (spec['cuda'].prefix)]) - - if not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - options.append('-DCUDA_ARCH=sm_{0}'.format(cuda_arch[0])) - flag = '-arch sm_{0}'.format(cuda_arch[0]) - options.append('-DCMAKE_CUDA_FLAGS:STRING={0}'.format(flag)) - else: - options.append('-DENABLE_CUDA=OFF') - - if '+rocm' in spec: - options.extend([ - '-DENABLE_HIP=ON', - '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix) - ]) - archs = self.spec.variants['amdgpu_target'].value - if archs != 'none': - arch_str = ",".join(archs) - options.append( - '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}'.format(arch_str) - ) - else: - options.append('-DENABLE_HIP=OFF') - - options.append(self.define_from_variant('ENABLE_TESTS', 'tests')) - - return options \ No newline at end of file diff --git a/.gitlab/spack_packages/netlib-lapack/ibm-xl-3.9.1.patch b/.gitlab/spack_packages/netlib-lapack/ibm-xl-3.9.1.patch new file mode 100644 index 0000000000..c9e1707857 --- /dev/null +++ b/.gitlab/spack_packages/netlib-lapack/ibm-xl-3.9.1.patch @@ -0,0 +1,108 @@ +Fixes for IBM XL and Cray CCE builds: + +* Correct path to the fallback configuration used to handle mangling for + C++/Fortran compatibility (CCE, XL) + +* Change logic for detecting recursive fortran flags to (a) Include XL +(qrecur), and (b) Be explicit, since not every compiler will correctly reject +an incorrect option (ALL) + +NOTE: This patch has been accepted upstream +(see https://github.com/Reference-LAPACK/lapack/pull/621) + +############################################################################## + +diff -Naur a/CBLAS/CMakeLists.txt b/CBLAS/CMakeLists.txt 
+--- a/CBLAS/CMakeLists.txt 2021-03-25 12:25:15.000000000 -0600 ++++ b/CBLAS/CMakeLists.txt 2021-09-01 16:27:23.561355382 -0600 +@@ -11,9 +11,7 @@ + MACRO_NAMESPACE "F77_" + SYMBOL_NAMESPACE "F77_") + if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND) +- message(WARNING "Reverting to pre-defined include/lapacke_mangling.h") +- configure_file(include/lapacke_mangling_with_flags.h.in +- ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h) ++ message(WARNING "Reverting to pre-defined include/cblas_mangling.h") + configure_file(include/cblas_mangling_with_flags.h.in + ${LAPACK_BINARY_DIR}/include/cblas_mangling.h) + endif() +diff -Naur a/CMakeLists.txt b/CMakeLists.txt +--- a/CMakeLists.txt 2021-03-25 12:25:15.000000000 -0600 ++++ b/CMakeLists.txt 2021-09-02 09:49:18.070436958 -0600 +@@ -94,16 +94,22 @@ + + # Check if recursive flag exists + include(CheckFortranCompilerFlag) +-check_fortran_compiler_flag("-recursive" _recursiveFlag) +-check_fortran_compiler_flag("-frecursive" _frecursiveFlag) +-check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag) ++if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang) ++ check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag) ++elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU) ++ check_fortran_compiler_flag("-frecursive" _frecursiveFlag) ++elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) ++ check_fortran_compiler_flag("-recursive" _recursiveFlag) ++elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL) ++ check_fortran_compiler_flag("-qrecur" _qrecurFlag) ++endif() + + # Add recursive flag +-if(_recursiveFlag) +- string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}") ++if(_MrecursiveFlag) ++ string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) +- set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive" +- CACHE STRING "Recursive flag must be set" FORCE) ++ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive" ++ CACHE STRING "Recursive flag must be set" FORCE) + 
endif() + elseif(_frecursiveFlag) + string(REGEX MATCH "-frecursive" output_test "${CMAKE_Fortran_FLAGS}") +@@ -111,11 +117,17 @@ + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +-elseif(_MrecursiveFlag) +- string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}") ++elseif(_recursiveFlag) ++ string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) +- set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive" +- CACHE STRING "Recursive flag must be set" FORCE) ++ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive" ++ CACHE STRING "Recursive flag must be set" FORCE) ++ endif() ++elseif(_qrecurFlag) ++ string(REGEX MATCH "-qrecur" output_test "${CMAKE_Fortran_FLAGS}") ++ if(NOT output_test) ++ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur" ++ CACHE STRING "Recursive flag must be set" FORCE) + endif() + endif() + +@@ -124,7 +136,7 @@ + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") + endif() + if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) +- set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") ++ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict") + endif() + # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. + # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin +diff -Naur a/INSTALL/make.inc.XLF b/INSTALL/make.inc.XLF +--- a/INSTALL/make.inc.XLF 2021-03-25 12:25:15.000000000 -0600 ++++ b/INSTALL/make.inc.XLF 2021-09-02 09:50:02.664646455 -0600 +@@ -14,10 +14,10 @@ + # the compiler options desired when NO OPTIMIZATION is selected. + # + FC = xlf +-FFLAGS = -O3 -qfixed -qnosave ++FFLAGS = -O3 -qfixed -qnosave -qrecur + # For -O2, add -qstrict=none + FFLAGS_DRV = $(FFLAGS) +-FFLAGS_NOOPT = -O0 -qfixed -qnosave ++FFLAGS_NOOPT = -O0 -qfixed -qnosave -qrecur + + # Define LDFLAGS to the desired linker options for your machine. 
+ # diff --git a/.gitlab/spack_packages/netlib-lapack/ibm-xl.patch b/.gitlab/spack_packages/netlib-lapack/ibm-xl.patch new file mode 100644 index 0000000000..52b5f19719 --- /dev/null +++ b/.gitlab/spack_packages/netlib-lapack/ibm-xl.patch @@ -0,0 +1,53 @@ +Fixes for IBM XL and Cray CCE builds: + +* Avoid optimizations that would alter program semantics by changing the + qstrict activation threshold from O3 to O2 (XL) + +* Don't assume Fortran code is all in fixed source form; disable qfixed (XL) + +* Correct path to the fallback configuration used to handle mangling for + C++/Fortran compatibility (CCE, XL) +############################################################################## + +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -62,7 +62,7 @@ + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") + endif() + if("${CMAKE_Fortran_COMPILER}" MATCHES "xlf") +- set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") ++ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict") + endif() + # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. + # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin + +--- a/CMAKE/CheckLAPACKCompilerFlags.cmake ++++ b/CMAKE/CheckLAPACKCompilerFlags.cmake +@@ -43,12 +43,6 @@ + if( "${CMAKE_Fortran_FLAGS}" MATCHES "-qflttrap=[a-zA-Z:]:enable" ) + set( FPE_EXIT TRUE ) + endif() +- +- if( NOT ("${CMAKE_Fortran_FLAGS}" MATCHES "-qfixed") ) +- message( STATUS "Enabling fixed format F90/F95 with -qfixed" ) +- set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qfixed" +- CACHE STRING "Flags for Fortran compiler." 
FORCE ) +- endif() + + # HP Fortran + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "HP" ) + +--- a/CBLAS/CMakeLists.txt ++++ b/CBLAS/CMakeLists.txt +@@ -12,8 +12,8 @@ + SYMBOL_NAMESPACE "F77_") + if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND) + message(WARNING "Reverting to pre-defined include/lapacke_mangling.h") +- configure_file(include/lapacke_mangling_with_flags.h.in +- ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h) ++ configure_file(include/cblas_mangling_with_flags.h.in ++ ${LAPACK_BINARY_DIR}/include/cblas_mangling.h) + endif() + + include_directories(include ${LAPACK_BINARY_DIR}/include) + diff --git a/.gitlab/spack_packages/netlib-lapack/package.py b/.gitlab/spack_packages/netlib-lapack/package.py new file mode 100644 index 0000000000..cb1832d061 --- /dev/null +++ b/.gitlab/spack_packages/netlib-lapack/package.py @@ -0,0 +1,223 @@ +# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) +import spack.build_systems.cmake +from spack.package import * + + +class NetlibLapack(CMakePackage): + """LAPACK version 3.X is a comprehensive FORTRAN library that does + linear algebra operations including matrix inversions, least squares + solutions to linear sets of equations, eigenvector analysis, singular + value decomposition, etc. It is a very comprehensive and reputable + package that has found extensive use in the scientific community. 
+ + """ + + homepage = "https://www.netlib.org/lapack/" + url = "https://www.netlib.org/lapack/lapack-3.5.0.tgz" + tags = ["windows"] + + version( + "3.10.1", + sha256="cd005cd021f144d7d5f7f33c943942db9f03a28d110d6a3b80d718a295f7f714", + url="https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.10.1.tar.gz", + ) + version( + "3.10.0", + sha256="328c1bea493a32cac5257d84157dc686cc3ab0b004e2bea22044e0a59f6f8a19", + url="https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.10.0.tar.gz", + ) + version( + "3.9.1", + sha256="d0085d2caf997ff39299c05d4bacb6f3d27001d25a4cc613d48c1f352b73e7e0", + url="https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.9.1.tar.gz", + ) + version( + "3.9.0", + sha256="106087f1bb5f46afdfba7f569d0cbe23dacb9a07cd24733765a0e89dbe1ad573", + url="https://github.com/Reference-LAPACK/lapack/archive/v3.9.0.tar.gz", + ) + version( + "3.8.0", + sha256="deb22cc4a6120bff72621155a9917f485f96ef8319ac074a7afbc68aab88bcf6", + url="https://www.netlib.org/lapack/lapack-3.8.0.tar.gz", + ) + version("3.7.1", sha256="f6c53fd9f56932f3ddb3d5e24c1c07e4cd9b3b08e7f89de9c867125eecc9a1c8") + version("3.7.0", sha256="ed967e4307e986474ab02eb810eed1d1adc73f5e1e3bc78fb009f6fe766db3be") + version("3.6.1", sha256="888a50d787a9d828074db581c80b2d22bdb91435a673b1bf6cd6eb51aa50d1de") + version("3.6.0", sha256="a9a0082c918fe14e377bbd570057616768dca76cbdc713457d8199aaa233ffc3") + version("3.5.0", sha256="9ad8f0d3f3fb5521db49f2dd716463b8fb2b6bc9dc386a9956b8c6144f726352") + version("3.4.2", sha256="60a65daaf16ec315034675942618a2230521ea7adf85eea788ee54841072faf0") + version("3.4.1", sha256="93b910f94f6091a2e71b59809c4db4a14655db527cfc5821ade2e8c8ab75380f") + version("3.4.0", sha256="a7139ef97004d0e3c4c30f1c52d508fd7ae84b5fbaf0dd8e792c167dc306c3e9") + version("3.3.1", sha256="56821ab51c29369a34e5085728f92c549a9aa926f26acf7eeac87b61eed329e4") + + # netlib-lapack is the reference implementation of LAPACK + for ver in [ + "3.10.1", + "3.10.0", + "3.9.1", + 
"3.9.0", + "3.8.0", + "3.7.1", + "3.7.0", + "3.6.1", + "3.6.0", + "3.5.0", + "3.4.2", + "3.4.1", + "3.4.0", + "3.3.1", + ]: + provides("lapack@" + ver, when="@" + ver) + + variant("shared", default=True, description="Build shared library version") + variant("external-blas", default=False, description="Build lapack with an external blas") + + variant("lapacke", default=True, description="Activates the build of the LAPACKE C interface") + variant("xblas", default=False, description="Builds extended precision routines using XBLAS") + + # Fixes for IBM XL and Cray CCE builds: + # Avoid optimizations that alter program semantics + # Don't assume fixed source form for Fortran + # Correct path to mangling config + patch("ibm-xl.patch", when="@3.7:3.8 %xl") + patch("ibm-xl.patch", when="@3.7:3.8 %xl_r") + patch("ibm-xl.patch", when="@3.7:3.8 %cce@9:") + + # https://github.com/Reference-LAPACK/lapack/pull/621 + # Fixes for IBM XL and Cray CCE builds: + # Correct path to mangling config + # Fix logic for detecting recursive Fortran flags + patch("ibm-xl-3.9.1.patch", when="@3.9.1 %xl") + patch("ibm-xl-3.9.1.patch", when="@3.9.1 %xl_r") + patch("ibm-xl-3.9.1.patch", when="@3.9.1 %cce@13:") + + # https://github.com/Reference-LAPACK/lapack/issues/228 + patch("undefined_declarations.patch", when="@3.8.0:3.8") + + # https://github.com/Reference-LAPACK/lapack/pull/268 + patch("testing.patch", when="@3.7.0:3.8") + + # virtual dependency + provides("blas", when="~external-blas") + provides("lapack") + + depends_on("blas", when="+external-blas") + depends_on("netlib-xblas+fortran+plain_blas", when="+xblas") + depends_on("python@2.7:", type="test") + + # We need to run every phase twice in order to get static and shared + # versions of the libraries. 
When ~shared, we run the default + # implementations of the CMakePackage's phases and get only one building + # directory 'spack-build-static' with -DBUILD_SHARED_LIBS:BOOL=OFF (see + # implementations of self.build_directory and self.cmake_args() below). + # When +shared, we run the overridden methods for the phases, each + # running the default implementation twice with different values for + # self._building_shared. As a result, we get two building directories: + # 'spack-build-static' with -DBUILD_SHARED_LIBS:BOOL=OFF and + # 'spack-build-shared' with -DBUILD_SHARED_LIBS:BOOL=ON. + _building_shared = False + + def patch(self): + # Fix cblas CMakeLists.txt -- has wrong case for subdirectory name. + if self.spec.satisfies("@3.6.0:"): + filter_file( + "${CMAKE_CURRENT_SOURCE_DIR}/CMAKE/", + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/", + "CBLAS/CMakeLists.txt", + string=True, + ) + + # Remove duplicate header file that gets generated during CMake shared + # builds: https://github.com/Reference-LAPACK/lapack/issues/583 + if self.spec.satisfies("platform=windows @0:3.9.1"): + force_remove("LAPACKE/include/lapacke_mangling.h") + + @property + def blas_libs(self): + shared = True if "+shared" in self.spec else False + query_parameters = self.spec.last_query.extra_parameters + query2libraries = { + tuple(): ["libblas"], + ("c", "fortran"): ["libcblas", "libblas"], + ("c",): ["libcblas"], + ("fortran",): ["libblas"], + } + key = tuple(sorted(query_parameters)) + libraries = query2libraries[key] + return find_libraries(libraries, root=self.prefix, shared=shared, recursive=True) + + @property + def lapack_libs(self): + shared = True if "+shared" in self.spec else False + query_parameters = self.spec.last_query.extra_parameters + query2libraries = { + tuple(): ["liblapack"], + ("c", "fortran"): ["liblapacke", "liblapack"], + ("c",): ["liblapacke"], + ("fortran",): ["liblapack"], + } + key = tuple(sorted(query_parameters)) + libraries = query2libraries[key] + return 
find_libraries(libraries, root=self.prefix, shared=shared, recursive=True) + + @property + def headers(self): + include_dir = self.spec.prefix.include + cblas_h = join_path(include_dir, "cblas.h") + lapacke_h = join_path(include_dir, "lapacke.h") + return HeaderList([cblas_h, lapacke_h]) + + +class CMakeBuilder(spack.build_systems.cmake.CMakeBuilder): + def cmake_args(self): + args = [ + self.define_from_variant("BUILD_SHARED_LIBS", "shared"), + self.define_from_variant("LAPACKE", "lapacke"), + self.define_from_variant("LAPACKE_WITH_TMG", "lapacke"), + self.define("CBLAS", self.spec.satisfies("@3.6.0:")), + ] + + if self.spec.satisfies("%intel"): + # Intel compiler finds serious syntax issues when trying to + # build CBLAS and LapackE + args.extend([self.define("CBLAS", False), self.define("LAPACKE", False)]) + + if self.spec.satisfies("%xl") or self.spec.satisfies("%xl_r"): + # use F77 compiler if IBM XL + args.extend( + [ + self.define("CMAKE_Fortran_COMPILER", self.pkg.compiler.f77), + self.define( + "CMAKE_Fortran_FLAGS", + " ".join(self.spec.compiler_flags["fflags"]) + " -O3 -qnohot", + ), + ] + ) + + # deprecated routines are commonly needed by, for example, suitesparse + # Note that OpenBLAS spack is built with deprecated routines + args.append(self.define("BUILD_DEPRECATED", True)) + + if self.spec.satisfies("+external-blas"): + args.extend( + [ + self.define("USE_OPTIMIZED_BLAS", True), + self.define("BLAS_LIBRARIES:PATH", self.spec["blas"].libs.joined(";")), + ] + ) + + if self.spec.satisfies("+xblas"): + args.extend( + [ + self.define("XBLAS_INCLUDE_DIR", self.spec["netlib-xblas"].prefix.include), + self.define("XBLAS_LIBRARY", self.spec["netlib-xblas"].libs.joined(";")), + ] + ) + + args.append(self.define("BUILD_TESTING", self.pkg.run_tests)) + + return args diff --git a/.gitlab/spack_packages/netlib-lapack/testing.patch b/.gitlab/spack_packages/netlib-lapack/testing.patch new file mode 100644 index 0000000000..fce18548c4 --- /dev/null +++ 
b/.gitlab/spack_packages/netlib-lapack/testing.patch @@ -0,0 +1,13 @@ +diff --git a/TESTING/LIN/alahd.f b/TESTING/LIN/alahd.f +index 8f4cd58d..6a4946e0 100644 +--- a/TESTING/LIN/alahd.f ++++ b/TESTING/LIN/alahd.f +@@ -1036,7 +1036,7 @@ + 9929 FORMAT( ' Test ratios (1-3: ', A1, 'TZRZF):' ) + 9920 FORMAT( 3X, ' 7-10: same as 3-6', 3X, ' 11-14: same as 3-6' ) + 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-6: ', A1, +- $ 'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ' ++ $ 'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ', + $ A1, 'GETSLS)') + 9928 FORMAT( 7X, 'where ALPHA = ( 1 + SQRT( 17 ) ) / 8' ) + 9927 FORMAT( 3X, I2, ': ABS( Largest element in L )', / 12X, diff --git a/.gitlab/spack_packages/netlib-lapack/undefined_declarations.patch b/.gitlab/spack_packages/netlib-lapack/undefined_declarations.patch new file mode 100644 index 0000000000..9dac2562f7 --- /dev/null +++ b/.gitlab/spack_packages/netlib-lapack/undefined_declarations.patch @@ -0,0 +1,26 @@ +diff --git a/SRC/dsytrf_aa_2stage.f b/SRC/dsytrf_aa_2stage.f +index 2991305..f5f06cc 100644 +--- a/SRC/dsytrf_aa_2stage.f ++++ b/SRC/dsytrf_aa_2stage.f +@@ -191,7 +191,7 @@ + EXTERNAL LSAME, ILAENV + * .. + * .. External Subroutines .. +- EXTERNAL XERBLA, DCOPY, DLACGV, DLACPY, ++ EXTERNAL XERBLA, DCOPY, DLACPY, + $ DLASET, DGBTRF, DGEMM, DGETRF, + $ DSYGST, DSWAP, DTRSM + * .. +diff --git a/SRC/ssytrf_aa_2stage.f b/SRC/ssytrf_aa_2stage.f +index be6809d..a929749 100644 +--- a/SRC/ssytrf_aa_2stage.f ++++ b/SRC/ssytrf_aa_2stage.f +@@ -191,7 +191,7 @@ + EXTERNAL LSAME, ILAENV + * .. + * .. External Subroutines .. +- EXTERNAL XERBLA, SCOPY, SLACGV, SLACPY, ++ EXTERNAL XERBLA, SCOPY, SLACPY, + $ SLASET, SGBTRF, SGEMM, SGETRF, + $ SSYGST, SSWAP, STRSM + * .. 
diff --git a/.gitlab/spack_packages/raja/package.py b/.gitlab/spack_packages/raja/package.py deleted file mode 100644 index db49267ed1..0000000000 --- a/.gitlab/spack_packages/raja/package.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) - -import socket - -from spack import * - - -class Raja(CachedCMakePackage, CudaPackage, ROCmPackage): - """RAJA Parallel Framework.""" - - homepage = "https://software.llnl.gov/RAJA/" - git = "https://github.com/LLNL/RAJA.git" - tags = ['radiuss', 'e4s'] - - maintainers = ['davidbeckingsale'] - - version('develop', branch='develop', submodules=False) - version('main', branch='main', submodules=False) - version('2022.03.1', tag='v2022.03.0', submodules=False) - version('2022.03.0', tag='v2022.03.0', submodules=False) - version('0.14.0', tag='v0.14.0', submodules='True') - version('0.13.0', tag='v0.13.0', submodules='True') - version('0.12.1', tag='v0.12.1', submodules="True") - version('0.12.0', tag='v0.12.0', submodules="True") - version('0.11.0', tag='v0.11.0', submodules="True") - version('0.10.1', tag='v0.10.1', submodules="True") - version('0.10.0', tag='v0.10.0', submodules="True") - version('0.9.0', tag='v0.9.0', submodules="True") - version('0.8.0', tag='v0.8.0', submodules="True") - version('0.7.0', tag='v0.7.0', submodules="True") - version('0.6.0', tag='v0.6.0', submodules="True") - version('0.5.3', tag='v0.5.3', submodules="True") - version('0.5.2', tag='v0.5.2', submodules="True") - version('0.5.1', tag='v0.5.1', submodules="True") - version('0.5.0', tag='v0.5.0', submodules="True") - version('0.4.1', tag='v0.4.1', submodules="True") - version('0.4.0', tag='v0.4.0', submodules="True") - - # export targets when building pre-2.4.0 release with BLT 0.4.0+ - 
patch('https://github.com/LLNL/RAJA/commit/eca1124ee4af380d6613adc6012c307d1fd4176b.patch?full_index=1', - sha256='12bb78c00b6683ad3e7fd4e3f87f9776bae074b722431b79696bc862816735ef', - when='@:0.13.0 ^blt@0.4:') - - variant('openmp', default=True, description='Build OpenMP backend') - variant('shared', default=True, description='Build Shared Libs') - variant('examples', default=True, description='Build examples.') - variant('exercises', default=True, description='Build exercises.') - # TODO: figure out gtest dependency and then set this default True - # and remove the +tests conflict below. - variant('tests', default=False, description='Build tests') - - depends_on('blt') - depends_on('blt@0.5.0:', type='build', when='@0.14.1:') - depends_on('blt@0.4.1', type='build', when='@0.14.0') - depends_on('blt@0.4.0:', type='build', when='@0.13.0') - depends_on('blt@0.3.6:', type='build', when='@:0.12.0') - - depends_on('camp@0.2.2', when='@0.14.0') - depends_on('camp@0.1.0', when='@0.10.0:0.13.0') - depends_on('camp@2022.03.0:', when='@2022.03.0:') - - depends_on('cmake@:3.20', when='+rocm', type='build') - depends_on('cmake@3.14:', when='@2022.03.0:') - - with when('+rocm @0.12.0:'): - depends_on('camp+rocm') - for arch in ROCmPackage.amdgpu_targets: - depends_on('camp+rocm amdgpu_target={0}'.format(arch), - when='amdgpu_target={0}'.format(arch)) - conflicts('+openmp') - - with when('+cuda @0.12.0:'): - depends_on('camp+cuda') - for sm_ in CudaPackage.cuda_arch_values: - depends_on('camp +cuda cuda_arch={0}'.format(sm_), - when='cuda_arch={0}'.format(sm_)) - - def _get_sys_type(self, spec): - sys_type = spec.architecture - if "SYS_TYPE" in env: - sys_type = env["SYS_TYPE"] - return sys_type - - @property - def cache_name(self): - hostname = socket.gethostname() - if "SYS_TYPE" in env: - hostname = hostname.rstrip('1234567890') - return "{0}-{1}-{2}@{3}.cmake".format( - hostname, - self._get_sys_type(self.spec), - self.spec.compiler.name, - self.spec.compiler.version - ) - 
- def initconfig_hardware_entries(self): - spec = self.spec - entries = super(Raja, self).initconfig_hardware_entries() - - entries.append(cmake_cache_option("ENABLE_OPENMP", '+openmp' in spec)) - - if '+cuda' in spec: - entries.append(cmake_cache_option("ENABLE_CUDA", True)) - - if not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - entries.append(cmake_cache_string( - "CUDA_ARCH", 'sm_{0}'.format(cuda_arch[0]))) - entries.append(cmake_cache_string( - "CMAKE_CUDA_ARCHITECTURES", '{0}'.format(cuda_arch[0]))) - else: - entries.append(cmake_cache_option("ENABLE_CUDA", False)) - - if '+rocm' in spec: - entries.append(cmake_cache_option("ENABLE_HIP", True)) - entries.append(cmake_cache_path( - "HIP_ROOT_DIR", '{0}'.format(spec['hip'].prefix))) - archs = self.spec.variants['amdgpu_target'].value - if archs != 'none': - arch_str = ",".join(archs) - entries.append(cmake_cache_string( - "HIP_HIPCC_FLAGS", '--amdgpu-target={0}'.format(arch_str))) - else: - entries.append(cmake_cache_option("ENABLE_HIP", False)) - - return entries - - def initconfig_package_entries(self): - spec = self.spec - entries = [] - - option_prefix = "RAJA_" if spec.satisfies("@2022.03.0:") else "" - - entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec['blt'].prefix)) - if 'camp' in self.spec: - entries.append(cmake_cache_path("camp_DIR", spec['camp'].prefix)) - entries.append(cmake_cache_option("BUILD_SHARED_LIBS", '+shared' in spec)) - entries.append(cmake_cache_option( - "{}ENABLE_EXAMPLES".format(option_prefix), '+examples' in spec)) - if spec.satisfies('@0.14.0:'): - entries.append(cmake_cache_option( - "{}ENABLE_EXERCISES".format(option_prefix), '+exercises' in spec)) - else: - entries.append(cmake_cache_option("ENABLE_EXERCISES", - '+exercises' in spec)) - - # Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS which - # is used by the spack compiler wrapper. 
This can go away when BLT - # removes -Werror from GTest flags - if self.spec.satisfies('%clang target=ppc64le:') or not self.run_tests: - entries.append(cmake_cache_option("ENABLE_TESTS", False)) - else: - entries.append(cmake_cache_option("ENABLE_TESTS", True)) - - return entries - - def cmake_args(self): - options = [] - return options - - @property - def build_relpath(self): - """Relative path to the cmake build subdirectory.""" - return join_path('..', self.build_dirname) - - @run_after('install') - def setup_build_tests(self): - """Copy the build test files after the package is installed to a - relative install test subdirectory for use during `spack test run`.""" - # Now copy the relative files - self.cache_extra_test_sources(self.build_relpath) - - # Ensure the path exists since relying on a relative path at the - # same level as the normal stage source path. - mkdirp(self.install_test_root) - - @property - def _extra_tests_path(self): - # TODO: The tests should be converted to re-build and run examples - # TODO: using the installed libraries. 
- return join_path(self.install_test_root, self.build_relpath, 'bin') - - def _test_examples(self): - """Perform very basic checks on a subset of copied examples.""" - checks = [ - ('ex5_line-of-sight_solution', - [r'RAJA sequential', r'RAJA OpenMP', r'result -- PASS']), - ('ex6_stencil-offset-layout_solution', - [r'RAJA Views \(permuted\)', r'result -- PASS']), - ('ex8_tiled-matrix-transpose_solution', - [r'parallel top inner loop', - r'collapsed inner loops', r'result -- PASS']), - ('kernel-dynamic-tile', [r'Running index', r'(24,24)']), - ('plugin-example', - [r'Launching host kernel for the 10 time']), - ('tut_batched-matrix-multiply', [r'result -- PASS']), - ('wave-eqn', [r'Max Error = 2', r'Evolved solution to time']) - ] - for exe, expected in checks: - reason = 'test: checking output of {0} for {1}' \ - .format(exe, expected) - self.run_test(exe, [], expected, installed=False, - purpose=reason, skip_missing=True, - work_dir=self._extra_tests_path) - - def test(self): - """Perform smoke tests.""" - self._test_examples() \ No newline at end of file diff --git a/.gitlab/spack_packages/sundials/package.py b/.gitlab/spack_packages/sundials/package.py index 4de41139de..ad7ee0cf3c 100644 --- a/.gitlab/spack_packages/sundials/package.py +++ b/.gitlab/spack_packages/sundials/package.py @@ -27,7 +27,10 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage): # Versions # ========================================================================== version("develop", branch="develop") - version("6.4.0", branch="develop") + version("6.5.1", sha256="4252303805171e4dbdd19a01e52c1dcfe0dafc599c3cfedb0a5c2ffb045a8a75") + version("6.5.0", sha256="4e0b998dff292a2617e179609b539b511eb80836f5faacf800e688a886288502") + version("6.4.1", sha256="7bf10a8d2920591af3fba2db92548e91ad60eb7241ab23350a9b1bc51e05e8d0") + version("6.4.0", sha256="0aff803a12c6d298d05b56839197dd09858631864017e255ed89e28b49b652f1") version("6.3.0", 
sha256="89a22bea820ff250aa7239f634ab07fa34efe1d2dcfde29cc8d3af11455ba2a7") version("6.2.0", sha256="195d5593772fc483f63f08794d79e4bab30c2ec58e6ce4b0fb6bcc0e0c48f31d") version("6.1.1", sha256="cfaf637b792c330396a25ef787eb59d58726c35918ebbc08e33466e45d50470c") @@ -123,9 +126,17 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage): when="@6.0.0: +profiling", description="Enable Caliper instrumentation/profiling", ) + variant("ginkgo", default=False, when="@6.4.0:", description="Enable Ginkgo interfaces") variant("hypre", default=False, when="@2.7.0:", description="Enable Hypre MPI parallel vector") - variant("lapack", default=False, description="Enable LAPACK direct solvers") + variant("kokkos", default=False, when="@6.4.0:", description="Enable Kokkos vector") + variant( + "kokkos-kernels", + default=False, + when="@6.4.0:", + description="Enable KokkosKernels based matrix and linear solver", + ) variant("klu", default=False, description="Enable KLU sparse, direct solver") + variant("lapack", default=False, description="Enable LAPACK direct solvers") variant("petsc", default=False, when="@2.7.0:", description="Enable PETSc interfaces") variant("magma", default=False, when="@5.7.0:", description="Enable MAGMA interface") variant("superlu-mt", default=False, description="Enable SuperLU_MT sparse, direct solver") @@ -169,6 +180,9 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage): "profiling", default=False, when="@6.0.0:", description="Build with profiling capabilities" ) + # Scheduler + variant("scheduler", default="slurm", description="Specify which scheduler the system runs on.", values=("flux", "lsf", "slurm")) + # ========================================================================== # Dependencies # ========================================================================== @@ -190,6 +204,23 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage): # External libraries depends_on("caliper", when="+caliper") + 
depends_on("ginkgo@1.5.0:", when="+ginkgo") + depends_on("kokkos", when="+kokkos") + depends_on("kokkos-kernels", when="+kokkos-kernels") + for cuda_arch in CudaPackage.cuda_arch_values: + depends_on( + "kokkos+cuda+cuda_lambda+cuda_constexpr cuda_arch=%s" % cuda_arch, + when="+kokkos +cuda cuda_arch=%s" % cuda_arch, + ) + depends_on( + "kokkos-kernels+cuda cuda_arch=%s" % cuda_arch, + when="+kokkos-kernels +cuda cuda_arch=%s" % cuda_arch, + ) + for rocm_arch in ROCmPackage.amdgpu_targets: + depends_on( + "kokkos+rocm amdgpu_target=%s" % rocm_arch, + when="+kokkos +rocm amdgpu_target=%s" % rocm_arch, + ) depends_on("lapack", when="+lapack") depends_on("hypre+mpi@2.22.1:", when="@5.7.1: +hypre") depends_on("hypre+mpi@:2.22.0", when="@:5.7.0 +hypre") @@ -205,13 +236,13 @@ class Sundials(CachedCMakePackage, CudaPackage, ROCmPackage): # Require that external libraries built with the same precision depends_on("petsc~double~complex", when="+petsc precision=single") depends_on("petsc+double~complex", when="+petsc precision=double") - + # Require that external libraries built with the same index type with when('+int64'): depends_on("hypre+mpi+int64", when="+hypre +int64") depends_on("petsc+int64", when="+petsc +int64") depends_on("superlu-dist+int64", when="+superlu-dist +int64") - + with when('~int64'): depends_on("hypre+mpi~int64", when="+hypre ~int64") depends_on("petsc~int64", when="+petsc ~int64") @@ -640,6 +671,13 @@ def initconfig_mpi_entries(self): cmake_cache_path("MPI_MPIF90", spec["mpi"].mpifc) ] ) + if "scheduler=flux" in spec: + entries.append(cmake_cache_string("SUNDIALS_TEST_MPIRUN_COMMAND", "flux run")) + if "scheduler=slurm" in spec: + entries.append(cmake_cache_string("SUNDIALS_TEST_MPIRUN_COMMAND", "srun")) + if "scheduler=lsf" in spec: + entries.append(cmake_cache_string("SUNDIALS_TEST_MPIRUN_COMMAND", "jsrun")) + return entries @@ -666,10 +704,9 @@ def initconfig_hardware_entries(self): cmake_cache_path("HIP_PATH", spec["hip"].prefix), 
cmake_cache_path("HIP_CLANG_INCLUDE_PATH", spec["llvm-amdgpu"].prefix.include), cmake_cache_path("ROCM_PATH", spec["llvm-amdgpu"].prefix), - cmake_cache_string("AMDGPU_TARGETS", spec.variants["amdgpu_target"].value) + cmake_cache_string("AMDGPU_TARGETS", ";".join(spec.variants["amdgpu_target"].value)) ] ) - return entries def initconfig_package_entries(self): @@ -735,6 +772,25 @@ def initconfig_package_entries(self): if "+caliper" in spec: entries.append(cmake_cache_path("CALIPER_DIR", spec["caliper"].prefix)) + # Building with Ginkgo + if "+ginkgo" in spec: + gko_backends = ["REF"] + if "+openmp" in spec["ginkgo"] and "+openmp" in spec: + gko_backends.append("OMP") + if "+cuda" in spec["ginkgo"] and "+cuda" in spec: + gko_backends.append("CUDA") + if "+rocm" in spec["ginkgo"] and "+rocm" in spec: + gko_backends.append("HIP") + if "+oneapi" in spec["ginkgo"] and "+sycl" in spec: + gko_backends.append("DPCPP") + entries.extend( + [ + self.cache_option_from_variant("ENABLE_GINKGO", "ginkgo"), + cmake_cache_path("Ginkgo_DIR", spec["ginkgo"].prefix), + cmake_cache_string("SUNDIALS_GINKGO_BACKENDS", ";".join(gko_backends)), + ] + ) + # Building with Hypre if "+hypre" in spec: entries.extend( @@ -747,6 +803,12 @@ def initconfig_package_entries(self): hypre_libs = spec["blas"].libs + spec["lapack"].libs entries.extend([cmake_cache_string("HYPRE_LIBRARIES", hypre_libs.joined(";"))]) + # Building with Kokkos and KokkosKernels (install prefixes are PATH cache entries; only the ENABLE_* options come from variants) + if "+kokkos" in spec: + entries.extend([self.cache_option_from_variant("ENABLE_KOKKOS", "kokkos"), cmake_cache_path("Kokkos_DIR", spec["kokkos"].prefix)]) + if "+kokkos-kernels" in spec: + entries.extend([self.cache_option_from_variant("ENABLE_KOKKOS_KERNELS", "kokkos-kernels"), cmake_cache_path("KokkosKernels_DIR", spec["kokkos-kernels"].prefix)]) + # Building with KLU if "+klu" in spec: entries.extend( @@ -788,23 +850,30 @@ def initconfig_package_entries(self): entries.append(cmake_cache_path("RAJA_DIR", spec["raja"].prefix)) if "camp" in spec: entries.append(cmake_cache_path("camp_DIR", spec["camp"].prefix.lib.cmake + '/camp')) + if "+rocm" in spec: +
entries.append(cmake_cache_string("SUNDIALS_RAJA_BACKENDS", "HIP")) # Building with SuperLU_DIST if "+superlu-dist" in spec: - if spec.satisfies("@6.4.0:"): + #if spec.satisfies("@6.4.0:"): + if False: entries.extend( [ cmake_cache_path("SUPERLUDIST_DIR", spec["superlu-dist"].prefix), - cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec), + cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec), ] ) else: + superludist_libs = [] + superludist_libs.extend(spec["parmetis"].libs) + superludist_libs.extend(spec["metis"].libs) + superludist_libs.extend(spec["superlu-dist"].libs) entries.extend( [ cmake_cache_path("SUPERLUDIST_INCLUDE_DIR", spec["superlu-dist"].prefix.include), cmake_cache_path("SUPERLUDIST_LIBRARY_DIR", spec["superlu-dist"].prefix.lib), - cmake_cache_string("SUPERLUDIST_LIBRARIES", spec["superlu-dist"].libs), - cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec), + cmake_cache_string("SUPERLUDIST_LIBRARIES", ";".join(superludist_libs)), + cmake_cache_string("SUPERLUDIST_OpenMP", "^superlu-dist+openmp" in spec), ] ) diff --git a/.gitlab/uberenv b/.gitlab/uberenv index 4941c237ee..0d00dc8e19 160000 --- a/.gitlab/uberenv +++ b/.gitlab/uberenv @@ -1 +1 @@ -Subproject commit 4941c237eec514d6d68872243efb9f4af8843f4d +Subproject commit 0d00dc8e19a889ba07ae433590b87533c4b5b3da diff --git a/.uberenv_config.json b/.uberenv_config.json index 7c65ba8c6f..67618cca2f 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,8 +4,8 @@ "package_final_phase": "initconfig", "package_source_dir": "../..", "spack_url": "https://github.com/spack/spack", - "spack_commit": "13e6f87ef6527954b152eaea303841978e83b992", + "spack_commit": "5e0d2107348eed6cbe6deca43a30f5b06c5e40af", "spack_activate": {}, "spack_configs_path": ".gitlab/radiuss-spack-configs", "spack_packages_path": ".gitlab/spack_packages" -} \ No newline at end of file +} diff --git a/CHANGELOG.md b/CHANGELOG.md index 44eb259cbf..28a06f737a 100644 
--- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # SUNDIALS Changelog +## Changes to SUNDIALS in release 6.6.0 + +Added the second order IMEX method from Giraldo, Kelly, and Constantinescu 2013 +as the default second order IMEX method in ARKStep. The explicit table is given +by `ARKODE_ARK2_ERK_3_1_2` and the implicit table by `ARKODE_ARK2_DIRK_3_1_2`. + +Updated the F2003 utility routines `SUNDIALSFileOpen` and `SUNDIALSFileClose` +to support user specification of `stdout` and `stderr` strings for the output +file names. + ## Changes to SUNDIALS in release 6.5.1 Added the functions `ARKStepClearStopTime`, `ERKStepClearStopTime`, diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 793ffbb0a4..e9fd4648c1 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -26,12 +26,9 @@ sundials_option(BENCHMARK_NVECTOR BOOL "NVector benchmarks are on" ON) # Add specific benchmarks #---------------------------------------- -if(ENABLE_MPI AND ENABLE_RAJA) - add_subdirectory(advection_reaction_3D) -endif() - if(ENABLE_MPI) - add_subdirectory(diffusion_2D) +add_subdirectory(diffusion_2D) +add_subdirectory(advection_reaction_3D) endif() # Add the nvector benchmarks diff --git a/benchmarks/advection_reaction_3D/CMakeLists.txt b/benchmarks/advection_reaction_3D/CMakeLists.txt index e51a95155a..7469a6a10a 100644 --- a/benchmarks/advection_reaction_3D/CMakeLists.txt +++ b/benchmarks/advection_reaction_3D/CMakeLists.txt @@ -1,5 +1,5 @@ # --------------------------------------------------------------- -# Programmer(s): Cody J. Balos @ LLNL +# Programmer(s): Daniel R. 
Reynolds @ SMU # --------------------------------------------------------------- # SUNDIALS Copyright Start # Copyright (c) 2002-2023, Lawrence Livermore National Security @@ -12,135 +12,10 @@ # SUNDIALS Copyright End # --------------------------------------------------------------- -if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA) - - if((RAJA_BACKENDS MATCHES "TARGET_OPENMP") OR (RAJA_BACKENDS MATCHES "OPENMP")) - set(OTHER_LIBS OpenMP::OpenMP_CXX) - endif() - - # ---------------------------------------------------------------------------- - # MPI only - # ---------------------------------------------------------------------------- - - add_executable(advection_reaction_3D - advection_reaction_3D.cpp - arkode_driver.cpp - cvode_driver.cpp - ida_driver.cpp - rhs3D.hpp - ParallelGrid.hpp - backends.hpp) - - # ensure the linker language is reset to CXX - set_target_properties(advection_reaction_3D PROPERTIES LINKER_LANGUAGE CXX) - - target_include_directories(advection_reaction_3D - PRIVATE - ${PROJECT_SOURCE_DIR}/utilities - ${MPI_CXX_INCLUDE_DIRS}) - - target_link_libraries(advection_reaction_3D - PRIVATE - sundials_arkode - sundials_cvode - sundials_ida - sundials_nvecmpiplusx - sundials_nvecserial - RAJA - ${MPI_CXX_LIBRARIES} - ${OTHER_LIBS}) - - install(TARGETS advection_reaction_3D - DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D") - - install(FILES README.md - DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D") - - # ---------------------------------------------------------------------------- - # MPI + CUDA - # ---------------------------------------------------------------------------- - - if(BUILD_NVECTOR_CUDA) - - set_source_files_properties(advection_reaction_3D.cpp - PROPERTIES LANGUAGE CUDA) - set_source_files_properties(arkode_driver.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(cvode_driver.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(ida_driver.cpp PROPERTIES LANGUAGE CUDA) - - 
add_executable(advection_reaction_3D_mpicuda - advection_reaction_3D.cpp - arkode_driver.cpp - cvode_driver.cpp - ida_driver.cpp - rhs3D.hpp - ParallelGrid.hpp - backends.hpp) - - # ensure the linker language is reset to CXX - set_target_properties(advection_reaction_3D_mpicuda - PROPERTIES LINKER_LANGUAGE CXX) - - target_include_directories(advection_reaction_3D_mpicuda - PRIVATE - ${PROJECT_SOURCE_DIR}/utilities - ${MPI_CXX_INCLUDE_DIRS}) - - target_link_libraries(advection_reaction_3D_mpicuda - PRIVATE - sundials_arkode - sundials_cvode - sundials_ida - sundials_nvecmpiplusx - sundials_nveccuda - RAJA - ${MPI_CXX_LIBRARIES} - ${OTHER_LIBS}) - - target_compile_definitions(advection_reaction_3D_mpicuda PRIVATE USE_CUDA_NVEC) - - install(TARGETS advection_reaction_3D_mpicuda - DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D") - - endif() - - # ---------------------------------------------------------------------------- - # MPI + HIP - # ---------------------------------------------------------------------------- - - if(BUILD_NVECTOR_HIP) - - add_executable(advection_reaction_3D_mpihip - advection_reaction_3D.cpp - arkode_driver.cpp - cvode_driver.cpp - ida_driver.cpp - rhs3D.hpp - ParallelGrid.hpp - backends.hpp) - - target_include_directories(advection_reaction_3D_mpihip - PRIVATE - ${PROJECT_SOURCE_DIR}/utilities - ${MPI_CXX_INCLUDE_DIRS}) - - target_link_libraries(advection_reaction_3D_mpihip - PRIVATE - sundials_arkode - sundials_cvode - sundials_ida - sundials_nvecmpiplusx - sundials_nvechip - RAJA - hip::device - ${MPI_CXX_LIBRARIES} - ${OTHER_LIBS}) - - target_compile_definitions(advection_reaction_3D_mpihip PRIVATE USE_HIP_NVEC) - - install(TARGETS advection_reaction_3D_mpihip - DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D") - - endif() +if(ENABLE_RAJA) + add_subdirectory(raja) +endif() +if(ENABLE_KOKKOS AND BUILD_NVECTOR_KOKKOS) + add_subdirectory(kokkos) endif() diff --git 
a/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt b/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt new file mode 100644 index 0000000000..2d58e5fe4c --- /dev/null +++ b/benchmarks/advection_reaction_3D/kokkos/CMakeLists.txt @@ -0,0 +1,61 @@ +# --------------------------------------------------------------- +# Programmer(s): Daniel R. Reynolds @ SMU +# --------------------------------------------------------------- +# SUNDIALS Copyright Start +# Copyright (c) 2002-2023, Lawrence Livermore National Security +# and Southern Methodist University. +# All rights reserved. +# +# See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-3-Clause +# SUNDIALS Copyright End +# --------------------------------------------------------------- + +# Add the build targets for each backend +if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA) + foreach(backend ${KOKKOS_EXAMPLES_BACKENDS}) + + # set benchmark target name + set(benchmark_target "advection_reaction_3D_kokkos.${backend}") + + # benchmark source files + add_executable(${benchmark_target} + advection_reaction_3D.cpp + arkode_driver.cpp + cvode_driver.cpp + ida_driver.cpp + rhs3D.hpp + ParallelGrid.hpp + check_retval.h) + + # which backend to use + target_compile_definitions(${benchmark_target} PRIVATE USE_${backend}) + + # directories to include + target_include_directories(${benchmark_target} + PRIVATE + ${PROJECT_SOURCE_DIR}/utilities + ${MPI_CXX_INCLUDE_DIRS} + ) + + # libraries to link against + target_link_libraries(${benchmark_target} + PRIVATE + sundials_arkode + sundials_cvode + sundials_ida + sundials_nvecmpiplusx + sundials_nveckokkos + ${MPI_CXX_LIBRARIES} + ${EXE_EXTRA_LINK_LIBS} + ) + + install(TARGETS ${benchmark_target} + DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/kokkos") + + install(FILES README.md ../scripts/compare_error.py ../scripts/compute_error.py ../scripts/pickle_solution_output.py + DESTINATION 
"${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/kokkos") + + endforeach() +endif() diff --git a/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp b/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp new file mode 100644 index 0000000000..c324105b02 --- /dev/null +++ b/benchmarks/advection_reaction_3D/kokkos/ParallelGrid.hpp @@ -0,0 +1,593 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): Daniel R. Reynolds @ SMU + * Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. + * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * ----------------------------------------------------------------------------- + * A simple implementation of a parallel structured Cartesian mesh class that + * supports up to 3 spatial dimensions and an arbitrary number of degrees of + * freedom, and that uses Kokkos views to store communication buffer data. 
+ * ----------------------------------------------------------------------------*/ + +#ifndef _KOKKOSPARGRID_H +#define _KOKKOSPARGRID_H + +#include +#include +#include +#include +#include +#include + + +/* Set Kokkos execution space and type shortcuts */ +#if defined(USE_CUDA) +using ExecSpace = Kokkos::Cuda; +using MemSpace = Kokkos::CudaSpace; +#elif defined(USE_HIP) +#if KOKKOS_VERSION / 10000 > 3 +using ExecSpace = Kokkos::HIP; +using MemSpace = Kokkos::HIPSpace; +#else +using ExecSpace = Kokkos::Experimental::HIP; +using MemSpace = Kokkos::Experimental::HIPSpace; +#endif +#elif defined(USE_OPENMP) +using ExecSpace = Kokkos::OpenMP; +using MemSpace = Kokkos::HostSpace; +#else +using ExecSpace = Kokkos::Serial; +using MemSpace = Kokkos::HostSpace; +#endif +using Vec1D = Kokkos::View; +using Vec4D = Kokkos::View; +using Vec1DHost = Vec1D::HostMirror; +using Vec4DHost = Vec4D::HostMirror; +using Range3D = Kokkos::MDRangePolicy>; + + +namespace sundials_tools +{ + +// Types of boundaries supported. +enum class BoundaryType +{ + PERIODIC +}; + +// Types of stencils supported. +enum class StencilType +{ + UPWIND +}; + +template +class ParallelGrid +{ +public: + // Constructor that creates a new ParallelGrid object. 
+ // [in] c - scalar whose sign selects the upwind direction (c > 0 means upwindRight is true); documented here because the signature places it after st + // [in,out] comm - on input, the overall MPI communicator, on output, the cartesian communicator + // [in] a[] - an array of length 3 which defines the domain [a,b] + // [in] b[] - an array of length 3 which defines the domain [a,b] + // [in] npts[] - an array of length 3 which defines the number of mesh points in each dimension + // [in] dof - the number of degrees of freedom per mesh point + // [in] bc - the type of boundary conditions (see BoundaryType) + // [in] st - the stencil to use (see StencilType) + // [in] npxyz - the number of processors in each dimension; defaults to nullptr, which leaves dims at {0,0,0} so MPI_Dims_create chooses + // [in] reorder - should MPI_Cart_create do process reordering to optimize or not; defaults to false (some MPI implementations ignore this) + ParallelGrid(MPI_Comm* comm, const realtype a[], const realtype b[], const GLOBALINT npts[], + int dof, BoundaryType bc, StencilType st, const realtype c, + const int npxyz[] = nullptr, bool reorder = false) + : nx(1), ny(1), nz(1), + nxl(1), nyl(1), nzl(1), + npx(1), npy(1), npz(1), + dx(0.0), dy(0.0), dz(0.0), + ax(0.0), ay(0.0), az(0.0), + bx(0.0), by(0.0), bz(0.0), + dof(dof), dims{0,0,0}, coords{0,0,0}, + bc(bc), st(st), upwindRight(true) + { + assert(st == StencilType::UPWIND); + + /* Set up MPI Cartesian communicator */ + if (npxyz) + { + dims[0] = npxyz[0]; + dims[1] = npxyz[1]; + dims[2] = npxyz[2]; + } + + int retval, nprocs; + MPI_Comm_size(*comm, &nprocs); + retval = MPI_Dims_create(nprocs, 3, dims); + assert(retval == MPI_SUCCESS); + + int periods[] = { bc == BoundaryType::PERIODIC, + bc == BoundaryType::PERIODIC, + bc == BoundaryType::PERIODIC }; + retval = MPI_Cart_create(*comm, 3, dims, periods, reorder, comm); + assert(retval == MPI_SUCCESS); + + retval = MPI_Cart_get(*comm, 3, dims, periods, coords); + assert(retval == MPI_SUCCESS); + + cart_comm = *comm; + + /* Set upwinding direction */ + upwindRight = (c > 0.0); + + /* Set up
information for the first spatial dimension */ + npx = dims[0]; + nx = npts[0]; + ax = a[0]; + bx = b[0]; + dx = (bx-ax) / (realtype) nx; + int is = nx*(coords[0])/npx; + int ie = nx*(coords[0]+1)/npx-1; + nxl = ie-is+1; + neq = dof * nxl; + + /* Set up information for the second spatial dimension */ + npy = dims[1]; + ny = npts[1]; + ay = a[1]; + by = b[1]; + dy = (by-ay) / (realtype) ny; + int js = ny*(coords[1])/npy; + int je = ny*(coords[1]+1)/npy-1; + nyl = je-js+1; + neq *= nyl; + + /* Set up information for the third spatial dimension */ + npz = dims[2]; + nz = npts[2]; + az = a[2]; + bz = b[2]; + dz = (bz-az) / (realtype) nz; + int ks = nz*(coords[2])/npz; + int ke = nz*(coords[2]+1)/npz-1; + nzl = ke-ks+1; + neq *= nzl; + + /* Allocate buffers for nearest-neighbor exchange */ + if (st == StencilType::UPWIND) + AllocateBuffersUpwind(); + + } + + // TODO: + // - support non-periodic boundary conditions + // For all faces where neighbors exist: determine neighbor process indices. + // For all faces: allocate upwind exchange buffers. 
+ void AllocateBuffersUpwind() + { + + /* Allocate send/receive buffers and determine ID for communication West */ + if (upwindRight) { + Wrecv_ = Vec1D("Wrecv", dof*nyl*nzl); + WrecvH_ = Kokkos::create_mirror_view(Wrecv_); + } else { + Wsend_ = Vec1D("Wsend", dof*nyl*nzl); + WsendH_ = Kokkos::create_mirror_view(Wsend_); + } + ipW = MPI_PROC_NULL; + if ((coords[0] > 0) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0]-1, coords[1], coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW); + assert(retval == MPI_SUCCESS); + } + + /* Allocate send/receive buffers and determine ID for communication East */ + if (upwindRight) { + Esend_ = Vec1D("Esend", dof*nyl*nzl); + EsendH_ = Kokkos::create_mirror_view(Esend_); + } else { + Erecv_ = Vec1D("Erecv", dof*nyl*nzl); + ErecvH_ = Kokkos::create_mirror_view(Erecv_); + } + ipE = MPI_PROC_NULL; + if ((coords[0] < dims[0]-1) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0]+1, coords[1], coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE); + assert(retval == MPI_SUCCESS); + } + + /* Allocate send/receive buffers and determine ID for communication South */ + if (upwindRight) { + Srecv_ = Vec1D("Srecv", dof*nxl*nzl); + SrecvH_ = Kokkos::create_mirror_view(Srecv_); + } else { + Ssend_ = Vec1D("Ssend", dof*nxl*nzl); + SsendH_ = Kokkos::create_mirror_view(Ssend_); + } + ipS = MPI_PROC_NULL; + if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1]-1, coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS); + assert(retval == MPI_SUCCESS); + } + + /* Allocate send/receive buffers and determine ID for communication North */ + if (upwindRight) { + Nsend_ = Vec1D("Nsend", dof*nxl*nzl); + NsendH_ = Kokkos::create_mirror_view(Nsend_); + } else { + Nrecv_ = Vec1D("Nrecv", dof*nxl*nzl); + NrecvH_ = Kokkos::create_mirror_view(Nrecv_); + } + ipN = MPI_PROC_NULL; + if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) { 
+ int nbcoords[] = {coords[0], coords[1]+1, coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN); + assert(retval == MPI_SUCCESS); + } + + /* Allocate send/receive buffers and determine ID for communication Back */ + if (upwindRight) { + Brecv_ = Vec1D("Brecv", dof*nxl*nyl); + BrecvH_ = Kokkos::create_mirror_view(Brecv_); + } else { + Bsend_ = Vec1D("Bsend", dof*nxl*nyl); + BsendH_ = Kokkos::create_mirror_view(Bsend_); + } + ipB = MPI_PROC_NULL; + if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1], coords[2]-1}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB); + assert(retval == MPI_SUCCESS); + } + + /* Allocate send/receive buffers and determine ID for communication Front */ + if (upwindRight) { + Fsend_ = Vec1D("Fsend", dof*nxl*nyl); + FsendH_ = Kokkos::create_mirror_view(Fsend_); + } else { + Frecv_ = Vec1D("Frecv", dof*nxl*nyl); + FrecvH_ = Kokkos::create_mirror_view(Frecv_); + } + ipF = MPI_PROC_NULL; + if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1], coords[2]+1}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF); + assert(retval == MPI_SUCCESS); + } + + } + + // Initiate non-blocking neighbor communication + int ExchangeStart() + { + int retval = 0; + nreq = 0; + + // Initialize all requests in array + for (int i=0; i<12; i++) + req[i] = MPI_REQUEST_NULL; + + // Open an Irecv buffer on host for each neighbor + if ((ipW != MPI_PROC_NULL) && (upwindRight)) + { + retval = MPI_Irecv(WrecvH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, + 1, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipE != MPI_PROC_NULL) && (!upwindRight)) + { + retval = MPI_Irecv(ErecvH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, + 0, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipS != MPI_PROC_NULL) && (upwindRight)) + { + retval = MPI_Irecv(SrecvH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, + 3, 
cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipN != MPI_PROC_NULL) && (!upwindRight)) + { + retval = MPI_Irecv(NrecvH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, + 2, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipB != MPI_PROC_NULL) && (upwindRight)) + { + retval = MPI_Irecv(BrecvH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, + 5, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipF != MPI_PROC_NULL) && (!upwindRight)) + { + retval = MPI_Irecv(FrecvH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, + 4, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + // Send data to neighbors, first copying from device to host buffers + if ((ipW != MPI_PROC_NULL) && (!upwindRight)) + { + Kokkos::deep_copy(WsendH_, Wsend_); + retval = MPI_Isend(WsendH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipE != MPI_PROC_NULL) && (upwindRight)) + { + Kokkos::deep_copy(EsendH_, Esend_); + retval = MPI_Isend(EsendH_.data(), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipS != MPI_PROC_NULL) && (!upwindRight)) + { + Kokkos::deep_copy(SsendH_, Ssend_); + retval = MPI_Isend(SsendH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipN != MPI_PROC_NULL) && (upwindRight)) + { + Kokkos::deep_copy(NsendH_, Nsend_); + retval = MPI_Isend(NsendH_.data(), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipB != MPI_PROC_NULL) && (!upwindRight)) + { + Kokkos::deep_copy(BsendH_, Bsend_); + retval = MPI_Isend(BsendH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipF != MPI_PROC_NULL) && (upwindRight)) + { + 
Kokkos::deep_copy(FsendH_, Fsend_); + retval = MPI_Isend(FsendH_.data(), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + return retval; + } + + // Waits for neighbor exchange to finish. + int ExchangeEnd() + { + MPI_Status stat[12]; + int retval; + + // return automatically with success if there are no outstanding requests + if (nreq == 0) + return(0); + + // Wait for messages to finish send/receive + retval = MPI_Waitall(nreq, req, stat); + assert(retval == MPI_SUCCESS); + + // Copy data from host to device buffers + if ((ipW != MPI_PROC_NULL) && (upwindRight)) + Kokkos::deep_copy(Wrecv_, WrecvH_); + if ((ipE != MPI_PROC_NULL) && (!upwindRight)) + Kokkos::deep_copy(Erecv_, ErecvH_); + if ((ipS != MPI_PROC_NULL) && (upwindRight)) + Kokkos::deep_copy(Srecv_, SrecvH_); + if ((ipN != MPI_PROC_NULL) && (!upwindRight)) + Kokkos::deep_copy(Nrecv_, NrecvH_); + if ((ipB != MPI_PROC_NULL) && (upwindRight)) + Kokkos::deep_copy(Brecv_, BrecvH_); + if ((ipF != MPI_PROC_NULL) && (!upwindRight)) + Kokkos::deep_copy(Frecv_, FrecvH_); + + return retval; + } + + // Prints out information about the ParallelGrid to stdout. + void PrintInfo() + { + printf("ParallelGrid Info:\n"); + printf(" dimensions = %d\n", 3); + printf(" processors = {%d, %d, %d}\n", npx, npy, npz); + printf(" domain = {[%g,%g], [%g,%g], [%g,%g]}\n", ax, bx, ay, by, az, bz); + printf(" global npts = {%li, %li, %li}\n", (long int) nx, (long int) ny, (long int) nz); + printf(" local npts = {%d, %d, %d}\n", nxl, nyl, nzl); + printf(" mesh spacing = {%g, %g, %g}\n", dx, dy, dz); + if (upwindRight) + printf(" upwind dir = right\n"); + else + printf(" upwind dir = left\n"); + } + + // Saves the mesh to a file. + // First row is x. Second row is y. Third row is z. 
+ // Can be loaded into MATLAB like so: + // mesh = loadtxt('mesh.txt'); + // [X,Y,Z] = meshgrid(mesh(1,:),mesh(2,:),mesh(3,:)); + void MeshToFile(const std::string& fname) + { + std::ofstream mesh_file; + mesh_file.open(fname); + mesh_file << std::setprecision(16); + for (GLOBALINT i = 0; i < nx; i++) + mesh_file << " " << dx*i; + mesh_file << std::endl; + for (GLOBALINT i = 0; i < ny; i++) + mesh_file << " " << dy*i; + mesh_file << std::endl; + for (GLOBALINT i = 0; i < nz; i++) + mesh_file << " " << dz*i; + mesh_file << std::endl; + mesh_file.close(); + } + + int nprocs() const + { + return npx*npy*npz; + } + + GLOBALINT npts() const + { + return nx*ny*nz; + } + + GLOBALINT nptsl() const + { + return nxl*nyl*nzl; + } + + GLOBALINT neql() const + { + return dof*nptsl(); + } + + realtype* GetRecvView(const std::string& direction) + { + if (direction == "WEST") + { + return static_cast(Wrecv_.data()); + } + else if (direction == "EAST") + { + return static_cast(Erecv_.data()); + } + else if (direction == "NORTH") + { + return static_cast(Nrecv_.data()); + } + else if (direction == "SOUTH") + { + return static_cast(Srecv_.data()); + } + else if (direction == "FRONT") + { + return static_cast(Frecv_.data()); + } + else if (direction == "BACK") + { + return static_cast(Brecv_.data()); + } + else + { + assert(direction == "ILLEGAL"); + return nullptr; + } + } + + realtype* GetSendView(const std::string& direction) + { + if (direction == "WEST") + { + return static_cast(Wsend_.data()); + } + else if (direction == "EAST") + { + return static_cast(Esend_.data()); + } + else if (direction == "NORTH") + { + return static_cast(Nsend_.data()); + } + else if (direction == "SOUTH") + { + return static_cast(Ssend_.data()); + } + else if (direction == "FRONT") + { + return static_cast(Fsend_.data()); + } + else if (direction == "BACK") + { + return static_cast(Bsend_.data()); + } + else + { + assert(direction == "ILLEGAL"); + return nullptr; + } + } + + GLOBALINT nx, ny, nz; /* 
number of intervals globally */ + int nxl, nyl, nzl; /* number of intervals locally */ + int npx, npy, npz; /* numner of processes */ + realtype dx, dy, dz; /* mesh spacing */ + realtype ax, ay, az; /* domain in [a, b] */ + realtype bx, by, bz; + int dof; /* degrees of freedom per node */ + int neq; /* total number of equations locally */ + + int ipW, ipE; /* MPI ranks for neighbor procs */ + int ipS, ipN; + int ipB, ipF; + bool upwindRight; /* Upwind dir: true/false == R/L */ + + int dims[3]; + int coords[3]; + + +private: + MPI_Comm cart_comm; /* MPI cartesian communicator */ + MPI_Request req[12]; + int nreq; + + BoundaryType bc; + StencilType st; + + Vec1D Wsend_; /* MPI send/recv buffers */ + Vec1D Esend_; + Vec1D Ssend_; + Vec1D Nsend_; + Vec1D Bsend_; + Vec1D Fsend_; + Vec1D Wrecv_; + Vec1D Erecv_; + Vec1D Srecv_; + Vec1D Nrecv_; + Vec1D Brecv_; + Vec1D Frecv_; + Vec1DHost WsendH_; /* MPI send/recv buffers (host) */ + Vec1DHost EsendH_; + Vec1DHost SsendH_; + Vec1DHost NsendH_; + Vec1DHost BsendH_; + Vec1DHost FsendH_; + Vec1DHost WrecvH_; + Vec1DHost ErecvH_; + Vec1DHost SrecvH_; + Vec1DHost NrecvH_; + Vec1DHost BrecvH_; + Vec1DHost FrecvH_; + +}; + +} + +#endif diff --git a/benchmarks/advection_reaction_3D/kokkos/README.md b/benchmarks/advection_reaction_3D/kokkos/README.md new file mode 100644 index 0000000000..f27484385f --- /dev/null +++ b/benchmarks/advection_reaction_3D/kokkos/README.md @@ -0,0 +1,113 @@ +# Benchmark: 3D Advection-Reaction + +This benchmark problem implements a 3D advection-reaction equation using the +Kokkos performance portability layer with serial, OpenMP, CUDA, or HIP backends. + +## Problem description + +This code simulates the advection and reaction of three chemical species where +the reaction mechanism is a variation of the Brusselator problem from chemical +kinetics. 
The PDE system is given by +```math +\begin{align} + u_t &= -c \nabla u + A - (w+1) u + v u^2 \\ + v_t &= -c \nabla v + w u - v u^2 \\ + w_t &= -c \nabla w + (B - w) / \epsilon - w u +\end{align} +``` +where $u$, $v$, and $w$ are chemical concentrations, $c$ is the advection speed, +$A$ and $B$ are the concentrations of chemical species that remain constant over +space and time, and $\epsilon$ is a parameter that varies the stiffness of the +system. The problem is solved on the domain $(x,y,z) = X$ in $[0, X_{\text{max}}]^3$ with periodic boundary conditions, +for times $t$ in $[0,t_f]$. The initial condition is +```math +\begin{align} + u(0,X) &= A + p(X) \\ + v(0,X) &= B / A + p(X) \\ + w(0,X) &= 3.0 + p(X) +\end{align} +``` +where the perturbation function is +```math + p(X) = \alpha \exp\left( -\frac{(X-\mu)^T \sigma^{-1} (X-\mu)}{2 \sqrt{|\sigma| 8 \pi^3}} \right) +``` +with $\alpha = 0.1$, $\mu = 0.5 X_{\text{max}}$, and $\sigma$ a diagonal +matrix with entries $0.25 X_{\text{max}}$. + +Spatial derivatives are discretized with first-order upwind finite differences +on a uniform spatial grid. The system can be evolved in time using explicit, +implicit, or IMEX methods from ARKODE, Adams or BDF methods from CVODE, or BDF +methods from IDA. When using an IMEX method, advection is treated explicitly and +reactions implicitly. + +The nonlinear system(s) that arise in each time step may be solved using a +global Newton method with a matrix-free GMRES linear solver or an Anderson +accelerated fixed-point method. When using an IMEX method, a custom task-local +nonlinear solver that leverages the locality of the reaction systems may also be +used. + +## Options + +Several command line options are available to change the problem parameters +as well as the integrator and solver options. A summary of the options is +listed below.
+ +| Option | Description | Default | +|:----------------------------|:------------------------------------------------------------------------------|:------------| +| `--help` | Print the command line options and description | -- | +| `--dont-save` | Do not save the solution to disk | Save | +| `--output-dir <dir>` | Directory where all output files will be written | `.` | +| `--nout <int>` | Number of output times | 10 | +| `--npts <int>` | Number of mesh points in each direction | 100 | +| `--npxyz <int> <int> <int>` | Number of MPI tasks in each direction (0 forces MPI to decide) | 0 0 0 | +| `--xmax <real>` | Maximum value of `x`, `y`, and `z`, i.e., $X_{\text{max}}$ | 1.0 | +| `--A <real>` | Constant concentration of species `A` | 1.0 | +| `--B <real>` | Constant concentration of species `B` | 3.5 | +| `--c <real>` | Advection speed `c` | 0.01 | +| `--order <int>` | Integration method order | 3 | +| `--method <method>` | Integrator to use: `ERK`, `ARK-DIRK`, `ARK-IMEX`, `CV-BDF`, `CV-ADAMS`, `IDA` | `ARK-DIRK` | +| `--nls <solver>` | Nonlinear Solver Method: `newton`, `tl-newton`, `fixedpoint`, `none` | `newton` | +| `--fpaccel <int>` | Number of fixed point acceleration vectors | 3 | +| `--nopre` | Disable preconditioning | False | +| `--fused` | Enable fused operations | Off | +| `--tf <real>` | Final integration time `t_f` | 10.0 | +| `--rtol <real>` | Relative tolerance | 1.0e-6 | +| `--atol <real>` | Absolute tolerance | 1.0e-9 | + +## Building and Running + +To build the benchmark executables, SUNDIALS must be configured with ARKODE, +CVODE, and IDA enabled and with MPI and Kokkos support on. Additionally, either +CUDA or HIP support must be on to build executables utilizing NVIDIA or AMD +GPUs. See the installation guide for more details on configuring, building, +and installing SUNDIALS.
+ +Based on the configuration the following executables will be built and installed +in the `/advection_reaction_3D/kokkos` directory: + +* `advection_reaction_3D_kokkos.SERIAL` -- MPI parallelism +* `advection_reaction_3D_kokkos.OPENMP` -- MPI + OpenMP parallelism +* `advection_reaction_3D_kokkos.CUDA` -- MPI + CUDA parallelism +* `advection_reaction_3D_kokkos.HIP` -- MPI + HIP parallelism + +On Summit, with the default environment +``` + Compiler: xl/16.1.1-5 + MPI: spectrum-mpi/10.3.1.2-20200121 + CUDA: cuda/10.1.243 +``` +an example `jsrun` command is +``` +jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_kokkos.CUDA +``` + +On Lassen, with the environment +``` + Compiler: gcc/8.3.1 + MPI: mvapich2/2021.05.28-cuda-11.1.1 + CUDA: cuda/11.1.1 +``` +an example `jsrun` command is +``` +jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_kokkos.CUDA +``` diff --git a/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp new file mode 100644 index 0000000000..fa9f2bcc94 --- /dev/null +++ b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.cpp @@ -0,0 +1,711 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): Daniel R. Reynolds @ SMU + * David J. Gardner, Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. + * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * ----------------------------------------------------------------------------- + * This benchmark problem simulates the advection and reaction of three + * chemical species, u, v, and w, in a three dimensional domain. 
The reaction + * mechanism is a variation of the Brusselator problem from chemical kinetics. + * This is a PDE system with 3 components, Y = [u,v,w], satisfying the + * equations, + * + * u_t = -c * dot(grad,u) + A - (w+1) * u + v * u^2 + * v_t = -c * dot(grad,v) + w * u - v * u^2 + * w_t = -c * dot(grad,w) + (B - w) / ep - w * u + * + * for t in [0,tf] and X = (x,y,z) in [0,xmax]^3 with periodic + * boundary conditions. The initial condition is + * + * u(0,X) = k1 * A / k4 + p(X) + * v(0,X) = k2 * k4 * B / (k1 * k3 * A) + p(X) + * w(0,X) = 3.0 + p(X) + * p(X) = alpha * e^( -((X - mu)^T Sigma^{-1} (X - mu)) / (2*sqrt(|Sigma|*(2pi)^3)) ) + * + * alpha = 0.1, mu = (xmax/2.0, xmax/2.0, xmax/2.0), and Sigma = diag(xmax/4.0). + * The reaction rates are set so k_1 = k_2 = k_3 = k_4 = k, and k_5 = k_6 + * = 1/5e-6. The spatial derivatives are discretized with first-order upwind + * finite differences. NOUT outputs are printed at equal intervals, and run + * statistics are printed at the end.
+ * + * Command line options: + * --help prints this message + * --dont-save do not save the solution to the filesystem at the nout interval (default is to save) + * --output-dir the directory where all output files will be written + * --nout number of output times + * --method ERK, ARK-DIRK (default), ARK-IMEX, CV-BDF, CV-ADAMS, IDA + * --nls nonlinear solver to use; options are newton, + * tl-newton (task-local newton), or fixedpoint + * --fpaccel the number of fixed-point acceleration vectors to use + * (only valid when using fixedpoint nonlinear solver) + * --nopre turn off preconditioning + * --order the method order to use + * --npts number of mesh points in each direction + * --xmax maximum value of x (size of domain) + * --tf final time + * --A A parameter value + * --B B parameter value + * --k reaction rates k1, k2, k3, and k4 (four values) + * --c advection speed + * --rtol relative tolerance + * --atol absolute tolerance + * --------------------------------------------------------------------------*/ + +#include "advection_reaction_3D.hpp" + + +/* Main Program */ +int main(int argc, char *argv[]) +{ + + SUNContext ctx; + + /* Initialize MPI */ + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Init(&argc, &argv); + + /* Create SUNDIALS context */ + SUNContext_Create((void*) &comm, &ctx); + + /* Initialize Kokkos */ + Kokkos::initialize(argc, argv); + { + + /* General problem variables */ + N_Vector y = NULL; /* empty solution vector */ + UserData udata(ctx); /* user data */ + UserOptions uopt; /* user options */ + int retval; /* reusable error-checking flag */ + + SUNDIALS_CXX_MARK_FUNCTION(udata.prof); + + /* Process input arguments and set up the problem */ + retval = SetupProblem(argc, argv, &udata, &uopt, ctx); + if (check_retval(&retval, "SetupProblem", 1, udata.myid)) MPI_Abort(comm, 1); + + /* Create solution vector (on-node and MPI-parallel versions) */ + SUNVector yloc{(unsigned int)udata.grid->neq, ctx}; + y = N_VMake_MPIPlusX(udata.comm, yloc, ctx); + if (check_retval((void *) y,
"N_VMake_MPIPlusX", 0, udata.myid)) MPI_Abort(comm, 1); + + /* Set the initial condition */ + retval = SetIC(y, &udata); + if (check_retval(&retval, "SetIC", 1, udata.myid)) MPI_Abort(comm, 1); + + /* Output spatial mesh to disk (add extra point for periodic BC) */ + if (udata.myid == 0 && uopt.nout > 0) + { + char fname[MXSTR]; + snprintf(fname, MXSTR, "%s/mesh.txt", uopt.outputdir); + udata.grid->MeshToFile(fname); + } + + /* Integrate in time */ + if (uopt.method == "ERK") retval = EvolveProblemExplicit(y, &udata, &uopt); + else if (uopt.method == "ARK-DIRK") retval = EvolveProblemDIRK(y, &udata, &uopt); + else if (uopt.method == "ARK-IMEX") retval = EvolveProblemIMEX(y, &udata, &uopt); + else if (uopt.method == "CV-BDF") retval = EvolveProblemBDF(y, &udata, &uopt); + else if (uopt.method == "CV-ADAMS") retval = EvolveProblemAdams(y, &udata, &uopt); + else if (uopt.method == "IDA") retval = EvolveDAEProblem(y, &udata, &uopt); + if (check_retval(&retval, "Evolve", 1, udata.myid)) MPI_Abort(comm, 1); + + /* Clean up */ + N_VDestroy(y); + } + Kokkos::finalize(); + SUNContext_Free(&ctx); + MPI_Finalize(); + return(0); +} + + +/* Destructor for problem data */ +UserData::~UserData() +{ + /* close output streams */ + if (uopt->nout > 0) + { + if (UFID) fclose(UFID); + if (VFID) fclose(VFID); + if (WFID) fclose(WFID); + if (TFID && myid == 0) fclose(TFID); + } + + /* free solution masks */ + if (umask != nullptr) { + N_VDestroy(umask); + umask = nullptr; + } + if (vmask != nullptr) { + N_VDestroy(vmask); + vmask = nullptr; + } + if (wmask != nullptr) { + N_VDestroy(wmask); + wmask = nullptr; + } + + /* free the parallel grid */ + delete grid; +} + + +/* -------------------------------------------------------------- + * Communication functions + * --------------------------------------------------------------*/ + +/* Fills send buffers before exchanging neighbor information */ +int FillSendBuffers(N_Vector y, UserData* udata) +{ + + /* Shortcuts */ + const realtype c = 
udata->c; + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + + /* Create 4D view of the vector */ + Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof); + + if (c > 0.0) + { + + /* Flow moving in the positive directions uses backward difference. */ + + /* Create 4D views of send buffers */ + Vec4D Esend(udata->grid->GetSendView("EAST"), 1, nyl, nzl, dof); + Vec4D Nsend(udata->grid->GetSendView("NORTH"), nxl, 1, nzl, dof); + Vec4D Fsend(udata->grid->GetSendView("FRONT"), nxl, nyl, 1, dof); + + /* Fill buffers on device */ + Kokkos::parallel_for("FillEastBuffer", + Range3D({0,0,0},{nyl,nzl,dof}), + KOKKOS_LAMBDA (int j, int k, int l) { + Esend(0,j,k,l) = Yview(nxl-1,j,k,l); + }); + Kokkos::parallel_for("FillNorthBuffer", + Range3D({0,0,0},{nxl,nzl,dof}), + KOKKOS_LAMBDA (int i, int k, int l) { + Nsend(i,0,k,l) = Yview(i,nyl-1,k,l); + }); + Kokkos::parallel_for("FillFrontBuffer", + Range3D({0,0,0},{nxl,nyl,dof}), + KOKKOS_LAMBDA (int i, int j, int l) { + Fsend(i,j,0,l) = Yview(i,j,nzl-1,l); + }); + + } + else if (c < 0.0) + { + + /* Flow moving in the negative directions uses forward difference. 
*/ + + /* Create 4D views of send buffers */ + Vec4D Wsend(udata->grid->GetSendView("WEST"), 1, nyl, nzl, dof); + Vec4D Ssend(udata->grid->GetSendView("SOUTH"), nxl, 1, nzl, dof); + Vec4D Bsend(udata->grid->GetSendView("BACK"), nxl, nyl, 1, dof); + + /* Fill buffers on device */ + Kokkos::parallel_for("FillWestBuffer", + Range3D({0,0,0},{nyl,nzl,dof}), + KOKKOS_LAMBDA (int j, int k, int l) { + Wsend(0,j,k,l) = Yview(0,j,k,l); + }); + Kokkos::parallel_for("FillSouthBuffer", + Range3D({0,0,0},{nxl,nzl,dof}), + KOKKOS_LAMBDA (int i, int k, int l) { + Ssend(i,0,k,l) = Yview(i,0,k,l); + }); + Kokkos::parallel_for("FillBackBuffer", + Range3D({0,0,0},{nxl,nyl,dof}), + KOKKOS_LAMBDA (int i, int j, int l) { + Bsend(i,j,0,l) = Yview(i,j,0,l); + }); + + } + + return(0); +} + + +/* -------------------------------------------------------------- + * Problem setup + * --------------------------------------------------------------*/ + +/* Parses the CLI arguments */ +int ParseArgs(int argc, char *argv[], UserData* udata, UserOptions* uopt) +{ + /* check for input args */ + if (argc > 1) + { + /* loop over input args and get value */ + for (int i = 1; i < argc; i++) + { + string argvi(argv[i]); + + if (argvi.compare("--help") == 0) + { + InputError(argv[0]); + return(-1); + } + else if (argvi.compare("--nout") == 0) + { + uopt->nout = atoi(argv[++i]); + } + else if (argvi.compare("--dont-save") == 0) + { + uopt->save = 0; + } + else if (argvi.compare("--output-dir") == 0) + { + if (strlen(argv[i+1]) > MXSTR) + { + if (udata->myid == 0) + fprintf(stderr, "ERROR: output directory string is too long\n"); + return(-1); + } + uopt->outputdir = argv[++i]; + } + else if (argvi.compare("--npts") == 0) + { + uopt->npts = atoi(argv[++i]); + } + else if (argvi.compare("--npxyz") == 0) + { + uopt->npxyz[0] = atoi(argv[++i]); + uopt->npxyz[1] = atoi(argv[++i]); + uopt->npxyz[2] = atoi(argv[++i]); + } + else if (argvi.compare("--xmax") == 0) + { + udata->xmax = strtod(argv[++i], NULL); + } + 
else if (argvi.compare("--A") == 0) + { + udata->A = strtod(argv[++i], NULL); + } + else if (argvi.compare("--B") == 0) + { + udata->B = strtod(argv[++i], NULL); + } + else if (argvi.compare("--k") == 0) + { + udata->k1 = strtod(argv[++i], NULL); + udata->k2 = strtod(argv[++i], NULL); + udata->k3 = strtod(argv[++i], NULL); + udata->k4 = strtod(argv[++i], NULL); + } + else if (argvi.compare("--c") == 0) + { + udata->c = strtod(argv[++i], NULL); + } + else if (argvi.compare("--order") == 0) + { + uopt->order = atoi(argv[++i]); + } + else if (argvi.compare("--method") == 0) + { + uopt->method = string(argv[++i]); + if (uopt->method != "ERK" && + uopt->method != "ARK-DIRK" && + uopt->method != "ARK-IMEX" && + uopt->method != "CV-BDF" && + uopt->method != "CV-ADAMS" && + uopt->method != "IDA") + { + fprintf(stderr, "ERROR: unknown method\n"); + InputError(argv[0]); + return(-1); + } + } + else if (argvi.compare("--fpaccel") == 0) + { + uopt->fpaccel = atoi(argv[++i]); + } + else if (argvi.compare("--nls") == 0) + { + uopt->nls = string(argv[++i]); + if (uopt->nls != "newton" && + uopt->nls != "tl-newton" && + uopt->nls != "fixedpoint" && + uopt->nls != "none") + { + fprintf(stderr, "ERROR: unknown nls\n"); + InputError(argv[0]); + return(-1); + } + } + else if (argvi.compare("--nopre") == 0) + { + uopt->precond = 0; + } + else if (argvi.compare("--fused") == 0) + { + uopt->fused = 1; + } + else if (argvi.compare("--tf") == 0) + { + uopt->tf = strtod(argv[++i], NULL); + } + else if (argvi.compare("--rtol") == 0) + { + uopt->rtol = strtod(argv[++i], NULL); + } + else if (argvi.compare("--atol") == 0) + { + uopt->atol = strtod(argv[++i], NULL); + } + else + { + InputError(argv[0]); + return(-1); + } + } + } + + /* Explicit method uses no nonlinear solver */ + if (uopt->method == "ERK") + uopt->nls = "none"; + + /* CV Adams method only uses fixedpoint nonlinear solver */ + if (uopt->method == "CV-ADAMS") + uopt->nls = "fixedpoint"; + + return(0); +} + + +/* Fills the mask 
vector for the component so that + u = y .* umask, v = y .* vmask, w = y .* wmask */ +int ComponentMask(N_Vector mask, const int component, const UserData* udata) +{ + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* Shortcuts */ + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + + /* Create 4D view of mask data */ + Vec4D maskview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(mask)), nxl, nyl, nzl, dof); + + /* Fill mask data */ + N_VConst(0.0, mask); + Kokkos::parallel_for("Fill_mask", + Range3D({0,0,0},{nxl,nyl,nzl}), + KOKKOS_LAMBDA (int i, int j, int k) + { + maskview(i,j,k,component) = 1.0; + }); + + return 0; +} + + +/* Parses the CLI arguments and sets up the problem */ +int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, + SUNContext ctx) +{ + + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* MPI variables */ + udata->comm = MPI_COMM_WORLD; + MPI_Comm_rank(udata->comm, &udata->myid); + MPI_Comm_size(udata->comm, &udata->nprocs); + + /* Default problem parameters */ + udata->add_reactions = true; + udata->xmax = 1.0; + udata->A = 1.0; + udata->B = 3.5; + udata->k1 = 1.0; + udata->k2 = 1.0; + udata->k3 = 1.0; + udata->k4 = 1.0; + udata->k5 = 1.0/5.0e-6; + udata->k6 = 1.0/5.0e-6; + udata->c = 0.01; + udata->uopt = uopt; + udata->TFID = NULL; + udata->UFID = NULL; + udata->VFID = NULL; + udata->WFID = NULL; + udata->nnlfi = 0; + + /* Set default integrator options */ + uopt->npxyz[0] = 0; /* number of processesors in x */ + uopt->npxyz[1] = 0; /* number of processesors in y */ + uopt->npxyz[2] = 0; /* number of processesors in z */ + uopt->npts = 100; /* number of mesh points in each direction */ + uopt->order = 3; /* method order */ + uopt->method = "ARK-DIRK"; /* stepper/method */ + uopt->t0 = 0.0; /* initial time */ + uopt->tf = 10.0; /* final time */ + uopt->rtol = 1.0e-6; /* relative tolerance */ + uopt->atol = 1.0e-9; /* absolute 
tolerance */ + uopt->nls = "newton"; /* default to newton, when appropriate */ + uopt->fpaccel = 3; /* default number of fixed point acceleration vectors */ + uopt->precond = 1; /* by default, precondition when appropriate */ + uopt->fused = 0; /* use fused vector ops */ + uopt->save = 1; /* save solution to disk */ + uopt->nout = 10; /* number of output times */ + uopt->outputdir = (char *) "."; /* output directory */ + + /* Parse CLI args and set udata/uopt appropriately */ + int retval = ParseArgs(argc, argv, udata, uopt); + if (check_retval((void*)&retval, "ParseArgs", 1, udata->myid)) return -1; + + /* Setup the parallel decomposition */ + const sunindextype npts[] = {uopt->npts, uopt->npts, uopt->npts}; + const realtype amax[] = {0.0, 0.0, 0.0}; + const realtype bmax[] = {udata->xmax, udata->xmax, udata->xmax}; + udata->grid = new ParallelGrid(&udata->comm, amax, bmax, npts, + 3, BoundaryType::PERIODIC, StencilType::UPWIND, udata->c, uopt->npxyz); + + /* Create the solution masks */ + SUNVector *umaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx); + udata->umask = N_VMake_MPIPlusX(udata->comm, *umaskloc, ctx); + if (check_retval((void *) udata->umask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1); + SUNVector *vmaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx); + udata->vmask = N_VMake_MPIPlusX(udata->comm, *vmaskloc, ctx); + if (check_retval((void *) udata->vmask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1); + SUNVector *wmaskloc = new SUNVector((unsigned int)udata->grid->neq, ctx); + udata->wmask = N_VMake_MPIPlusX(udata->comm, *wmaskloc, ctx); + if (check_retval((void *) udata->wmask, "N_VMake_MPIPlusX", 0, udata->myid)) MPI_Abort(udata->comm, 1); + ComponentMask(udata->umask, 0, udata); + ComponentMask(udata->vmask, 1, udata); + ComponentMask(udata->wmask, 2, udata); + + /* Open output files for results */ + if (uopt->save) + { + char fname[MXSTR]; + if (udata->myid == 0) + { + sprintf(fname, 
"%s/t.%06d.txt", uopt->outputdir, udata->myid); + udata->TFID = fopen(fname, "w"); + } + + sprintf(fname, "%s/u.%06d.txt", uopt->outputdir, udata->myid); + udata->UFID = fopen(fname, "w"); + + sprintf(fname, "%s/v.%06d.txt", uopt->outputdir, udata->myid); + udata->VFID = fopen(fname, "w"); + + sprintf(fname, "%s/w.%06d.txt", uopt->outputdir, udata->myid); + udata->WFID = fopen(fname, "w"); + } + + /* Print problem setup */ + if (udata->myid == 0) + { + printf("\n\t\tAdvection-Reaction Test Problem\n\n"); + printf("Using the MPI+Kokkos NVECTOR"); +#if defined(USE_CUDA) + printf(" with the CUDA back-end\n"); +#elif defined(USE_HIP) + printf(" with the HIP back-end\n"); +#elif defined(USE_OPENMP) + printf(" with the OpenMP back-end and %i threads\n", omp_get_max_threads()); +#else + printf(" with the serial back-end\n"); +#endif + printf("Number of Processors = %li\n", (long int) udata->nprocs); + udata->grid->PrintInfo(); + printf("Problem Parameters:\n"); + printf(" A = %g\n", udata->A); + printf(" B = %g\n", udata->B); + printf(" k = %g\n", udata->k1); + printf(" c = %g\n", udata->c); + printf("Integrator Options:\n"); + printf(" order = %d\n", uopt->order); + printf(" method = %s\n", uopt->method.c_str()); + printf(" nonlinear solver = %s\n", uopt->nls.c_str()); + printf(" fpaccel = %d\n", uopt->fpaccel); + printf(" preconditioner = %d\n", uopt->precond); + printf(" fused vector ops = %d\n", uopt->fused); + printf(" t0 = %g\n", uopt->t0); + printf(" tf = %g\n", uopt->tf); + printf(" reltol = %.1e\n", uopt->rtol); + printf(" abstol = %.1e\n", uopt->atol); + printf(" nout = %d\n", uopt->nout); + printf("Output directory: %s\n", uopt->outputdir); + } + + + /* return success */ + return(0); +} + + +/* Compute the 3D Gaussian function. 
*/ +KOKKOS_FUNCTION +void Gaussian3D(realtype& x, realtype& y, realtype& z, realtype xmax) +{ + /* Gaussian distribution defaults */ + const realtype alpha = 0.1; + const realtype mu[] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) }; + const realtype sigma[] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma) + + /* denominator = 2*sqrt(|Sigma|*(2pi)^3) */ + const realtype denom = 2.0 * sqrt((sigma[0]*sigma[1]*sigma[2])*pow(2*M_PI,3)); + x = alpha * exp( -((x - mu[0])*(x - mu[0])*(1.0/sigma[0])) / denom ); + y = alpha * exp( -((y - mu[1])*(y - mu[1])*(1.0/sigma[1])) / denom ); + z = alpha * exp( -((z - mu[2])*(z - mu[2])*(1.0/sigma[2])) / denom ); +} + + +/* Initial condition function */ +int SetIC(N_Vector y, UserData* udata) +{ + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* Variable shortcuts */ + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + const realtype dx = udata->grid->dx; + const realtype dy = udata->grid->dy; + const realtype dz = udata->grid->dz; + const realtype xmax = udata->xmax; + const realtype A = udata->A; + const realtype B = udata->B; + const realtype k1 = udata->k1; + const realtype k2 = udata->k2; + const realtype k3 = udata->k3; + const realtype k4 = udata->k4; + const int xcrd = udata->grid->coords[0]; + const int ycrd = udata->grid->coords[1]; + const int zcrd = udata->grid->coords[2]; + + /* Steady state solution */ + const realtype us = k1 * A / k4; + const realtype vs = k2 * k4 * B / (k1 * k3 * A); + const realtype ws = 3.0; + + /* Create 4D view of y */ + Vec4D yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof); + + /* Gaussian perturbation of the steady state solution */ + Kokkos::parallel_for("SetIC", + Range3D({0,0,0},{nxl,nyl,nzl}), + KOKKOS_LAMBDA (int i, int j, int k) + { + realtype x = (xcrd * nxl + i) * dx; + realtype y = (ycrd * nyl + j) * dy; + realtype z 
= (zcrd * nzl + k) * dz; + Gaussian3D(x,y,z,xmax); + const realtype p = x + y + z; + yview(i,j,k,0) = us + p; + yview(i,j,k,1) = vs + p; + yview(i,j,k,2) = ws + p; + }); + + /* Return success */ + return(0); +} + + +/* Write time and solution to disk */ +int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt) +{ + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* output current solution norm to screen */ + realtype N = (realtype) udata->grid->npts(); + realtype u = N_VWL2Norm(y, udata->umask); + u = sqrt(u*u/N); + realtype v = N_VWL2Norm(y, udata->vmask); + v = sqrt(v*v/N); + realtype w = N_VWL2Norm(y, udata->wmask); + w = sqrt(w*w/N); + if (udata->myid == 0) { + printf(" %10.6f %10.6f %10.6f %10.6f\n", t, u, v, w); + std::fflush(stdout); + } + + if (uopt->save) + { + /* Copy solution data to host mirror view */ + SUNVector* ylocal = sundials::kokkos::GetVec(N_VGetLocalVector_MPIPlusX(y)); + sundials::kokkos::CopyFromDevice(*ylocal); + + /* output the times to disk */ + if (udata->myid == 0 && udata->TFID) { + fprintf(udata->TFID," %.16e\n", t); + std::fflush(udata->TFID); + } + + /* create 4D view of host data */ + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + Vec4DHost yview(N_VGetArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof); + + /* output results to disk */ + for (int i = 0; i < nxl; i++) + for (int j = 0; j < nyl; j++) + for (int k = 0; k < nzl; k++) { + fprintf(udata->UFID," %.16e", yview(i,j,k,0)); + fprintf(udata->VFID," %.16e", yview(i,j,k,1)); + fprintf(udata->WFID," %.16e", yview(i,j,k,2)); + } + + fprintf(udata->UFID,"\n"); + fprintf(udata->VFID,"\n"); + fprintf(udata->WFID,"\n"); + std::fflush(udata->UFID); + std::fflush(udata->VFID); + std::fflush(udata->WFID); + } + + return(0); +} + + +void InputError(char *name) +{ + int myid; + + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + + if (myid == 0) + { + fprintf(stderr, 
"\nERROR: Invalid command line input\n"); + fprintf(stderr, "\nCommand line options for %s\n",name); + fprintf(stderr, " --help prints this message\n"); + fprintf(stderr, " --output-dir the directory where all output files will be written (default is the CWD)\n"); + fprintf(stderr, " --nout number of output times to print (default is 10)\n"); + fprintf(stderr, " --dont-save do not save the solution to the filesystem at the nout interval (default is to save)\n"); + fprintf(stderr, " --method ERK, ARK-DIRK, ARK-IMEX (default), CV-BDF, CV-ADAMS, IDA\n"); + fprintf(stderr, " --fpaccel the number of fixed-point acceleration vectors to use (only valid when using fixedpoint nonlinear solver)\n"); + fprintf(stderr, " --nls nonlinear solver to use (newton, tl-newton (task-local newton), fixedpoint)\n"); + fprintf(stderr, " --nopre do not precondition the linear system\n"); + fprintf(stderr, " --order the method order to use\n"); + fprintf(stderr, " --npts number of mesh points in each direction\n"); + fprintf(stderr, " --npxyz number of processors in each direction (0 forces MPI to decide)\n"); + fprintf(stderr, " --xmax maximum value of x (size of domain)\n"); + fprintf(stderr, " --tf final time\n"); + fprintf(stderr, " --A A parameter value\n"); + fprintf(stderr, " --B B parameter value\n"); + fprintf(stderr, " --k reaction rate\n"); + fprintf(stderr, " --c advection speed\n"); + fprintf(stderr, " --rtol relative tolerance\n"); + fprintf(stderr, " --atol absolute tolerance\n"); + } + + MPI_Barrier(MPI_COMM_WORLD); +} diff --git a/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp new file mode 100644 index 0000000000..cb0dceea64 --- /dev/null +++ b/benchmarks/advection_reaction_3D/kokkos/advection_reaction_3D.hpp @@ -0,0 +1,171 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): Daniel R. Reynolds @ SMU + * David J. Gardner, Cody J. 
Balos @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2023, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------*/
+
+#ifndef ADVECTION_REACTION_3D_HPP
+#define ADVECTION_REACTION_3D_HPP
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include "nvector/nvector_kokkos.hpp"
+#include "check_retval.h"
+#include "ParallelGrid.hpp"
+
+/* Set SUNDIALS Kokkos vector shortcut */
+using SUNVector = sundials::kokkos::Vector;
+
+using sundials_tools::ParallelGrid;
+using sundials_tools::BoundaryType;
+using sundials_tools::StencilType;
+using std::string;
+
+/* Maximum size of output directory string */
+constexpr int MXSTR = 2048;
+
+/*
+ * Data structure for problem options
+ */
+
+struct UserOptions
+{
+  int npxyz[3];       /* number of processors in x,y,z  */
+  sunindextype npts;  /* number of spatial mesh points  */
+  realtype t0;        /* initial time                   */
+  realtype tf;        /* final time                     */
+  realtype rtol;      /* relative tolerance             */
+  realtype atol;      /* absolute tolerance             */
+  int order;          /* method order                   */
+  string method;      /* method string                  */
+  string nls;         /* nonlinear solver to use        */
+  int fpaccel;        /* number of fixedpoint vectors   */
+  int precond;        /* to precondition or not         */
+  int fused;          /* use fused vector ops           */
+  int nout;           /* number of outputs              */
+  int save;           /* save solution to disk          */
+  char* outputdir;    /* output directory; presumably at most MXSTR
+                         characters -- TODO confirm against SetupProblem */
+};
+
+
+/*
+ * Data structure for problem specific data
+ */
+
+struct UserData
+{
+  SUNContext ctx;
+  SUNProfiler prof;   /* profiler obtained from ctx in the constructor */
+
+  /* MPI data */
+  MPI_Comm comm;
+  int myid;
+  int nprocs;
+  MPI_Request req[2];
+
+  /* Should reactions be added to the advection or not */
+  bool add_reactions;
+
+  /* File handles for output */
+  FILE* TFID;     /* time output file pointer      */
+  FILE* UFID;     /* solution output file pointer  */
+  FILE* VFID;
+  FILE* WFID;
+
+  /* Solution masks */
+  N_Vector umask;
+  N_Vector vmask;
+  N_Vector wmask;
+
+  /* Problem parameters */
+  realtype xmax;  /* maximum x value               */
+  realtype A;     /* concentration of species A    */
+  realtype B;     /* w source rate                 */
+  realtype k1;    /* reaction rates                */
+  realtype k2;
+  realtype k3;
+  realtype k4;
+  realtype k5;
+  realtype k6;
+  realtype c;     /* advection coefficient         */
+
+  /* Parallel mesh */
+  ParallelGrid* grid;
+
+  /* Count of implicit function evals by the task local nonlinear solver */
+  long int nnlfi;
+
+  /* Integrator options */
+  UserOptions* uopt;
+
+  /* Constructor that takes the context.
+
+     Every member receives a defined value here, so that the destructor and
+     any early-exit path never see an indeterminate pointer, handle or flag
+     (previously only the masks, the file handles and uopt were set; grid in
+     particular was left uninitialized). The initializers are listed in
+     declaration order, which is the order in which they actually run. The
+     real problem values are expected to be filled in afterwards by the
+     setup code. */
+  UserData(SUNContext ctx)
+    : ctx(ctx), prof(nullptr),
+      comm(MPI_COMM_NULL), myid(0), nprocs(0),
+      req{MPI_REQUEST_NULL, MPI_REQUEST_NULL},
+      add_reactions(false),
+      TFID(nullptr), UFID(nullptr), VFID(nullptr), WFID(nullptr),
+      umask(nullptr), vmask(nullptr), wmask(nullptr),
+      xmax(0.0), A(0.0), B(0.0),
+      k1(0.0), k2(0.0), k3(0.0), k4(0.0), k5(0.0), k6(0.0), c(0.0),
+      grid(nullptr), nnlfi(0), uopt(nullptr)
+  {
+    SUNContext_GetProfiler(ctx, &prof);
+  }
+
+  /* destructor frees the problem data (defined out of line) */
+  ~UserData();
+};
+
+
+/*
+ * Functions to evolve the solution (defined by the drivers)
+ */
+
+/* function that does ARKStep setup and evolves the solution with a DIRK method */
+extern int EvolveProblemDIRK(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does ARKStep setup and evolves the solution with an IMEX method */
+extern int EvolveProblemIMEX(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does ERKStep setup and evolves the solution */
+extern int EvolveProblemExplicit(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does CVODE BDF setup and evolves the solution */
+extern int EvolveProblemBDF(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does CVODE Adams setup and evolves the solution */
+extern int EvolveProblemAdams(N_Vector y, UserData* udata, UserOptions* uopt);
+
+/* function that does IDA BDF setup and evolves the solution */
+extern int EvolveDAEProblem(N_Vector y, UserData*
udata, UserOptions* uopt); + + +/* + * Helper functions + */ + +/* function to set initial condition */ +int SetIC(N_Vector y, UserData* udata); + +/* function to fill neighbor data */ +int FillSendBuffers(N_Vector y, UserData* udata); + +/* functions for processing command line args */ +int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, + SUNContext ctx); +void InputError(char *name); +int ComponentMask(N_Vector mask, const int component, const UserData* udata); + +/* function to write solution to disk */ +int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt); + +#endif diff --git a/benchmarks/advection_reaction_3D/arkode_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp similarity index 98% rename from benchmarks/advection_reaction_3D/arkode_driver.cpp rename to benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp index bbea07956a..e2cf1451e3 100644 --- a/benchmarks/advection_reaction_3D/arkode_driver.cpp +++ b/benchmarks/advection_reaction_3D/kokkos/arkode_driver.cpp @@ -588,10 +588,7 @@ int TaskLocalLSolve(N_Vector delta, void* arkode_mem) SUNDIALS_CXX_MARK_FUNCTION(udata->prof); /* set up I - gamma*J and solve */ - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl), - RAJA::RangeSegment(0, udata->grid->nyl), - RAJA::RangeSegment(0, udata->grid->nzl)); - retval = SolveReactionLinSys(z, delta, delta, gamma, range, udata); + retval = SolveReactionLinSys(z, delta, delta, gamma, udata); return(retval); diff --git a/benchmarks/advection_reaction_3D/check_retval.h b/benchmarks/advection_reaction_3D/kokkos/check_retval.h similarity index 99% rename from benchmarks/advection_reaction_3D/check_retval.h rename to benchmarks/advection_reaction_3D/kokkos/check_retval.h index 31a4fa5922..887b7cea5d 100644 --- a/benchmarks/advection_reaction_3D/check_retval.h +++ b/benchmarks/advection_reaction_3D/kokkos/check_retval.h @@ -54,4 +54,4 @@ static int check_retval(void *returnvalue, 
const char *funcname, int opt, int my return(0); } -#endif \ No newline at end of file +#endif diff --git a/benchmarks/advection_reaction_3D/cvode_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/cvode_driver.cpp similarity index 100% rename from benchmarks/advection_reaction_3D/cvode_driver.cpp rename to benchmarks/advection_reaction_3D/kokkos/cvode_driver.cpp diff --git a/benchmarks/advection_reaction_3D/ida_driver.cpp b/benchmarks/advection_reaction_3D/kokkos/ida_driver.cpp similarity index 100% rename from benchmarks/advection_reaction_3D/ida_driver.cpp rename to benchmarks/advection_reaction_3D/kokkos/ida_driver.cpp diff --git a/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp b/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp new file mode 100644 index 0000000000..34698146ab --- /dev/null +++ b/benchmarks/advection_reaction_3D/kokkos/rhs3D.hpp @@ -0,0 +1,540 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): Daniel R. Reynolds @ SMU + * David J. Gardner, Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. + * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * -----------------------------------------------------------------------------*/ + +#ifndef ADVECTION_REACTION_3D_RHS_HPP +#define ADVECTION_REACTION_3D_RHS_HPP + +#include "advection_reaction_3D.hpp" + +/* -------------------------------------------------------------- + * Right hand side (RHS) and residual functions + * --------------------------------------------------------------*/ + +/* Compute the advection term f(t,y) = -c (grad * y). This is done using + upwind 1st order finite differences. 
At present, only periodic boundary
+   conditions are supported, which are handled via MPI's Cartesian
+   communicator (even for serial runs).
+
+   ydot is first set to zero and then filled in two phases: the points that
+   need no neighbor data are computed while the halo exchange is in flight,
+   and the upwind faces are computed once the exchange has completed. When
+   c == 0 neither phase runs and ydot is left at zero.
+
+   Returns 0 on success, or -1 when check_retval flags the result of
+   FillSendBuffers, ExchangeStart or ExchangeEnd. The argument t is unused. */
+static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts */
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
+  const realtype c = udata->c;
+  /* the minus sign of f = -c (grad * y) is folded into the coefficients */
+  const realtype cx = -c / udata->grid->dx;
+  const realtype cy = -c / udata->grid->dy;
+  const realtype cz = -c / udata->grid->dz;
+
+  /* local variables */
+  int retval;
+
+  /* fill send buffers and begin exchanging boundary information */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = FillSendBuffers(y, udata);
+  if (check_retval(&retval, "FillSendBuffers", 1, udata->myid))
+    return(-1);
+  retval = udata->grid->ExchangeStart();
+  if (check_retval(&retval, "ExchangeStart", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* set output to zero */
+  N_VConst(0.0, ydot);
+
+  /* create 4D views of the state and RHS vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D dYview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(ydot)), nxl, nyl, nzl, dof);
+
+  /* iterate over domain interior, computing advection; the interior kernels
+     address components 0, 1 and 2 explicitly rather than looping over dof */
+  if (c > 0.0)
+  {
+    /* flow moving in the positive x,y,z direction */
+    Kokkos::parallel_for("AdvectionInteriorRight",
+                         Range3D({1,1,1},{nxl,nyl,nzl}),
+                         KOKKOS_LAMBDA (int i, int j, int k)
+    {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0) = cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz
+      dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy
+      dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1) = cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz
+      dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy
+      dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2) = cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz
+      dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy
+      dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx
+    });
+  }
+  else if (c < 0.0)
+  {
+    /* flow moving in the negative x,y,z direction */
+    Kokkos::parallel_for("AdvectionInteriorLeft",
+                         Range3D({0,0,0},{nxl-1,nyl-1,nzl-1}),
+                         KOKKOS_LAMBDA (int i, int j, int k)
+    {
+      const realtype u_ijk = Yview(i,j,k,0);
+      const realtype v_ijk = Yview(i,j,k,1);
+      const realtype w_ijk = Yview(i,j,k,2);
+
+      // grad * u
+      dYview(i,j,k,0) = cz * (Yview(i,j,k+1,0) - u_ijk); // du/dz
+      dYview(i,j,k,0) += cy * (Yview(i,j+1,k,0) - u_ijk); // du/dy
+      dYview(i,j,k,0) += cx * (Yview(i+1,j,k,0) - u_ijk); // du/dx
+
+      // grad * v
+      dYview(i,j,k,1) = cz * (Yview(i,j,k+1,1) - v_ijk); // dv/dz
+      dYview(i,j,k,1) += cy * (Yview(i,j+1,k,1) - v_ijk); // dv/dy
+      dYview(i,j,k,1) += cx * (Yview(i+1,j,k,1) - v_ijk); // dv/dx
+
+      // grad * w
+      dYview(i,j,k,2) = cz * (Yview(i,j,k+1,2) - w_ijk); // dw/dz
+      dYview(i,j,k,2) += cy * (Yview(i,j+1,k,2) - w_ijk); // dw/dy
+      dYview(i,j,k,2) += cx * (Yview(i+1,j,k,2) - w_ijk); // dw/dx
+    });
+  }
+
+  /* finish exchanging boundary information */
+  SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange");
+  retval = udata->grid->ExchangeEnd();
+  if (check_retval(&retval, "ExchangeEnd", 1, udata->myid))
+    return(-1);
+  SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange");
+
+  /* compute advection at process boundaries; each face kernel loops over all
+     dof components, and where a neighbor in another direction is also off
+     this process the value is taken from that direction's receive buffer */
+  if (c > 0.0)
+  {
+    /* Flow moving in the positive x,y,z direction:
+       boundaries are west face, south face, and back face */
+
+    /* Create 4D views of receive buffers */
+    Vec4D Wrecv(udata->grid->GetRecvView("WEST"), 1, nyl, nzl, dof);
+    Vec4D Srecv(udata->grid->GetRecvView("SOUTH"), nxl, 1, nzl, dof);
+    Vec4D Brecv(udata->grid->GetRecvView("BACK"), nxl, nyl, 1, dof);
+
+    /* Perform calculations on each "lower" face */
+    Kokkos::parallel_for("AdvectionBoundaryWest",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l)
+    {
+      const int i = 0;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,0,k,l);
+      const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,0,l);
+      dYview(i,j,k,l) = cx * (Yijkl - Wrecv(0,j,k,l)); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack); // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundarySouth",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l)
+    {
+      const int j = 0;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(0,j,k,l);
+      const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,0,l);
+      dYview(i,j,k,l) = cx * (Yijkl - YWest); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - Srecv(i,0,k,l)); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - YBack); // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryBack",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l)
+    {
+      const int k = 0;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(0,j,k,l);
+      const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,0,k,l);
+      dYview(i,j,k,l) = cx * (Yijkl - YWest); // d/dx
+      dYview(i,j,k,l) += cy * (Yijkl - YSouth); // d/dy
+      dYview(i,j,k,l) += cz * (Yijkl - Brecv(i,j,0,l)); // d/dz
+    });
+
+  }
+  else if (c < 0.0)
+  {
+
+    /* Flow moving in the negative x,y,z direction:
+       boundaries are east face, north face, and front face */
+
+    /* Create 4D views of receive buffers */
+    Vec4D Erecv(udata->grid->GetRecvView("EAST"), 1, nyl, nzl, dof);
+    Vec4D Nrecv(udata->grid->GetRecvView("NORTH"), nxl, 1, nzl, dof);
+    Vec4D Frecv(udata->grid->GetRecvView("FRONT"), nxl, nyl, 1, dof);
+
+    /* Perform calculations on each "upper" face */
+    Kokkos::parallel_for("AdvectionBoundaryEast",
+                         Range3D({0,0,0},{nyl,nzl,dof}),
+                         KOKKOS_LAMBDA (int j, int k, int l)
+    {
+      const int i = nxl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,0,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,0,l);
+      dYview(i,j,k,l) = cx * (Erecv(0,j,k,l) - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl); // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryNorth",
+                         Range3D({0,0,0},{nxl,nzl,dof}),
+                         KOKKOS_LAMBDA (int i, int k, int l)
+    {
+      const int j = nyl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(0,j,k,l);
+      const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,0,l);
+      dYview(i,j,k,l) = cx * (YEast - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (Nrecv(i,0,k,l) - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (YFront - Yijkl); // d/dz
+    });
+    Kokkos::parallel_for("AdvectionBoundaryFront",
+                         Range3D({0,0,0},{nxl,nyl,dof}),
+                         KOKKOS_LAMBDA (int i, int j, int l)
+    {
+      const int k = nzl-1;
+      const realtype Yijkl = Yview(i,j,k,l);
+      const realtype YEast = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(0,j,k,l);
+      const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,0,k,l);
+      dYview(i,j,k,l) = cx * (YEast - Yijkl); // d/dx
+      dYview(i,j,k,l) += cy * (YNorth - Yijkl); // d/dy
+      dYview(i,j,k,l) += cz * (Frecv(i,j,0,l) - Yijkl); // d/dz
+    });
+  }
+
+  /* return success */
+  return(0);
+}
+
+
+/* Compute the reaction term g(t,y).
+
+   The reaction terms are accumulated into ydot with +=. When
+   udata->add_reactions is false, ydot is zeroed first so that the result is
+   g(t,y) alone; when it is true, g(t,y) is added to whatever ydot already
+   holds. Always returns 0. The argument t is unused. */
+static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data)
+{
+  /* access problem data */
+  UserData* udata = (UserData*) user_data;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* set variable shortcuts */
+  const realtype A = udata->A;
+  const realtype B = udata->B;
+  const realtype k1 = udata->k1;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k5 = udata->k5;
+  const realtype k6 = udata->k6;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const int dof = udata->grid->dof;
+
+  /* Zero output if not adding reactions to existing RHS */
+  if (!udata->add_reactions)
+    N_VConst(0.0, ydot);
+
+  /* create 4D views of state and RHS vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D dYview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(ydot)), nxl, nyl, nzl, dof);
+
+  /* add reaction terms to RHS (purely local: no neighbor data is needed) */
+  Kokkos::parallel_for("ReactionRHS",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+    dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u;
+    dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v;
+    dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w;
+  });
+
+  /* return success */
+  return(0);
+}
+
+
+/* Compute the RHS as h(t,y) = f(t,y) + g(t,y).
*/
+/* Returns 0 on success, or -1 as soon as check_retval flags the result of
+   either stage. */
+static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot,
+                             void *user_data)
+{
+  UserData* problem = (UserData*) user_data;
+
+  /* Stage 1: advection. It must come first because it overwrites ydot,
+     whereas the reaction stage accumulates into it. */
+  int status = Advection(t, y, ydot, user_data);
+  if (check_retval((void *)&status, "Advection", 1, problem->myid))
+  {
+    return(-1);
+  }
+
+  /* Stage 2: reaction, added on top of the advection result. */
+  status = Reaction(t, y, ydot, user_data);
+  if (check_retval((void *)&status, "Reaction", 1, problem->myid))
+  {
+    return(-1);
+  }
+
+  return(0);
+}
+
+/* Residual form of the same right-hand side: F(t,y,y') = ydot - h(t,y).
+   F is used as scratch for h(t,y) and then overwritten with the residual.
+   Returns 0 on success, or -1 if either stage is flagged by check_retval. */
+static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot,
+                                     N_Vector F, void *user_data)
+{
+  UserData* problem = (UserData*) user_data;
+
+  /* As above, advection has to be evaluated before reaction. */
+
+  /* F <- -c y_x */
+  int status = Advection(t, y, F, user_data);
+  if (check_retval((void *)&status, "Advection", 1, problem->myid))
+  {
+    return(-1);
+  }
+
+  /* F <- -c y_x + g(t,y) */
+  status = Reaction(t, y, F, user_data);
+  if (check_retval((void *)&status, "Reaction", 1, problem->myid))
+  {
+    return(-1);
+  }
+
+  /* F <- ydot - h(t,y) = ydot + c y_x - g(t,y) */
+  N_VLinearSum(1.0, ydot, -1.0, F, F);
+
+  return(0);
+}
+
+/* --------------------------------------------------------------
+ * Linear system and Jacobian functions
+ * --------------------------------------------------------------*/
+
+/* Solve the linear systems Ax = b where A = I - gamma*dg/dy.
+   When using a fully implicit method, we are approximating
+   dh/dy as dg/dy.
*/
+/* The system is block diagonal: one independent 3x3 solve for (u,v,w) at
+   every local grid point, done in closed form inside a single kernel.
+
+     y     - state at which dg/dy is evaluated
+     x     - output, the solution of the linear system
+     b     - right-hand side
+     gamma - scaling of the Jacobian in A
+     udata - problem data (grid sizes and reaction rates k2, k3, k4, k6)
+
+   x and b may be the same vector (TaskLocalLSolve in arkode_driver.cpp
+   passes one vector for both). For that reason the three right-hand side
+   entries of a grid point are read into locals before any entry of x is
+   stored; reading them from the view afterwards would pick up solution
+   values already written. Always returns 0. */
+static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b,
+                               const realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k6 = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D Bview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(b)), nxl, nyl, nzl, dof);
+  Vec4D Xview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(x)), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  Kokkos::parallel_for("SolveReactionLinSys",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    /* right-hand side of the block, captured before x is written because
+       Xview and Bview may address the same memory */
+    const realtype b0 = Bview(i,j,k,0);
+    const realtype b1 = Bview(i,j,k,1);
+    const realtype b2 = Bview(i,j,k,2);
+
+    //
+    // compute A = I - gamma*(dg/dy)
+    //
+
+    /* 1st row: u, v, w */
+    const realtype A0 = 1. - gamma * (-k2 * w + 2.0 * k3 * u * v - k4);
+    const realtype A1 = -gamma * (k3 * u * u);
+    const realtype A2 = -gamma * (-k2 * u);
+
+    /* 2nd row: u, v, w */
+    const realtype A3 = -gamma * (k2 * w - 2.0 * k3 * u * v);
+    const realtype A4 = 1. - gamma * (-k3 * u * u);
+    const realtype A5 = -gamma * (k2 * u);
+
+    /* 3rd row: u, v, w */
+    const realtype A6 = -gamma * (-k2 * w);
+    const realtype A7 = 0.0;
+    const realtype A8 = 1. - gamma * (-k2 * u - k6);
+
+    //
+    // compute x = A^{-1}*b
+    //
+
+    const realtype scratch_0 = A4*A8;
+    const realtype scratch_1 = A1*A5;
+    const realtype scratch_2 = A2*A7;
+    const realtype scratch_3 = A5*A7;
+    const realtype scratch_4 = A1*A8;
+    const realtype scratch_5 = A2*A4;
+    const realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    const realtype scratch_7 = A2*A3;
+    const realtype scratch_8 = A6*b0;
+    const realtype scratch_9 = A2*A6;
+    const realtype scratch_10 = A3*b0;
+    const realtype scratch_11 = 1.0/A0;
+    const realtype scratch_12 = A1*scratch_11;
+    const realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( b0*(scratch_0 - scratch_3)
+                               + b1*(scratch_2 - scratch_4)
+                               + b2*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( b2*(scratch_7 - A0*A5)
+                               + b1*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -b2 + scratch_11*scratch_8
+                     + scratch_13*(b1 - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9
+                      + scratch_13*(A5 - scratch_11*scratch_7));
+
+  });
+
+  return(0);
+}
+
+/* Solve the linear systems Ax = b where A = -dg/dy + gamma.
+   We are approximating dh/dy as dg/dy.
*/
+/* Residual-form counterpart of SolveReactionLinSys: one closed-form 3x3
+   solve for (u,v,w) at every local grid point.
+
+     y     - state at which dg/dy is evaluated
+     x     - output, the solution of the linear system
+     b     - right-hand side
+     gamma - scalar added to the diagonal of -dg/dy
+     udata - problem data (grid sizes and reaction rates k2, k3, k4, k6)
+
+   The right-hand side entries of a grid point are read into locals before
+   any entry of x is stored, so the routine stays correct if a caller passes
+   the same vector for x and b. The kernel carries its own label so that it
+   can be told apart from SolveReactionLinSys in profiling output.
+   Always returns 0. */
+static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b,
+                                  const realtype gamma, UserData* udata)
+{
+  /* set variable shortcuts */
+  const int dof = udata->grid->dof;
+  const int nxl = udata->grid->nxl;
+  const int nyl = udata->grid->nyl;
+  const int nzl = udata->grid->nzl;
+  const realtype k2 = udata->k2;
+  const realtype k3 = udata->k3;
+  const realtype k4 = udata->k4;
+  const realtype k6 = udata->k6;
+
+  /* create 4D views of state, RHS and solution vectors */
+  Vec4D Yview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(y)), nxl, nyl, nzl, dof);
+  Vec4D Bview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(b)), nxl, nyl, nzl, dof);
+  Vec4D Xview(N_VGetDeviceArrayPointer(N_VGetLocalVector_MPIPlusX(x)), nxl, nyl, nzl, dof);
+
+  /* solve reaction linear system */
+  Kokkos::parallel_for("SolveReactionLinSysRes",
+                       Range3D({0,0,0},{nxl,nyl,nzl}),
+                       KOKKOS_LAMBDA (int i, int j, int k)
+  {
+
+    /* shortcuts to u, v, w for the block */
+    const realtype u = Yview(i,j,k,0);
+    const realtype v = Yview(i,j,k,1);
+    const realtype w = Yview(i,j,k,2);
+
+    /* right-hand side of the block, captured before x is written because
+       Xview and Bview may address the same memory */
+    const realtype b0 = Bview(i,j,k,0);
+    const realtype b1 = Bview(i,j,k,1);
+    const realtype b2 = Bview(i,j,k,2);
+
+    //
+    // compute A = -dg/dy + gamma*diag(df/dydot)
+    // where diag(df/dydot) is approximated as
+    // diag([udot, vdot, wdot])
+    //
+
+    /* 1st row: u, v, w */
+    const realtype A0 = -(-k2 * w + 2.0 * k3 * u * v - k4) + gamma;
+    const realtype A1 = -(k3 * u * u);
+    const realtype A2 = -(-k2 * u);
+
+    /* 2nd row: u, v, w */
+    const realtype A3 = -(k2 * w - 2.0 * k3 * u * v);
+    const realtype A4 = -(-k3 * u * u) + gamma;
+    const realtype A5 = -(k2 * u);
+
+    /* 3rd row: u, v, w */
+    const realtype A6 = -(-k2 * w);
+    const realtype A7 = 0.0;
+    const realtype A8 = -(-k2 * u - k6) + gamma;
+
+    //
+    // compute x = A^{-1}*b
+    //
+
+    const realtype scratch_0 = A4*A8;
+    const realtype scratch_1 = A1*A5;
+    const realtype scratch_2 = A2*A7;
+    const realtype scratch_3 = A5*A7;
+    const realtype scratch_4 = A1*A8;
+    const realtype scratch_5 = A2*A4;
+    const realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5);
+    const realtype scratch_7 = A2*A3;
+    const realtype scratch_8 = A6*b0;
+    const realtype scratch_9 = A2*A6;
+    const realtype scratch_10 = A3*b0;
+    const realtype scratch_11 = 1.0/A0;
+    const realtype scratch_12 = A1*scratch_11;
+    const realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4);
+
+    Xview(i,j,k,0) = scratch_6*( b0*(scratch_0 - scratch_3)
+                               + b1*(scratch_2 - scratch_4)
+                               + b2*(scratch_1 - scratch_5));
+    Xview(i,j,k,1) = scratch_6*( b2*(scratch_7 - A0*A5)
+                               + b1*(A0*A8 - scratch_9)
+                               + A5*scratch_8 - A8*scratch_10 );
+    Xview(i,j,k,2) = ( -b2 + scratch_11*scratch_8
+                     + scratch_13*(b1 - scratch_10*scratch_11)) /
+                     (-A8 + scratch_11*scratch_9
+                      + scratch_13*(A5 - scratch_11*scratch_7));
+
+  });
+
+  return(0);
+}
+
+
+/* --------------------------------------------------------------
+ * Preconditioner functions
+ * --------------------------------------------------------------*/
+
+/* Solves Pz = r where P = I - gamma * dg/dy.
+   Forwards (y, z, r, gamma) to SolveReactionLinSys and returns its result;
+   t, ydot, delta and lr are not used. */
+static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r,
+                  N_Vector z, realtype gamma, realtype delta, int lr,
+                  void *user_data)
+{
+  /* local variables */
+  UserData* udata = (UserData*) user_data;
+  int retval;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* solve the task-local linear system Pz = r */
+  retval = SolveReactionLinSys(y, z, r, gamma, udata);
+
+  return(retval);
+}
+
+/* Solves Pz = r where P = -dg/dy + gamma.
+   Forwards (y, z, r, cj) to SolveReactionLinSysRes and returns its result;
+   t, ydot, F and delta are not used. */
+static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F,
+                     N_Vector r, N_Vector z, realtype cj, realtype delta,
+                     void *user_data)
+{
+  /* local variables */
+  UserData* udata = (UserData*) user_data;
+  int retval;
+
+  SUNDIALS_CXX_MARK_FUNCTION(udata->prof);
+
+  /* solve the task-local linear system Pz = r */
+  retval = SolveReactionLinSysRes(y, z, r, cj, udata);
+
+  return(retval);
+}
+
+
+#endif
diff
--git a/benchmarks/advection_reaction_3D/raja/CMakeLists.txt b/benchmarks/advection_reaction_3D/raja/CMakeLists.txt new file mode 100644 index 0000000000..0bae78c562 --- /dev/null +++ b/benchmarks/advection_reaction_3D/raja/CMakeLists.txt @@ -0,0 +1,151 @@ +# --------------------------------------------------------------- +# Programmer(s): Cody J. Balos @ LLNL +# Daniel R. Reynolds @ SMU +# --------------------------------------------------------------- +# SUNDIALS Copyright Start +# Copyright (c) 2002-2023, Lawrence Livermore National Security +# and Southern Methodist University. +# All rights reserved. +# +# See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-3-Clause +# SUNDIALS Copyright End +# --------------------------------------------------------------- + +if(BUILD_ARKODE AND BUILD_CVODE AND BUILD_IDA) + + if((RAJA_BACKENDS MATCHES "TARGET_OPENMP") OR (RAJA_BACKENDS MATCHES "OPENMP")) + set(OTHER_LIBS OpenMP::OpenMP_CXX) + endif() + + # ---------------------------------------------------------------------------- + # MPI only + # ---------------------------------------------------------------------------- + + add_executable(advection_reaction_3D_raja + advection_reaction_3D.cpp + arkode_driver.cpp + cvode_driver.cpp + ida_driver.cpp + rhs3D.hpp + ParallelGrid.hpp + check_retval.h + backends.hpp) + + # ensure the linker language is reset to CXX + set_target_properties(advection_reaction_3D_raja PROPERTIES LINKER_LANGUAGE CXX) + + target_include_directories(advection_reaction_3D_raja + PRIVATE + ${PROJECT_SOURCE_DIR}/utilities + ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(advection_reaction_3D_raja + PRIVATE + sundials_arkode + sundials_cvode + sundials_ida + sundials_nvecmpiplusx + sundials_nvecserial + RAJA + ${MPI_CXX_LIBRARIES} + ${OTHER_LIBS}) + + install(TARGETS advection_reaction_3D_raja + DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja") + + install(FILES README.md 
../scripts/compare_error.py ../scripts/compute_error.py ../scripts/pickle_solution_output.py + DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja") + + # ---------------------------------------------------------------------------- + # MPI + CUDA + # ---------------------------------------------------------------------------- + + if(BUILD_NVECTOR_CUDA) + + set_source_files_properties(advection_reaction_3D.cpp + PROPERTIES LANGUAGE CUDA) + set_source_files_properties(arkode_driver.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(cvode_driver.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(ida_driver.cpp PROPERTIES LANGUAGE CUDA) + + add_executable(advection_reaction_3D_raja_mpicuda + advection_reaction_3D.cpp + arkode_driver.cpp + cvode_driver.cpp + ida_driver.cpp + rhs3D.hpp + ParallelGrid.hpp + check_retval.h + backends.hpp) + + # ensure the linker language is reset to CXX + set_target_properties(advection_reaction_3D_raja_mpicuda + PROPERTIES LINKER_LANGUAGE CXX) + + target_include_directories(advection_reaction_3D_raja_mpicuda + PRIVATE + ${PROJECT_SOURCE_DIR}/utilities + ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(advection_reaction_3D_raja_mpicuda + PRIVATE + sundials_arkode + sundials_cvode + sundials_ida + sundials_nvecmpiplusx + sundials_nveccuda + RAJA + ${MPI_CXX_LIBRARIES} + ${OTHER_LIBS}) + + target_compile_definitions(advection_reaction_3D_raja_mpicuda PRIVATE USE_CUDA_NVEC) + + install(TARGETS advection_reaction_3D_raja_mpicuda + DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja") + + endif() + + # ---------------------------------------------------------------------------- + # MPI + HIP + # ---------------------------------------------------------------------------- + + if(BUILD_NVECTOR_HIP) + + add_executable(advection_reaction_3D_raja_mpihip + advection_reaction_3D.cpp + advection_reaction_3D.hpp + arkode_driver.cpp + cvode_driver.cpp + ida_driver.cpp + rhs3D.hpp + ParallelGrid.hpp + 
check_retval.h + backends.hpp) + + target_include_directories(advection_reaction_3D_raja_mpihip + PRIVATE + ${PROJECT_SOURCE_DIR}/utilities + ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(advection_reaction_3D_raja_mpihip + PRIVATE + sundials_arkode + sundials_cvode + sundials_ida + sundials_nvecmpiplusx + sundials_nvechip + RAJA + hip::device + ${MPI_CXX_LIBRARIES} + ${OTHER_LIBS}) + + target_compile_definitions(advection_reaction_3D_raja_mpihip PRIVATE USE_HIP_NVEC) + + install(TARGETS advection_reaction_3D_raja_mpihip + DESTINATION "${BENCHMARKS_INSTALL_PATH}/advection_reaction_3D/raja") + + endif() + +endif() diff --git a/benchmarks/advection_reaction_3D/ParallelGrid.hpp b/benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp similarity index 56% rename from benchmarks/advection_reaction_3D/ParallelGrid.hpp rename to benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp index abd6185810..1592a27806 100644 --- a/benchmarks/advection_reaction_3D/ParallelGrid.hpp +++ b/benchmarks/advection_reaction_3D/raja/ParallelGrid.hpp @@ -1,5 +1,6 @@ /* ----------------------------------------------------------------------------- * Programmer(s): Cody J. Balos @ LLNL + * Daniel R. Reynolds @ SMU * ----------------------------------------------------------------------------- * SUNDIALS Copyright Start * Copyright (c) 2002-2023, Lawrence Livermore National Security @@ -40,24 +41,26 @@ enum class StencilType UPWIND }; -template +template class ParallelGrid { public: // Constructor that creates a new ParallelGrid object. 
// [in] - the memory helper to use for allocating the MPI buffers // [in,out] comm - on input, the overal MPI communicator, on output, the cartesian communicator - // [in] a[] - an array of length NDIMS which defines the domain [a,b] - // [in] b[] - an array of length NDIMS which defines the domain [a,b] - // [in] npts[] - an array of length NDIMS which defines the number of mesh points in each dimension + // [in] a[] - an array of length 3 which defines the domain [a,b] + // [in] b[] - an array of length 3 which defines the domain [a,b] + // [in] npts[] - an array of length 3 which defines the number of mesh points in each dimension // [in] dof - the number of degrees of freedom in each dimension // [in] bc - the type of boundary conditions (see BoundaryType) // [in] st - the stencil to use (see StencilType) // [in] width - the stencil width; defaults to 1 // [in] npxyz - the number of processors in each dimension; defaults to 0 which means MPI will choose // [in] reorder - should MPI_Cart_create do process reordering to optimize or not; defaults to false (some MPI implementations ignore this) - ParallelGrid(SUNMemoryHelper memhelp, MPI_Comm* comm, const REAL a[], const REAL b[], const GLOBALINT npts[], int dof, - BoundaryType bc, StencilType st, int width = 1, const int npxyz[] = nullptr, bool reorder = false) + ParallelGrid(SUNMemoryHelper memhelp, MPI_Comm* comm, const REAL a[], const REAL b[], + const GLOBALINT npts[], int dof, BoundaryType bc, StencilType st, + const REAL c, int width = 1, const int npxyz[] = nullptr, + bool reorder = false) : nx(1), ny(1), nz(1), nxl(1), nyl(1), nzl(1), npx(1), npy(1), npz(1), @@ -66,35 +69,40 @@ class ParallelGrid bx(0.0), by(0.0), bz(0.0), dof(dof), dims{0,0,0}, coords{0,0,0}, bc(bc), st(st), width(width), + upwindRight(true), memhelp(memhelp) - { - static_assert((NDIMS >= 1 && NDIMS <= 3), "ParallelGrid NDIMS must be 1, 2 or 3"); - int retval, nprocs; - int periods[] = {0, 0, 0}; + { + assert(st == StencilType::UPWIND); + 
/* Set up MPI Cartesian communicator */ if (npxyz) { dims[0] = npxyz[0]; - if (NDIMS >= 2) dims[1] = npxyz[1]; - if (NDIMS == 3) dims[2] = npxyz[2]; + dims[1] = npxyz[1]; + dims[2] = npxyz[2]; } + int retval, nprocs; MPI_Comm_size(*comm, &nprocs); - retval = MPI_Dims_create(nprocs, NDIMS, dims); + retval = MPI_Dims_create(nprocs, 3, dims); assert(retval == MPI_SUCCESS); - periods[0] = bc == BoundaryType::PERIODIC; - periods[1] = bc == BoundaryType::PERIODIC; - periods[2] = bc == BoundaryType::PERIODIC; - retval = MPI_Cart_create(*comm, NDIMS, dims, periods, reorder, comm); + int periods[] = { bc == BoundaryType::PERIODIC, + bc == BoundaryType::PERIODIC, + bc == BoundaryType::PERIODIC }; + retval = MPI_Cart_create(*comm, 3, dims, periods, reorder, comm); assert(retval == MPI_SUCCESS); - retval = MPI_Cart_get(*comm, NDIMS, dims, periods, coords); + retval = MPI_Cart_get(*comm, 3, dims, periods, coords); assert(retval == MPI_SUCCESS); cart_comm = *comm; + /* Set upwinding direction */ + upwindRight = (c > 0.0); + + /* Set up information for the first spatial dimension */ npx = dims[0]; nx = npts[0]; ax = a[0]; @@ -103,251 +111,235 @@ class ParallelGrid int is = nx*(coords[0])/npx; int ie = nx*(coords[0]+1)/npx-1; nxl = ie-is+1; - neq = dof * nxl; - if (NDIMS >= 2) - { - npy = dims[1]; - ny = npts[1]; - ay = a[1]; - by = b[1]; - dy = (by-ay) / (REAL) ny; - int js = ny*(coords[1])/npy; - int je = ny*(coords[1]+1)/npy-1; - nyl = je-js+1; - - neq *= nyl; - } - - if (NDIMS == 3) - { - npz = dims[2]; - nz = npts[2]; - az = a[2]; - bz = b[2]; - dz = (bz-az) / (REAL) nz; - int ks = nz*(coords[2])/npz; - int ke = nz*(coords[2]+1)/npz-1; - nzl = ke-ks+1; - - neq *= nzl; - } - + /* Set up information for the second spatial dimension */ + npy = dims[1]; + ny = npts[1]; + ay = a[1]; + by = b[1]; + dy = (by-ay) / (REAL) ny; + int js = ny*(coords[1])/npy; + int je = ny*(coords[1]+1)/npy-1; + nyl = je-js+1; + neq *= nyl; + + /* Set up information for the third spatial dimension */ + 
npz = dims[2]; + nz = npts[2]; + az = a[2]; + bz = b[2]; + dz = (bz-az) / (REAL) nz; + int ks = nz*(coords[2])/npz; + int ke = nz*(coords[2]+1)/npz-1; + nzl = ke-ks+1; + neq *= nzl; + + /* Allocate buffers for nearest-neighbor exchange */ if (st == StencilType::UPWIND) AllocateBuffersUpwind(); } // TODO: - // - does not take advantage of upwind scheme to reduce communications and memory // - support non-periodic boundary conditions // For all faces where neighbors exist: determine neighbor process indices. // For all faces: allocate exchange buffers. void AllocateBuffersUpwind() { - int retval = 0; - int nbcoords[] = {0, 0, 0}; - SUNMemoryHelper_Alloc(memhelp, &Wrecv_, sizeof(REAL)*dof*width*nyl*nzl, - memoryType(), nullptr); - SUNMemoryHelper_Alloc(memhelp, &Wsend_, sizeof(REAL)*dof*width*nyl*nzl, - memoryType(), nullptr); + /* Allocate send/receive buffers and determine ID for communication West */ + if (upwindRight) + SUNMemoryHelper_Alloc(memhelp, &Wrecv_, sizeof(REAL)*dof*width*nyl*nzl, + memoryType(), nullptr); + else + SUNMemoryHelper_Alloc(memhelp, &Wsend_, sizeof(REAL)*dof*width*nyl*nzl, + memoryType(), nullptr); ipW = MPI_PROC_NULL; if ((coords[0] > 0) || (bc == BoundaryType::PERIODIC)) { - nbcoords[0] = coords[0]-1; - nbcoords[1] = coords[1]; - nbcoords[2] = coords[2]; - retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW); + int nbcoords[] = {coords[0]-1, coords[1], coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipW); assert(retval == MPI_SUCCESS); } - SUNMemoryHelper_Alloc(memhelp, &Erecv_, sizeof(REAL)*dof*width*nyl*nzl, - memoryType(), nullptr); - SUNMemoryHelper_Alloc(memhelp, &Esend_, sizeof(REAL)*dof*width*nyl*nzl, - memoryType(), nullptr); + /* Allocate send/receive buffers and determine ID for communication East */ + if (upwindRight) + SUNMemoryHelper_Alloc(memhelp, &Esend_, sizeof(REAL)*dof*width*nyl*nzl, + memoryType(), nullptr); + else + SUNMemoryHelper_Alloc(memhelp, &Erecv_, sizeof(REAL)*dof*width*nyl*nzl, + memoryType(), 
nullptr); ipE = MPI_PROC_NULL; if ((coords[0] < dims[0]-1) || (bc == BoundaryType::PERIODIC)) { - nbcoords[0] = coords[0]+1; - nbcoords[1] = coords[1]; - nbcoords[2] = coords[2]; - retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE); + int nbcoords[] = {coords[0]+1, coords[1], coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipE); assert(retval == MPI_SUCCESS); } - if (NDIMS >= 2) - { + /* Allocate send/receive buffers and determine ID for communication South */ + if (upwindRight) SUNMemoryHelper_Alloc(memhelp, &Srecv_, sizeof(REAL)*dof*width*nxl*nzl, memoryType(), nullptr); + else SUNMemoryHelper_Alloc(memhelp, &Ssend_, sizeof(REAL)*dof*width*nxl*nzl, memoryType(), nullptr); - ipS = MPI_PROC_NULL; - if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) { - nbcoords[0] = coords[0]; - nbcoords[1] = coords[1]-1; - nbcoords[2] = coords[2]; - retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS); - assert(retval == MPI_SUCCESS); - } + ipS = MPI_PROC_NULL; + if ((coords[1] > 0) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1]-1, coords[2]}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipS); + assert(retval == MPI_SUCCESS); + } - SUNMemoryHelper_Alloc(memhelp, &Nrecv_, sizeof(REAL)*dof*width*nxl*nzl, - memoryType(), nullptr); + /* Allocate send/receive buffers and determine ID for communication North */ + if (upwindRight) SUNMemoryHelper_Alloc(memhelp, &Nsend_, sizeof(REAL)*dof*width*nxl*nzl, memoryType(), nullptr); - ipN = MPI_PROC_NULL; - if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) { - nbcoords[0] = coords[0]; - nbcoords[1] = coords[1]+1; - nbcoords[2] = coords[2]; - retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN); - assert(retval == MPI_SUCCESS); - } + else + SUNMemoryHelper_Alloc(memhelp, &Nrecv_, sizeof(REAL)*dof*width*nxl*nzl, + memoryType(), nullptr); + ipN = MPI_PROC_NULL; + if ((coords[1] < dims[1]-1) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1]+1, coords[2]}; + int 
retval = MPI_Cart_rank(cart_comm, nbcoords, &ipN); + assert(retval == MPI_SUCCESS); } - if (NDIMS == 3) - { + /* Allocate send/receive buffers and determine ID for communication Back */ + if (upwindRight) SUNMemoryHelper_Alloc(memhelp, &Brecv_, sizeof(REAL)*dof*width*nxl*nyl, memoryType(), nullptr); + else SUNMemoryHelper_Alloc(memhelp, &Bsend_, sizeof(REAL)*dof*width*nxl*nyl, memoryType(), nullptr); - ipB = MPI_PROC_NULL; - if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) { - nbcoords[0] = coords[0]; - nbcoords[1] = coords[1]; - nbcoords[2] = coords[2]-1; - retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB); - assert(retval == MPI_SUCCESS); - } + ipB = MPI_PROC_NULL; + if ((coords[2] > 0) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1], coords[2]-1}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipB); + assert(retval == MPI_SUCCESS); + } - SUNMemoryHelper_Alloc(memhelp, &Frecv_, sizeof(REAL)*dof*width*nxl*nyl, - memoryType(), nullptr); + /* Allocate send/receive buffers and determine ID for communication Front */ + if (upwindRight) SUNMemoryHelper_Alloc(memhelp, &Fsend_, sizeof(REAL)*dof*width*nxl*nyl, memoryType(), nullptr); - ipF = MPI_PROC_NULL; - if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) { - nbcoords[0] = coords[0]; - nbcoords[1] = coords[1]; - nbcoords[2] = coords[2]+1; - retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF); - assert(retval == MPI_SUCCESS); - } + else + SUNMemoryHelper_Alloc(memhelp, &Frecv_, sizeof(REAL)*dof*width*nxl*nyl, + memoryType(), nullptr); + ipF = MPI_PROC_NULL; + if ((coords[2] < dims[2]-1) || (bc == BoundaryType::PERIODIC)) { + int nbcoords[] = {coords[0], coords[1], coords[2]+1}; + int retval = MPI_Cart_rank(cart_comm, nbcoords, &ipF); + assert(retval == MPI_SUCCESS); } } - // TODO: this could be optimized for upwind - int ExchangeStart(std::function fill) + // Initiate non-blocking neighbor communication + int ExchangeStart() { int retval = 0; + nreq = 0; // Initialize 
all requests in array for (int i=0; i<12; i++) req[i] = MPI_REQUEST_NULL; // Open an Irecv buffer for each neighbor - if (ipW != MPI_PROC_NULL) + if ((ipW != MPI_PROC_NULL) && (upwindRight)) { - retval = MPI_Irecv(getRecvBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, - 1, cart_comm, req); + retval = MPI_Irecv(getRecvBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, + 1, cart_comm, req+nreq); assert(retval == MPI_SUCCESS); + nreq++; } - if (ipE != MPI_PROC_NULL) + if ((ipE != MPI_PROC_NULL) && (!upwindRight)) { - retval = MPI_Irecv(getRecvBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, - 0, cart_comm, req+1); + retval = MPI_Irecv(getRecvBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, + 0, cart_comm, req+nreq); assert(retval == MPI_SUCCESS); + nreq++; } - if (NDIMS >= 2) + if ((ipS != MPI_PROC_NULL) && (upwindRight)) { - if (ipS != MPI_PROC_NULL) - { - retval = MPI_Irecv(getRecvBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, - 3, cart_comm, req+2); - assert(retval == MPI_SUCCESS); - } - - if (ipN != MPI_PROC_NULL) - { - retval = MPI_Irecv(getRecvBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, - 2, cart_comm, req+3); - assert(retval == MPI_SUCCESS); - } - } - - if (NDIMS >= 3) + retval = MPI_Irecv(getRecvBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, + 3, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipN != MPI_PROC_NULL) && (!upwindRight)) + { + retval = MPI_Irecv(getRecvBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, + 2, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipB != MPI_PROC_NULL) && (upwindRight)) + { + retval = MPI_Irecv(getRecvBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, + 5, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipF != MPI_PROC_NULL) && (!upwindRight)) { - if (ipB != MPI_PROC_NULL) - { - retval = MPI_Irecv(getRecvBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, - 5, cart_comm, req+4); - assert(retval == 
MPI_SUCCESS); - } - - if (ipF != MPI_PROC_NULL) - { - retval = MPI_Irecv(getRecvBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, - 4, cart_comm, req+5); - assert(retval == MPI_SUCCESS); - } - } - - // Call user lambda to fill the send buffers - fill(getSendBuffer("WEST"), - getSendBuffer("EAST"), - getSendBuffer("SOUTH"), - getSendBuffer("NORTH"), - getSendBuffer("BACK"), - getSendBuffer("FRONT")); + retval = MPI_Irecv(getRecvBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, + 4, cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } // Send data to neighbors - if (ipW != MPI_PROC_NULL) + if ((ipW != MPI_PROC_NULL) && (!upwindRight)) { - retval = MPI_Isend(getSendBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0, - cart_comm, req+6); + retval = MPI_Isend(getSendBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipW, 0, + cart_comm, req+nreq); assert(retval == MPI_SUCCESS); + nreq++; } - if (ipE != MPI_PROC_NULL) + if ((ipE != MPI_PROC_NULL) && (upwindRight)) { - retval = MPI_Isend(getSendBuffer("WEST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1, - cart_comm, req+7); + retval = MPI_Isend(getSendBuffer("EAST"), dof*nyl*nzl, MPI_SUNREALTYPE, ipE, 1, + cart_comm, req+nreq); assert(retval == MPI_SUCCESS); + nreq++; } - if (NDIMS >= 2) + if ((ipS != MPI_PROC_NULL) && (!upwindRight)) { - if (ipS != MPI_PROC_NULL) - { - retval = MPI_Isend(getSendBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2, - cart_comm, req+8); - assert(retval == MPI_SUCCESS); - } - - if (ipN != MPI_PROC_NULL) - { - retval = MPI_Isend(getSendBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3, - cart_comm, req+9); - assert(retval == MPI_SUCCESS); - } - } - - if (NDIMS == 3) + retval = MPI_Isend(getSendBuffer("SOUTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipS, 2, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipN != MPI_PROC_NULL) && (upwindRight)) { - if (ipB != MPI_PROC_NULL) - { - retval = MPI_Isend(getSendBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 
4, - cart_comm, req+10); - assert(retval == MPI_SUCCESS); - } + retval = MPI_Isend(getSendBuffer("NORTH"), dof*nxl*nzl, MPI_SUNREALTYPE, ipN, 3, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } - if (ipF != MPI_PROC_NULL) - { - retval = MPI_Isend(getSendBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5, - cart_comm, req+11); - assert(retval == MPI_SUCCESS); - } + if ((ipB != MPI_PROC_NULL) && (!upwindRight)) + { + retval = MPI_Isend(getSendBuffer("BACK"), dof*nxl*nyl, MPI_SUNREALTYPE, ipB, 4, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; + } + + if ((ipF != MPI_PROC_NULL) && (upwindRight)) + { + retval = MPI_Isend(getSendBuffer("FRONT"), dof*nxl*nyl, MPI_SUNREALTYPE, ipF, 5, + cart_comm, req+nreq); + assert(retval == MPI_SUCCESS); + nreq++; } return retval; @@ -359,8 +351,12 @@ class ParallelGrid MPI_Status stat[12]; int retval; + // return automatically with success if there are no outstanding requests + if (nreq == 0) + return(0); + // Wait for messages to finish send/receive - retval = MPI_Waitall(12, req, stat); + retval = MPI_Waitall(nreq, req, stat); assert(retval == MPI_SUCCESS); return retval; @@ -370,12 +366,16 @@ class ParallelGrid void PrintInfo() { printf("ParallelGrid Info:\n"); - printf(" dimensions = %d\n", NDIMS); + printf(" dimensions = %d\n", 3); printf(" processors = {%d, %d, %d}\n", npx, npy, npz); printf(" domain = {[%g,%g], [%g,%g], [%g,%g]}\n", ax, bx, ay, by, az, bz); printf(" global npts = {%li, %li, %li}\n", (long int) nx, (long int) ny, (long int) nz); printf(" local npts = {%d, %d, %d}\n", nxl, nyl, nzl); printf(" mesh spacing = {%g, %g, %g}\n", dx, dy, dz); + if (upwindRight) + printf(" upwind dir = right\n"); + else + printf(" upwind dir = left\n"); } // Saves the mesh to a file. 
@@ -407,16 +407,12 @@ class ParallelGrid GLOBALINT npts() const { - if (NDIMS == 1) return nx; - if (NDIMS == 2) return nx*ny; - if (NDIMS == 3) return nx*ny*nz; + return nx*ny*nz; } GLOBALINT nptsl() const { - if (NDIMS == 1) return nxl; - if (NDIMS == 2) return nxl*nyl; - if (NDIMS == 3) return nxl*nyl*nzl; + return nxl*nyl*nzl; } GLOBALINT neql() const @@ -452,6 +448,7 @@ class ParallelGrid } else { + assert(direction == "ILLEGAL"); return nullptr; } } @@ -484,24 +481,28 @@ class ParallelGrid } else { + assert(direction == "ILLEGAL"); return nullptr; } } ~ParallelGrid() { - SUNMemoryHelper_Dealloc(memhelp, Esend_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Wsend_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Nsend_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Ssend_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Fsend_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Bsend_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Erecv_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Wrecv_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Nrecv_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Srecv_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Frecv_, nullptr); - SUNMemoryHelper_Dealloc(memhelp, Brecv_, nullptr); + if (upwindRight) { + SUNMemoryHelper_Dealloc(memhelp, Esend_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Nsend_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Fsend_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Wrecv_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Srecv_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Brecv_, nullptr); + } else { + SUNMemoryHelper_Dealloc(memhelp, Wsend_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Ssend_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Bsend_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Erecv_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Nrecv_, nullptr); + SUNMemoryHelper_Dealloc(memhelp, Frecv_, nullptr); + } } GLOBALINT nx, ny, nz; /* number of intervals globally */ @@ -516,6 +517,7 @@ class ParallelGrid int ipW, 
ipE; /* MPI ranks for neighbor procs */ int ipS, ipN; int ipB, ipF; + bool upwindRight; /* Upwind dir: true/false == R/L */ int dims[3]; int coords[3]; @@ -524,6 +526,7 @@ class ParallelGrid private: MPI_Comm cart_comm; /* MPI cartesian communicator */ MPI_Request req[12]; + int nreq; BoundaryType bc; StencilType st; diff --git a/benchmarks/advection_reaction_3D/README.md b/benchmarks/advection_reaction_3D/raja/README.md similarity index 78% rename from benchmarks/advection_reaction_3D/README.md rename to benchmarks/advection_reaction_3D/raja/README.md index ab9974b660..33c82db725 100644 --- a/benchmarks/advection_reaction_3D/README.md +++ b/benchmarks/advection_reaction_3D/raja/README.md @@ -8,27 +8,31 @@ RAJA performance portability layer with serial, CUDA, or HIP backends. This code simulates the advection and reaction of three chemical species where the reaction mechanism is a variation of the Brusselator problem from chemical kinetics. The PDE system is given by +```math +\begin{align} + u_t &= -c \nabla u + A - (w+1) u + v u^2 \\ + v_t &= -c \nabla v + w u - v u^2 \\ + w_t &= -c \nabla w + (B - w) / \epsilon - w u +\end{align} ``` - u_t = -c grad(u) + A - (w+1) * u + v * u^2 - v_t = -c grad(v) + w * u - v * u^2 - w_t = -c grad(w) + (B - w) / epsilon - w * u -``` -where `u`, `v`, and `w` are chemical concentrations, `c` is the advection speed, -`A` and `B` are the concentrations of chemical species that remain constant over -space and time, and `epsilon` is a parameter that varies the stiffness of the -system. The problem is solved on the domain `(x,y,z) = X` in `[0, X_max]^3`, -for times `t` in `[0,t_f]`. The initial condition is -``` - u(0,X) = A + p(X) - v(0,X) = B / A + p(X) - w(0,X) = 3.0 + p(X) +where $u$, $v$, and $w$ are chemical concentrations, $c$ is the advection speed, +$A$ and $B$ are the concentrations of chemical species that remain constant over +space and time, and $\epsilon$ is a parameter that varies the stiffness of the +system. 
The problem is solved on the domain $(x,y,z) = X$ in $[0, X_{\text{max}}]^3$, +for times $t$ in $[0,t_f]$. The initial condition is +```math +\begin{align} + u(0,X) &= A + p(X) \\ + v(0,X) &= B / A + p(X) \\ + w(0,X) &= 3.0 + p(X) +\end{align} ``` where the perturbation function is +```math + p(X) = \alpha \exp\left( -\frac{(X-\mu)^T \sigma^{-1} (X-\mu)}{2 \sqrt{|\sigma| 8 \pi^3}} \right) ``` - p(X) = alpha * e^( -((X-mu)^T sigma^{-1} (X-mu)) / (2*sqrt(|sigma| 8 pi^3)) ) -``` -with `alpha = 0.1`, `mu = 0.5 X_max`, and `sigma` is a diagonal matrix with -entries `0.25 X_max`. +with $\alpha = 0.1$, $\mu = 0.5 X_{\text{max}}$, and $\sigma$ is a diagonal +matrix with entries $0.25 X_{\text{max}}$. Spatial derivatives are discretized with first-order upwind finite differences on a uniform spatial grid. The system can be evolved in time using explicit, @@ -64,7 +68,7 @@ listed below. | `--method ` | Integrator to use: `ERK`, `ARK-DIRK`, `ARK-IMEX`, `CV-BDF`, `CV-ADAMS`, `IDA` | `ARK-DIRK` | | `--nls ` | Nonlinear Solver Method: `newton`, `tl-newton`, `fixedpoint`, `none` | `newton` | | `--fpaccel ` | Number of fixed point acceleration vectors | 3 | -| `--nopre` | Disable preconditioning | False | +| `--nopre` | Disable preconditioning | False | | `--fused` | Enabled fused operations | Off | | `--tf ` | Final integration time `t_f` | 10.0 | | `--rtol ` | Relative tolerance | 1.0e-6 | @@ -79,11 +83,11 @@ GPUs. See the installation guide for more details on configuring, building, and installing SUNDIALS. 
Based on the configuration the following executables will be built and installed -in the `/bin/benchmarks/advection_reaction_3D` directory: +in the `/advection_reaction_3D/raja` directory: -* `advection_reaction_3D` -- MPI parallelism -* `advection_reaction_3D_mpicuda` -- MPI + CUDA parallelism -* `advection_reaction_3D_mpihip` -- MPI + HIP parallelism +* `advection_reaction_3D_raja` -- MPI parallelism +* `advection_reaction_3D_raja_mpicuda` -- MPI + CUDA parallelism +* `advection_reaction_3D_raja_mpihip` -- MPI + HIP parallelism On Summit, with the default environment ``` @@ -93,7 +97,7 @@ On Summit, with the default environment ``` an example `jsrun` command is ``` -jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_mpicuda +jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_raja_mpicuda ``` On Lassen, with the environment @@ -104,5 +108,5 @@ On Lassen, with the environment ``` an example `jsrun` command is ``` -jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_mpicuda +jsrun -n 2 -a 1 -c 1 -g 1 ./advection_reaction_3D_raja_mpicuda ``` diff --git a/benchmarks/advection_reaction_3D/advection_reaction_3D.cpp b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp similarity index 71% rename from benchmarks/advection_reaction_3D/advection_reaction_3D.cpp rename to benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp index dc169c5fa1..088e4536a0 100644 --- a/benchmarks/advection_reaction_3D/advection_reaction_3D.cpp +++ b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.cpp @@ -1,5 +1,6 @@ /* ----------------------------------------------------------------------------- * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL + * Daniel R. 
Reynolds @ SMU * ----------------------------------------------------------------------------- * SUNDIALS Copyright Start * Copyright (c) 2002-2023, Lawrence Livermore National Security @@ -60,9 +61,13 @@ #include "advection_reaction_3D.hpp" +#define STENCIL_WIDTH 1 + + /* Main Program */ int main(int argc, char *argv[]) { + SUNContext ctx; /* Initialize MPI */ @@ -87,7 +92,6 @@ int main(int argc, char *argv[]) UserData udata(ctx); /* user data */ UserOptions uopt; /* user options */ int retval; /* reusable error-checking flag */ - char fname[MXSTR]; SUNDIALS_CXX_MARK_FUNCTION(udata.prof); @@ -113,6 +117,7 @@ int main(int argc, char *argv[]) /* Output spatial mesh to disk (add extra point for periodic BC) */ if (udata.myid == 0 && uopt.nout > 0) { + char fname[MXSTR]; snprintf(fname, MXSTR, "%s/mesh.txt", uopt.outputdir); udata.grid->MeshToFile(fname); } @@ -124,7 +129,6 @@ int main(int argc, char *argv[]) else if (uopt.method == "CV-BDF") retval = EvolveProblemBDF(y, &udata, &uopt); else if (uopt.method == "CV-ADAMS") retval = EvolveProblemAdams(y, &udata, &uopt); else if (uopt.method == "IDA") retval = EvolveDAEProblem(y, &udata, &uopt); - if (check_retval(&retval, "Evolve", 1, udata.myid)) MPI_Abort(comm, 1); /* Clean up */ @@ -142,15 +146,6 @@ int main(int argc, char *argv[]) /* Destructor for problem data */ UserData::~UserData() { - /* free solution masks */ - N_VDestroy(N_VGetLocalVector_MPIPlusX(umask)); - N_VDestroy(umask); - N_VDestroy(vmask); - N_VDestroy(wmask); - - /* free the parallel grid */ - delete grid; - /* close output streams */ if (uopt->nout > 0) { @@ -159,6 +154,24 @@ UserData::~UserData() if (WFID) fclose(WFID); if (TFID && myid == 0) fclose(TFID); } + + /* free solution masks */ + if (umask != nullptr) { + N_VDestroy(N_VGetLocalVector_MPIPlusX(umask)); + N_VDestroy(umask); + umask = nullptr; + } + if (vmask != nullptr) { + N_VDestroy(vmask); + vmask = nullptr; + } + if (wmask != nullptr) { + N_VDestroy(wmask); + wmask = nullptr; + } + + /* 
free the parallel grid */ + delete grid; } @@ -166,175 +179,98 @@ UserData::~UserData() * Communication functions * --------------------------------------------------------------*/ -/* Exchanges the boundary conditions only, */ -int ExchangeBCOnly(N_Vector y, UserData* udata) +/* Fills send buffers before exchanging neighbor information */ +int FillSendBuffers(N_Vector y, UserData* udata) { - int ierr; - MPI_Status stat; - MPI_Request reqR, reqS; /* shortcuts */ - int nvar = udata->grid->dof; - int myid = udata->myid; - int first = 0; - int last = udata->nprocs - 1; + const realtype c = udata->c; + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; - /* extract the data */ - realtype* Ydata = GetVecData(y); - realtype* Wsend = udata->grid->getSendBuffer("WEST"); + /* Create a 4D view of the vector */ + RAJA::View > Yview(GetVecData(y), + nxl, nyl, nzl, dof); - /* open the East Irecv buffer */ - if (myid == last) - { - ierr = MPI_Irecv(udata->grid->getRecvBuffer("EAST"), nvar, MPI_SUNREALTYPE, first, - MPI_ANY_TAG, udata->comm, &reqR); - } - - /* send first mesh node to the last processor */ - if (myid == first) - { - RAJA::forall< EXEC_POLICY >( RAJA::RangeSegment(0, nvar), - [=] DEVICE_FUNC (int var) { - Wsend[IDX(nvar, 0, var)] = Ydata[IDX(nvar, 0, var)]; - }); - ierr = MPI_Isend(Wsend, nvar, MPI_SUNREALTYPE, - last, 0, udata->comm, &reqS); - } - - if (myid == last) - { - /* wait for exchange to finish */ - ierr = MPI_Wait(&reqR, &stat); - if (ierr != MPI_SUCCESS) - { - fprintf(stderr, "\nERROR: error in MPI_Wait = %d\n", ierr); - return -1; - } - } - - if (myid == first) + if (c > 0.0) { - /* wait for exchange to finish */ - ierr = MPI_Wait(&reqS, &stat); - if (ierr != MPI_SUCCESS) - { - fprintf(stderr, "\nERROR: error in MPI_Wait = %d\n", ierr); - return -1; - } - } - - return(0); -} + /* Flow moving in the positive directions uses backward difference. 
*/ -/* Starts the exchange of the neighbor information */ -int ExchangeAllStart(N_Vector y, UserData* udata) -{ - SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange"); + /* Fill 3D views of send buffers on device */ + RAJA::View > + Esend(udata->grid->getSendBuffer("EAST"), nyl, nzl, dof); + RAJA::View > + Nsend(udata->grid->getSendBuffer("NORTH"), nxl, nzl, dof); + RAJA::View > + Fsend(udata->grid->getSendBuffer("FRONT"), nxl, nyl, dof); + + auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(east_face, + [=] DEVICE_FUNC (int j, int k, int l) { + Esend(j,k,l) = Yview(nxl-1,j,k,l); + }); - /* shortcuts */ - realtype c = udata->c; + auto north_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(north_face, + [=] DEVICE_FUNC (int i, int k, int l) { + Nsend(i,k,l) = Yview(i,nyl-1,k,l); + }); - /* extract the data */ - RAJA::View > Yview(GetVecData(y), - udata->grid->nxl, - udata->grid->nyl, - udata->grid->nzl, - udata->grid->dof); + auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(front_face, + [=] DEVICE_FUNC (int i, int j, int l) { + Fsend(i,j,l) = Yview(i,j,nzl-1,l); + }); - if (c > 0.0) - { - /* Flow moving in the positive directions uses backward difference. 
*/ - udata->grid->ExchangeStart( - [=] (realtype*, realtype* Esend, realtype*, realtype* Nsend, realtype* Bsend, realtype*) { - int nxl = udata->grid->nxl; - int nyl = udata->grid->nyl; - int nzl = udata->grid->nzl; - int dof = udata->grid->dof; - - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, std::max(1,nxl-1)), - RAJA::RangeSegment(0, std::max(1,nyl-1)), - RAJA::RangeSegment(0, std::max(1,nzl-1))); - - RAJA::View > - Eview(Esend, nyl, nzl, dof); - RAJA::View > - Nview(Nsend, nxl, nzl, dof); - RAJA::View > - Bview(Bsend, nxl, nyl, dof); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - - if (nxl > 1) - { - Eview(j,k,0) = Yview(nxl-1,j,k,0); - Eview(j,k,1) = Yview(nxl-1,j,k,1); - Eview(j,k,2) = Yview(nxl-1,j,k,2); - } - - if (nyl > 1) - { - Nview(i,k,0) = Yview(i,nyl-1,k,0); - Nview(i,k,1) = Yview(i,nyl-1,k,1); - Nview(i,k,2) = Yview(i,nyl-1,k,2); - } - - if (nzl > 1) - { - Bview(i,j,0) = Yview(i,j,nzl-1,0); - Bview(i,j,1) = Yview(i,j,nzl-1,1); - Bview(i,j,2) = Yview(i,j,nzl-1,2); - } - - }); - }); } else if (c < 0.0) { + /* Flow moving in the negative directions uses forward difference. 
*/ - udata->grid->ExchangeStart( - [=] (realtype* Wsend, realtype*, realtype*Ssend, realtype*, realtype*, realtype* Fsend) { - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl-1), - RAJA::RangeSegment(0, udata->grid->nyl-1), - RAJA::RangeSegment(0, udata->grid->nzl-1)); - - RAJA::View > - Wview(Wsend, udata->grid->nyl, udata->grid->nzl, udata->grid->dof); - RAJA::View > - Sview(Ssend, udata->grid->nxl, udata->grid->nzl, udata->grid->dof); - RAJA::View > - Fview(Fsend, udata->grid->nxl, udata->grid->nyl, udata->grid->dof); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - Wview(j,k,0) = Yview(0,j,k,0); - Wview(j,k,1) = Yview(0,j,k,1); - Wview(j,k,2) = Yview(0,j,k,2); - - Sview(i,k,0) = Yview(i,0,k,0); - Sview(i,k,1) = Yview(i,0,k,1); - Sview(i,k,2) = Yview(i,0,k,2); - - Fview(i,j,0) = Yview(i,j,0,0); - Fview(i,j,1) = Yview(i,j,0,1); - Fview(i,j,2) = Yview(i,j,0,2); - }); - }); - } + /* Fill 3D views of send buffers on device */ + RAJA::View > + Wsend(udata->grid->getSendBuffer("WEST"), nyl, nzl, dof); + RAJA::View > + Ssend(udata->grid->getSendBuffer("SOUTH"), nxl, nzl, dof); + RAJA::View > + Bsend(udata->grid->getSendBuffer("BACK"), nxl, nyl, dof); + + auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(west_face, + [=] DEVICE_FUNC (int j, int k, int l) { + Wsend(j,k,l) = Yview(0,j,k,l); + }); - SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange"); - return(0); -} + auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(south_face, + [=] DEVICE_FUNC (int i, int k, int l) { + Ssend(i,k,l) = Yview(i,0,k,l); + }); + auto back_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(back_face, + [=] DEVICE_FUNC (int i, int j, int l) { + Bsend(i,j,l) = Yview(i,j,0,l); + }); + + } -/* Completes the 
exchange of the neighbor information */ -int ExchangeAllEnd(UserData* udata) -{ - SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange"); - udata->grid->ExchangeEnd(); - SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange"); return(0); } @@ -494,17 +430,20 @@ int ComponentMask(N_Vector mask, int component, const UserData* udata) N_VConst(0.0, mask); - RAJA::View > mask_view(GetVecData(mask), - udata->grid->nxl, - udata->grid->nyl, - udata->grid->nzl, - udata->grid->dof); + /* Create 4D view of mask data */ + RAJA::View > mask_view(GetVecData(mask), + udata->grid->nxl, + udata->grid->nyl, + udata->grid->nzl, + udata->grid->dof); + /* Fill mask data */ auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl), RAJA::RangeSegment(0, udata->grid->nyl), RAJA::RangeSegment(0, udata->grid->nzl)); RAJA::kernel(range, - [=] DEVICE_FUNC (int xi, int yi, int zi) { - mask_view(xi,yi,zi,component) = 1.0; + [=] DEVICE_FUNC (int i, int j, int k) + { + mask_view(i,j,k,component) = 1.0; }); return 0; @@ -515,14 +454,9 @@ int ComponentMask(N_Vector mask, int component, const UserData* udata) int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, SUNMemoryHelper memhelper, SUNContext ctx) { - constexpr int STENCIL_WIDTH = 1; SUNDIALS_CXX_MARK_FUNCTION(udata->prof); - /* Local variables */ - int retval = 0; - char fname[MXSTR]; - /* MPI variables */ udata->comm = MPI_COMM_WORLD; MPI_Comm_rank(udata->comm, &udata->myid); @@ -567,16 +501,16 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, uopt->outputdir = (char *) "."; /* output directory */ /* Parse CLI args and set udata/uopt appropriately */ - retval = ParseArgs(argc, argv, udata, uopt); + int retval = ParseArgs(argc, argv, udata, uopt); if (check_retval((void*)&retval, "ParseArgs", 1, udata->myid)) return -1; /* Setup the parallel decomposition */ const sunindextype npts[] = {uopt->npts, uopt->npts, uopt->npts}; const realtype amax[] = {0.0, 0.0, 0.0}; const realtype 
bmax[] = {udata->xmax, udata->xmax, udata->xmax}; - udata->grid = new ParallelGrid(memhelper, - &udata->comm, amax, bmax, npts, 3, BoundaryType::PERIODIC, StencilType::UPWIND, STENCIL_WIDTH, uopt->npxyz - ); + udata->grid = new ParallelGrid(memhelper, &udata->comm, + amax, bmax, npts, 3, BoundaryType::PERIODIC, StencilType::UPWIND, udata->c, + STENCIL_WIDTH, uopt->npxyz); /* Create the solution masks */ udata->umask = N_VMake_MPIPlusX(udata->comm, LocalNvector(udata->grid->neq, ctx), ctx); @@ -589,6 +523,7 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, /* Open output files for results */ if (uopt->save) { + char fname[MXSTR]; if (udata->myid == 0) { sprintf(fname, "%s/t.%06d.txt", uopt->outputdir, udata->myid); @@ -609,7 +544,7 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, if (udata->myid == 0) { printf("\n\t\tAdvection-Reaction Test Problem\n\n"); - printf("Using the %s NVECTOR\n", NVECTOR_ID_STRING); + printf("Using the MPI+%s NVECTOR\n", NVECTOR_ID_STRING); printf("Number of Processors = %li\n", (long int) udata->nprocs); udata->grid->PrintInfo(); printf("Problem Parameters:\n"); @@ -632,7 +567,6 @@ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, printf("Output directory: %s\n", uopt->outputdir); } - /* return success */ return(0); } @@ -644,8 +578,8 @@ void Gaussian3D(realtype& x, realtype& y, realtype& z, realtype xmax) { /* Gaussian distribution defaults */ const realtype alpha = 0.1; - const realtype mu[3] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) }; - const realtype sigma[3] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma) + const realtype mu[] = { xmax/RCONST(2.0), xmax/RCONST(2.0), xmax/RCONST(2.0) }; + const realtype sigma[] = { xmax/RCONST(4.0), xmax/RCONST(4.0), xmax/RCONST(4.0) }; // Sigma = diag(sigma) /* denominator = 2*sqrt(|Sigma|*(2pi)^3) */ const realtype denom = 2.0 * 
sqrt((sigma[0]*sigma[1]*sigma[2])*pow(2*M_PI,3)); @@ -664,6 +598,7 @@ int SetIC(N_Vector y, UserData* udata) const int nxl = udata->grid->nxl; const int nyl = udata->grid->nyl; const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; const realtype dx = udata->grid->dx; const realtype dy = udata->grid->dy; const realtype dz = udata->grid->dz; @@ -683,22 +618,25 @@ int SetIC(N_Vector y, UserData* udata) const realtype vs = k2 * k4 * B / (k1 * k3 * A); const realtype ws = 3.0; + /* Create 4D view of y */ + RAJA::View > yview(GetVecData(y), + nxl, nyl, nzl, dof); + /* Gaussian perturbation of the steady state solution */ - RAJA::View > yview(GetVecData(y), nxl, nyl, nzl, - udata->grid->dof); auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), RAJA::RangeSegment(0, nyl), RAJA::RangeSegment(0, nzl)); RAJA::kernel(range, - [=] DEVICE_FUNC (int xi, int yi, int zi) { - realtype x = (xcrd * nxl + xi) * dx; - realtype y = (ycrd * nyl + yi) * dy; - realtype z = (zcrd * nzl + zi) * dz; + [=] DEVICE_FUNC (int i, int j, int k) + { + realtype x = (xcrd * nxl + i) * dx; + realtype y = (ycrd * nyl + j) * dy; + realtype z = (zcrd * nzl + k) * dz; Gaussian3D(x,y,z,xmax); const realtype p = x + y + z; - yview(xi,yi,zi,0) = us + p; - yview(xi,yi,zi,1) = vs + p; - yview(xi,yi,zi,2) = ws + p; + yview(i,j,k,0) = us + p; + yview(i,j,k,1) = vs + p; + yview(i,j,k,2) = ws + p; }); /* Return success */ @@ -710,23 +648,17 @@ int SetIC(N_Vector y, UserData* udata) int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt) { SUNDIALS_CXX_MARK_FUNCTION(udata->prof); - - realtype u, v, w, N; - realtype* ydata = NULL; - - /* get vector data array */ - ydata = N_VGetArrayPointer(y); - if (check_retval((void *) ydata, "N_VGetArrayPointer", 0, udata->myid)) return -1; + /* Copy solution data to host mirror view */ CopyVecFromDevice(N_VGetLocalVector_MPIPlusX(y)); /* output current solution norm to screen */ - N = (realtype) udata->grid->npts(); - u = N_VWL2Norm(y, 
udata->umask); + realtype N = (realtype) udata->grid->npts(); + realtype u = N_VWL2Norm(y, udata->umask); u = sqrt(u*u/N); - v = N_VWL2Norm(y, udata->vmask); + realtype v = N_VWL2Norm(y, udata->vmask); v = sqrt(v*v/N); - w = N_VWL2Norm(y, udata->wmask); + realtype w = N_VWL2Norm(y, udata->wmask); w = sqrt(w*w/N); if (udata->myid == 0) { printf(" %10.6f %10.6f %10.6f %10.6f\n", t, u, v, w); @@ -736,32 +668,38 @@ int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt) if (uopt->save) { /* output the times to disk */ - if (udata->myid == 0 && udata->TFID) + if (udata->myid == 0 && udata->TFID) { fprintf(udata->TFID," %.16e\n", t); + std::fflush(udata->TFID); + } + + /* create 4D view of host data */ + realtype* ydata = NULL; + ydata = N_VGetArrayPointer(y); + if (check_retval((void *) ydata, "N_VGetArrayPointer", 0, udata->myid)) return -1; + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + RAJA::View > Yview(ydata, nxl, nyl, nzl, dof); /* output results to disk */ - RAJA::View > Yview(ydata, - udata->grid->nxl, - udata->grid->nyl, - udata->grid->nzl, - udata->grid->dof); - - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl), - RAJA::RangeSegment(0, udata->grid->nyl), - RAJA::RangeSegment(0, udata->grid->nzl)); - - RAJA::kernel(range, - [=] (int i, int j, int k) { - fprintf(udata->UFID," %.16e", Yview(i,j,k,0)); - fprintf(udata->VFID," %.16e", Yview(i,j,k,1)); - fprintf(udata->WFID," %.16e", Yview(i,j,k,2)); - }); + for (int i = 0; i < nxl; i++) + for (int j = 0; j < nyl; j++) + for (int k = 0; k < nzl; k++) { + fprintf(udata->UFID," %.16e", Yview(i,j,k,0)); + fprintf(udata->VFID," %.16e", Yview(i,j,k,1)); + fprintf(udata->WFID," %.16e", Yview(i,j,k,2)); + } fprintf(udata->UFID,"\n"); fprintf(udata->VFID,"\n"); fprintf(udata->WFID,"\n"); + std::fflush(udata->UFID); + std::fflush(udata->VFID); + std::fflush(udata->WFID); } - + 
return(0); } @@ -799,4 +737,3 @@ void InputError(char *name) MPI_Barrier(MPI_COMM_WORLD); } - diff --git a/benchmarks/advection_reaction_3D/advection_reaction_3D.hpp b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp similarity index 91% rename from benchmarks/advection_reaction_3D/advection_reaction_3D.hpp rename to benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp index 4396e69eb5..e4227d62c7 100644 --- a/benchmarks/advection_reaction_3D/advection_reaction_3D.hpp +++ b/benchmarks/advection_reaction_3D/raja/advection_reaction_3D.hpp @@ -1,5 +1,6 @@ /* ----------------------------------------------------------------------------- * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL + * Daniel R. Reynolds @ SMU * ----------------------------------------------------------------------------- * SUNDIALS Copyright Start * Copyright (c) 2002-2023, Lawrence Livermore National Security @@ -34,19 +35,9 @@ using sundials_tools::BoundaryType; using sundials_tools::StencilType; using std::string; -/* Number of dimensions */ -constexpr int NDIMS = 3; - /* Maximum size of output directory string */ constexpr int MXSTR = 2048; -/* Accessor macro: - n = number of state variables - i = mesh node index - c = component */ -#define IDX(n,i,c) ((n)*(i)+(c)) - - /* * Data structure for problem options */ @@ -113,7 +104,7 @@ struct UserData realtype c; /* advection coefficient */ /* parallel mesh */ - ParallelGrid* grid; + ParallelGrid* grid; /* count of implicit function evals by the task local nonlinear solver */ long int nnlfi; @@ -122,7 +113,10 @@ struct UserData UserOptions* uopt; /* constructor that takes the context */ - UserData(SUNContext ctx) : ctx(ctx) { + UserData(SUNContext ctx) + : ctx(ctx), umask(nullptr), vmask(nullptr), wmask(nullptr), uopt(nullptr), + TFID(nullptr), UFID(nullptr), VFID(nullptr), WFID(nullptr) + { SUNContext_GetProfiler(ctx, &prof); } @@ -161,15 +155,14 @@ extern int EvolveDAEProblem(N_Vector y, UserData* udata, 
UserOptions* uopt); /* function to set initial condition */ int SetIC(N_Vector y, UserData* udata); -/* functions to exchange neighbor data */ -int ExchangeBCOnly(N_Vector y, UserData* udata); -int ExchangeAllStart(N_Vector y, UserData* udata); -int ExchangeAllEnd(UserData* udata); +/* function to fill neighbor data */ +int FillSendBuffers(N_Vector y, UserData* udata); /* functions for processing command line args */ int SetupProblem(int argc, char *argv[], UserData* udata, UserOptions* uopt, SUNMemoryHelper memhelper, SUNContext ctx); void InputError(char *name); +int ComponentMask(N_Vector mask, const int component, const UserData* udata); /* function to write solution to disk */ int WriteOutput(realtype t, N_Vector y, UserData* udata, UserOptions* uopt); diff --git a/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp b/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp new file mode 100644 index 0000000000..e2cf1451e3 --- /dev/null +++ b/benchmarks/advection_reaction_3D/raja/arkode_driver.cpp @@ -0,0 +1,782 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * ---------------------------------------------------------------------------*/ + +#include "arkode/arkode_arkstep.h" +#include "arkode/arkode_erkstep.h" +#include "sunlinsol/sunlinsol_spgmr.h" +#include "sunnonlinsol/sunnonlinsol_newton.h" +#include "sunnonlinsol/sunnonlinsol_fixedpoint.h" +#include "advection_reaction_3D.hpp" +#include "rhs3D.hpp" + +/* + * Definitions for a custom task local SUNNonlinearSolver + */ + +typedef struct +{ + int myid; + int nprocs; + long int ncnf; + MPI_Comm comm; + SUNNonlinearSolver local_nls; +} *TaskLocalNewton_Content; + +/* Content accessor macors */ +#define GET_NLS_CONTENT(NLS) ( (TaskLocalNewton_Content)(NLS->content) ) +#define LOCAL_NLS(NLS) ( GET_NLS_CONTENT(NLS)->local_nls ) + +/* SUNNonlinearSolver constructor */ +SUNNonlinearSolver TaskLocalNewton(SUNContext ctx, N_Vector y, FILE* DFID); + + +/* -------------------------------------------------------------- + * Evolve functions + * --------------------------------------------------------------*/ + +/* Setup ARKODE and evolve problem in time with IMEX method */ +int EvolveProblemDIRK(N_Vector y, UserData* udata, UserOptions* uopt) +{ + void* arkode_mem = NULL; /* empty ARKODE memory structure */ + SUNNonlinearSolver NLS = NULL; /* empty nonlinear solver structure */ + SUNLinearSolver LS = NULL; /* empty linear solver structure */ + + realtype t, dtout, tout; /* current/output time data */ + int retval; /* reusable error-checking flag */ + int iout; /* output counter */ + long int nst, nst_a, netf; /* step stats */ + long int nfe, nfi; /* RHS stats */ + long int nni, ncnf; /* nonlinear solver stats */ + long int nli, npsol; /* linear solver stats */ + FILE* DFID = NULL; /* diagnostics output file */ + char fname[MXSTR]; + + /* Additively split methods should not add the advection and reaction terms */ + udata->add_reactions = true; + + /* Create the ARK timestepper module */ + arkode_mem = 
ARKStepCreate(NULL, AdvectionReaction, uopt->t0, y, udata->ctx); + if (check_retval((void*)arkode_mem, "ARKStepCreate", 0, udata->myid)) return 1; + + /* Select the method order */ + retval = ARKStepSetOrder(arkode_mem, uopt->order); + if (check_retval(&retval, "ARKStepSetOrder", 1, udata->myid)) return 1; + + /* Attach user data */ + retval = ARKStepSetUserData(arkode_mem, (void*) udata); + if (check_retval(&retval, "ARKStepSetUserData*", 1, udata->myid)) return 1; + + /* Specify tolerances */ + retval = ARKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol); + if (check_retval(&retval, "ARKStepSStolerances", 1, udata->myid)) return 1; + + /* Increase the max number of steps allowed between outputs */ + retval = ARKStepSetMaxNumSteps(arkode_mem, 100000); + if (check_retval(&retval, "ARKStepSetMaxNumSteps", 1, udata->myid)) return 1; + + /* Open output file for integrator diagnostics */ + if (uopt->save) + { + sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid); + DFID = fopen(fname, "w"); + + retval = ARKStepSetDiagnostics(arkode_mem, DFID); + if (check_retval(&retval, "ARKStepSetDiagnostics", 1, udata->myid)) return 1; + } + + /* Create the (non)linear solver */ + if (uopt->nls == "newton") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_Newton(y, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = ARKStepSetNonlinearSolver(arkode_mem, NLS); + if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1; + + /* Create linear solver */ + LS = uopt->precond ? 
SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx); + if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1; + + /* Attach linear solver */ + retval = ARKStepSetLinearSolver(arkode_mem, LS, NULL); + if (check_retval(&retval, "ARKStepSetLinearSolver", 1, udata->myid)) return 1; + + /* Attach preconditioner */ + retval = ARKStepSetPreconditioner(arkode_mem, NULL, PSolve); + if (check_retval(&retval, "ARKStepSetPreconditioner", 1, udata->myid)) return 1; + } + else if (uopt->nls == "fixedpoint") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = ARKStepSetNonlinearSolver(arkode_mem, NLS); + if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1; + } + else + { + fprintf(stderr, "\nERROR: ARK-DIRK is not compatible with the nls option provided\n"); + return 1; + } + + /* Output initial condition */ + if (uopt->nout > 0) + { + if (udata->myid == 0) + { + printf("\n t ||u||_rms ||v||_rms ||w||_rms\n"); + printf(" ----------------------------------------------------\n"); + } + WriteOutput(uopt->t0, y, udata, uopt); + } + + /* Integrate to final time */ + t = uopt->t0; + dtout = (uopt->tf - uopt->t0); + if (uopt->nout != 0) + dtout /= uopt->nout; + tout = t + dtout; + iout = 0; + + do + { + /* Integrate to output time */ + retval = ARKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL); + if (check_retval(&retval, "ARKStepEvolve", 1, udata->myid)) break; + + /* Output state */ + if(uopt->nout > 0) WriteOutput(t, y, udata, uopt); + + /* Update output time */ + tout += dtout; + tout = (tout > uopt->tf) ? 
uopt->tf : tout; + + iout++; + } while (iout < uopt->nout); + + /* close output stream */ + if (uopt->save) fclose(DFID); + + /* Get final statistics */ + retval = ARKStepGetNumSteps(arkode_mem, &nst); + check_retval(&retval, "ARKStepGetNumSteps", 1, udata->myid); + retval = ARKStepGetNumStepAttempts(arkode_mem, &nst_a); + check_retval(&retval, "ARKStepGetNumStepAttempts", 1, udata->myid); + retval = ARKStepGetNumRhsEvals(arkode_mem, &nfe, &nfi); + check_retval(&retval, "ARKStepGetNumRhsEvals", 1, udata->myid); + retval = ARKStepGetNumErrTestFails(arkode_mem, &netf); + check_retval(&retval, "ARKStepGetNumErrTestFails", 1, udata->myid); + retval = ARKStepGetNumNonlinSolvIters(arkode_mem, &nni); + check_retval(&retval, "ARKStepGetNumNonlinSolvIters", 1, udata->myid); + retval = ARKStepGetNumNonlinSolvConvFails(arkode_mem, &ncnf); + check_retval(&retval, "ARKStepGetNumNonlinSolvConvFails", 1, udata->myid); + if (uopt->nls == "newton") + { + retval = ARKStepGetNumLinIters(arkode_mem, &nli); + check_retval(&retval, "ARKStepGetNumLinIters", 1, udata->myid); + retval = ARKStepGetNumPrecSolves(arkode_mem, &npsol); + check_retval(&retval, "ARKStepGetNumPrecSolves", 1, udata->myid); + } + + /* Print final statistics */ + if (udata->myid == 0) + { + printf("\nFinal Solver Statistics (for processor 0):\n"); + printf(" Internal solver steps = %li (attempted = %li)\n", nst, nst_a); + printf(" Total RHS evals: Fe = %li, Fi = %li\n", nfe, nfi + udata->nnlfi); + printf(" Total number of error test failures = %li\n", netf); + printf(" Total number of nonlinear solver convergence failures = %li\n", + ncnf); + printf(" Total number of nonlinear iterations = %li\n", nni); + if (uopt->nls == "newton") + { + printf(" Total number of linear iterations = %li\n", nli); + printf(" Total number of preconditioner solves = %li\n", npsol); + } + } + + /* Clean up */ + ARKStepFree(&arkode_mem); + SUNNonlinSolFree(NLS); + if (LS) SUNLinSolFree(LS); + + /* Return success */ + return(0); +} + + +/* 
Setup ARKODE and evolve problem in time with IMEX method */ +int EvolveProblemIMEX(N_Vector y, UserData* udata, UserOptions* uopt) +{ + void* arkode_mem = NULL; /* empty ARKODE memory structure */ + SUNNonlinearSolver NLS = NULL; /* empty nonlinear solver structure */ + SUNLinearSolver LS = NULL; /* empty linear solver structure */ + + realtype t, dtout, tout; /* current/output time data */ + int retval; /* reusable error-checking flag */ + int iout; /* output counter */ + long int nst, nst_a, netf; /* step stats */ + long int nfe, nfi; /* RHS stats */ + long int nni, ncnf; /* nonlinear solver stats */ + long int nli, npsol; /* linear solver stats */ + FILE* DFID = NULL; /* diagnostics output file */ + char fname[MXSTR]; + + /* Additively split methods should not add the advection and reaction terms */ + udata->add_reactions = false; + + /* Create the ARK timestepper module */ + arkode_mem = ARKStepCreate(Advection, Reaction, uopt->t0, y, udata->ctx); + if (check_retval((void*)arkode_mem, "ARKStepCreate", 0, udata->myid)) return 1; + + /* Select the method order */ + retval = ARKStepSetOrder(arkode_mem, uopt->order); + if (check_retval(&retval, "ARKStepSetOrder", 1, udata->myid)) return 1; + + /* Attach user data */ + retval = ARKStepSetUserData(arkode_mem, (void*) udata); + if (check_retval(&retval, "ARKStepSetUserData*", 1, udata->myid)) return 1; + + /* Specify tolerances */ + retval = ARKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol); + if (check_retval(&retval, "ARKStepSStolerances", 1, udata->myid)) return 1; + + /* Increase the max number of steps allowed between outputs */ + retval = ARKStepSetMaxNumSteps(arkode_mem, 100000); + if (check_retval(&retval, "ARKStepSetMaxNumSteps", 1, udata->myid)) return 1; + + /* Open output file for integrator diagnostics */ + if (uopt->save) + { + sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid); + DFID = fopen(fname, "w"); + + retval = ARKStepSetDiagnostics(arkode_mem, DFID); + if 
(check_retval(&retval, "ARKStepSetDiagnostics", 1, udata->myid)) return 1; + } + + /* Create the (non)linear solver */ + if (uopt->nls == "newton") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_Newton(y, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = ARKStepSetNonlinearSolver(arkode_mem, NLS); + if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1; + + /* Create linear solver */ + LS = SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx); + if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1; + + /* Attach linear solver */ + retval = ARKStepSetLinearSolver(arkode_mem, LS, NULL); + if (check_retval(&retval, "ARKStepSetLinearSolver", 1, udata->myid)) return 1; + + /* Attach preconditioner */ + retval = ARKStepSetPreconditioner(arkode_mem, NULL, PSolve); + if (check_retval(&retval, "ARKStepSetPreconditioner", 1, udata->myid)) return 1; + } + else if (uopt->nls == "tl-newton") + { + /* The custom task-local nonlinear solver handles the linear solve + as well, so we do not need a SUNLinearSolver. 
*/ + NLS = TaskLocalNewton(udata->ctx, y, DFID); + if (check_retval((void *)NLS, "TaskLocalNewton", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = ARKStepSetNonlinearSolver(arkode_mem, NLS); + if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1; + } + else if (uopt->nls == "fixedpoint") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = ARKStepSetNonlinearSolver(arkode_mem, NLS); + if (check_retval(&retval, "ARKStepSetNonlinearSolver", 1, udata->myid)) return 1; + } + else + { + fprintf(stderr, "\nERROR: ARK-IMEX method is not compatible with the nls option provided\n"); + return 1; + } + + /* Output initial condition */ + if (uopt->nout > 0) + { + if (udata->myid == 0) + { + printf("\n t ||u||_rms ||v||_rms ||w||_rms\n"); + printf(" ----------------------------------------------------\n"); + } + WriteOutput(uopt->t0, y, udata, uopt); + } + + /* Integrate to final time */ + t = uopt->t0; + dtout = (uopt->tf - uopt->t0); + if (uopt->nout != 0) + dtout /= uopt->nout; + tout = t + dtout; + iout = 0; + + do + { + /* Integrate to output time */ + retval = ARKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL); + if (check_retval(&retval, "ARKStepEvolve", 1, udata->myid)) break; + + /* Output state */ + if(uopt->nout > 0) WriteOutput(t, y, udata, uopt); + + /* Update output time */ + tout += dtout; + tout = (tout > uopt->tf) ? 
uopt->tf : tout; + + iout++; + } while (iout < uopt->nout); + + /* close output stream */ + if (uopt->save) fclose(DFID); + + /* Get final statistics */ + retval = ARKStepGetNumSteps(arkode_mem, &nst); + check_retval(&retval, "ARKStepGetNumSteps", 1, udata->myid); + retval = ARKStepGetNumStepAttempts(arkode_mem, &nst_a); + check_retval(&retval, "ARKStepGetNumStepAttempts", 1, udata->myid); + retval = ARKStepGetNumRhsEvals(arkode_mem, &nfe, &nfi); + check_retval(&retval, "ARKStepGetNumRhsEvals", 1, udata->myid); + retval = ARKStepGetNumErrTestFails(arkode_mem, &netf); + check_retval(&retval, "ARKStepGetNumErrTestFails", 1, udata->myid); + retval = ARKStepGetNumNonlinSolvIters(arkode_mem, &nni); + check_retval(&retval, "ARKStepGetNumNonlinSolvIters", 1, udata->myid); + retval = ARKStepGetNumNonlinSolvConvFails(arkode_mem, &ncnf); + check_retval(&retval, "ARKStepGetNumNonlinSolvConvFails", 1, udata->myid); + if (uopt->nls == "newton") + { + retval = ARKStepGetNumLinIters(arkode_mem, &nli); + check_retval(&retval, "ARKStepGetNumLinIters", 1, udata->myid); + retval = ARKStepGetNumPrecSolves(arkode_mem, &npsol); + check_retval(&retval, "ARKStepGetNumPrecSolves", 1, udata->myid); + } + + /* Print final statistics */ + if (udata->myid == 0) + { + printf("\nFinal Solver Statistics (for processor 0):\n"); + printf(" Internal solver steps = %li (attempted = %li)\n", nst, nst_a); + printf(" Total RHS evals: Fe = %li, Fi = %li\n", nfe, nfi + udata->nnlfi); + printf(" Total number of error test failures = %li\n", netf); + printf(" Total number of nonlinear solver convergence failures = %li\n", + ncnf); + printf(" Total number of nonlinear iterations = %li\n", nni); + if (uopt->nls == "newton") + { + printf(" Total number of linear iterations = %li\n", nli); + printf(" Total number of preconditioner solves = %li\n", npsol); + } + } + + /* Clean up */ + ARKStepFree(&arkode_mem); + if (NLS) SUNNonlinSolFree(NLS); + if (LS) SUNLinSolFree(LS); + + /* Return success */ + return(0); +} 
+ + +/* Setup ARKODE and evolve problem in time explicitly */ +int EvolveProblemExplicit(N_Vector y, UserData* udata, UserOptions* uopt) +{ + void* arkode_mem = NULL; /* empty ARKODE memory structure */ + realtype t, dtout, tout; /* current/output time data */ + int retval; /* reusable error-checking flag */ + int iout; /* output counter */ + long int nst, nst_a, netf; /* step stats */ + long int nfe; /* RHS stats */ + FILE* DFID; /* diagnostics output file */ + char fname[MXSTR]; + + /* Additively split methods should not add the advection and reaction terms */ + udata->add_reactions = true; + + /* Create the ERK timestepper module */ + arkode_mem = ERKStepCreate(AdvectionReaction, uopt->t0, y, udata->ctx); + if (check_retval((void*)arkode_mem, "ERKStepCreate", 0, udata->myid)) return 1; + + /* Select the method order */ + retval = ERKStepSetOrder(arkode_mem, uopt->order); + if (check_retval(&retval, "ERKStepSetOrder", 1, udata->myid)) return 1; + + /* Attach user data */ + retval = ERKStepSetUserData(arkode_mem, (void*) udata); + if (check_retval(&retval, "ERKStepSetUserData", 1, udata->myid)) return 1; + + /* Specify tolerances */ + retval = ERKStepSStolerances(arkode_mem, uopt->rtol, uopt->atol); + if (check_retval(&retval, "ERKStepSStolerances", 1, udata->myid)) return 1; + + /* Increase the max number of steps allowed between outputs */ + retval = ERKStepSetMaxNumSteps(arkode_mem, 1000000); + if (check_retval(&retval, "ERKStepSetMaxNumSteps", 1, udata->myid)) return 1; + + /* Set fixed step size */ + retval = ERKStepSetFixedStep(arkode_mem, 1e-5); + if (check_retval(&retval, "ERKStepSetFixedStep", 1, udata->myid)) return 1; + + /* Open output file for integrator diagnostics */ + if (uopt->save) + { + sprintf(fname, "%s/diagnostics.%06d.txt", uopt->outputdir, udata->myid); + DFID = fopen(fname, "w"); + + retval = ERKStepSetDiagnostics(arkode_mem, DFID); + if (check_retval(&retval, "ERKStepSetDiagnostics", 1, udata->myid)) return 1; + } + + /* Output initial 
condition */ + if (uopt->nout > 0) + { + if (udata->myid == 0) + { + printf("\n t ||u||_rms ||v||_rms ||w||_rms\n"); + printf(" ----------------------------------------------------\n"); + } + WriteOutput(uopt->t0, y, udata, uopt); + } + + /* Integrate to final time */ + t = uopt->t0; + dtout = (uopt->tf - uopt->t0); + if (uopt->nout != 0) + dtout /= uopt->nout; + tout = t + dtout; + iout = 0; + + do + { + /* Integrate to output time */ + retval = ERKStepEvolve(arkode_mem, tout, y, &t, ARK_NORMAL); + if (check_retval(&retval, "ERKStepEvolve", 1, udata->myid)) break; + + /* Output state */ + if(uopt->nout > 0) WriteOutput(t, y, udata, uopt); + + /* Update output time */ + tout += dtout; + tout = (tout > uopt->tf) ? uopt->tf : tout; + + iout++; + } while (iout < uopt->nout); + + /* close output stream */ + if (uopt->save) fclose(DFID); + + /* Get final statistics */ + retval = ERKStepGetNumSteps(arkode_mem, &nst); + check_retval(&retval, "ERKStepGetNumSteps", 1, udata->myid); + retval = ERKStepGetNumStepAttempts(arkode_mem, &nst_a); + check_retval(&retval, "ERKStepGetNumStepAttempts", 1, udata->myid); + retval = ERKStepGetNumRhsEvals(arkode_mem, &nfe); + check_retval(&retval, "ERKStepGetNumRhsEvals", 1, udata->myid); + retval = ERKStepGetNumErrTestFails(arkode_mem, &netf); + check_retval(&retval, "ERKStepGetNumErrTestFails", 1, udata->myid); + + /* Print final statistics */ + if (udata->myid == 0) + { + printf("\nFinal Solver Statistics (for processor 0):\n"); + printf(" Internal solver steps = %li (attempted = %li)\n", nst, nst_a); + printf(" Total RHS evals: Fe = %li\n", nfe); + printf(" Total number of error test failures = %li\n", netf); + } + + /* Clean up */ + ERKStepFree(&arkode_mem); + + /* Return success */ + return(0); +} + + +/* -------------------------------------------------------------- + * (Non)linear system functions + * --------------------------------------------------------------*/ + +int TaskLocalNlsResidual(N_Vector ycor, N_Vector F, void* 
arkode_mem) +{ + /* temporary variables */ + UserData* udata; + int retval; + realtype c[3]; + N_Vector X[3]; + + /* nonlinear system data */ + N_Vector z, zpred, Fi, sdata; + realtype tcur, gamma; + void *user_data; + + ARKStepGetNonlinearSystemData(arkode_mem, &tcur, &zpred, &z, &Fi, + &gamma, &sdata, &user_data); + udata = (UserData*) user_data; + + /* update 'z' value as stored predictor + current corrector */ + N_VLinearSum(1.0, N_VGetLocalVector_MPIPlusX(zpred), + 1.0, (ycor), + N_VGetLocalVector_MPIPlusX(z)); + + /* compute implicit RHS and save for later */ + retval = Reaction(tcur, + N_VGetLocalVector_MPIPlusX(z), + N_VGetLocalVector_MPIPlusX(Fi), + user_data); + udata->nnlfi++; /* count calls to Fi as part of the nonlinear residual */ + if (retval < 0) return(-1); + if (retval > 0) return(+1); + + /* update with y, sdata, and gamma * fy */ + X[0] = ycor; + c[0] = 1.0; + c[1] = -1.0; + X[1] = N_VGetLocalVector_MPIPlusX(sdata); + c[2] = -gamma; + X[2] = N_VGetLocalVector_MPIPlusX(Fi); + + retval = N_VLinearCombination(3, c, X, F); + if (retval != 0) return(-1); + + return(0); +} + + +int TaskLocalLSolve(N_Vector delta, void* arkode_mem) +{ + /* local variables */ + UserData* udata = NULL; + int retval; + + /* nonlinear system data */ + N_Vector z, zpred, Fi, sdata; + realtype tcur, gamma; + void* user_data = NULL; + + ARKStepGetNonlinearSystemData(arkode_mem, &tcur, &zpred, &z, &Fi, + &gamma, &sdata, &user_data); + udata = (UserData*) user_data; + + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* set up I - gamma*J and solve */ + retval = SolveReactionLinSys(z, delta, delta, gamma, udata); + + + return(retval); +} + + +SUNNonlinearSolver_Type TaskLocalNewton_GetType(SUNNonlinearSolver NLS) +{ + return SUNNONLINEARSOLVER_ROOTFIND; +} + + +int TaskLocalNewton_Initialize(SUNNonlinearSolver NLS) +{ + /* check that the nonlinear solver is non-null */ + if (NLS == NULL) + return SUN_NLS_MEM_NULL; + + /* override default system and lsolve functions with local 
versions */ + SUNNonlinSolSetSysFn(LOCAL_NLS(NLS), TaskLocalNlsResidual); + SUNNonlinSolSetLSolveFn(LOCAL_NLS(NLS), TaskLocalLSolve); + + return(SUNNonlinSolInitialize(LOCAL_NLS(NLS))); +} + + +int TaskLocalNewton_Solve(SUNNonlinearSolver NLS, + N_Vector y0, N_Vector ycor, + N_Vector w, realtype tol, + booleantype callLSetup, void* mem) +{ + /* local variables */ + MPI_Comm comm; + int solve_status, recover, nonrecover; + + /* check that the inputs are non-null */ + if ((NLS == NULL) || + (y0 == NULL) || + (ycor == NULL) || + (w == NULL) || + (mem == NULL)) + return SUN_NLS_MEM_NULL; + + /* shortcuts */ + comm = GET_NLS_CONTENT(NLS)->comm; + + /* each tasks solves the local nonlinear system */ + solve_status = SUNNonlinSolSolve(LOCAL_NLS(NLS), + N_VGetLocalVector_MPIPlusX(y0), + N_VGetLocalVector_MPIPlusX(ycor), + N_VGetLocalVector_MPIPlusX(w), + tol, callLSetup, mem); + + /* if any process had a nonrecoverable failure, return it */ + MPI_Allreduce(&solve_status, &nonrecover, 1, MPI_INT, MPI_MIN, comm); + if (nonrecover < 0) return nonrecover; + + /* check if any process has a recoverable convergence failure */ + MPI_Allreduce(&solve_status, &recover, 1, MPI_INT, MPI_MAX, comm); + if (recover == SUN_NLS_CONV_RECVR) GET_NLS_CONTENT(NLS)->ncnf++; + + /* return success (recover == 0) or a recoverable error code (recover > 0) */ + return recover; +} + + +int TaskLocalNewton_Free(SUNNonlinearSolver NLS) +{ + /* return if NLS is already free */ + if (NLS == NULL) + return SUN_NLS_SUCCESS; + + /* free items from contents, then the generic structure */ + if (NLS->content) + { + SUNNonlinSolFree(LOCAL_NLS(NLS)); + free(NLS->content); + NLS->content = NULL; + } + + /* free the ops structure */ + if (NLS->ops) + { + free(NLS->ops); + NLS->ops = NULL; + } + + /* free the nonlinear solver */ + free(NLS); + + return SUN_NLS_SUCCESS; +} + + +int TaskLocalNewton_SetSysFn(SUNNonlinearSolver NLS, + SUNNonlinSolSysFn SysFn) +{ + /* check that the nonlinear solver is non-null */ + if 
(NLS == NULL) + return SUN_NLS_MEM_NULL; + + return(SUNNonlinSolSetSysFn(LOCAL_NLS(NLS), SysFn)); +} + + +int TaskLocalNewton_SetConvTestFn(SUNNonlinearSolver NLS, + SUNNonlinSolConvTestFn CTestFn, + void* ctest_data) +{ + /* check that the nonlinear solver is non-null */ + if (NLS == NULL) + return SUN_NLS_MEM_NULL; + + return(SUNNonlinSolSetConvTestFn(LOCAL_NLS(NLS), CTestFn, ctest_data)); +} + + +int TaskLocalNewton_GetNumConvFails(SUNNonlinearSolver NLS, + long int *nconvfails) +{ + /* check that the nonlinear solver is non-null */ + if (NLS == NULL) + return SUN_NLS_MEM_NULL; + + *nconvfails = GET_NLS_CONTENT(NLS)->ncnf; + return(0); +} + + +SUNNonlinearSolver TaskLocalNewton(SUNContext ctx, N_Vector y, FILE* DFID) +{ + SUNNonlinearSolver NLS; + TaskLocalNewton_Content content; + + /* Check that the supplied N_Vector is non-NULL */ + if (y == NULL) return NULL; + + /* Check that the supplied N_Vector is an MPIPlusX */ + if (N_VGetVectorID(y) != SUNDIALS_NVEC_MPIPLUSX) + return NULL; + + /* Create an empty nonlinear linear solver object */ + NLS = SUNNonlinSolNewEmpty(ctx); + if (NLS == NULL) return NULL; + + /* Attach operations */ + NLS->ops->gettype = TaskLocalNewton_GetType; + NLS->ops->initialize = TaskLocalNewton_Initialize; + NLS->ops->solve = TaskLocalNewton_Solve; + NLS->ops->free = TaskLocalNewton_Free; + NLS->ops->setsysfn = TaskLocalNewton_SetSysFn; + NLS->ops->setctestfn = TaskLocalNewton_SetConvTestFn; + NLS->ops->getnumconvfails = TaskLocalNewton_GetNumConvFails; + + /* Create content */ + content = NULL; + content = (TaskLocalNewton_Content) malloc(sizeof *content); + if (content == NULL) { SUNNonlinSolFree(NLS); return NULL; } + + /* Initialize all components of content to 0/NULL */ + memset(content, 0, sizeof(*content)); + + /* Attach content */ + NLS->content = content; + + /* Fill general content */ + void *tmpcomm = N_VGetCommunicator(y); + if (tmpcomm == NULL) { SUNNonlinSolFree(NLS); return NULL; } + + MPI_Comm *comm = (MPI_Comm*) 
tmpcomm; + if ((*comm) == MPI_COMM_NULL) { SUNNonlinSolFree(NLS); return NULL; } + + content->comm = *comm; + + content->local_nls = SUNNonlinSol_Newton(N_VGetLocalVector_MPIPlusX(y), ctx); + if (content->local_nls == NULL) { SUNNonlinSolFree(NLS); return NULL; } + + MPI_Comm_rank(content->comm, &content->myid); + MPI_Comm_size(content->comm, &content->nprocs); + + content->ncnf = 0; + + /* Setup the local nonlinear solver monitoring */ + if (DFID != NULL) + { + SUNNonlinSolSetInfoFile_Newton(LOCAL_NLS(NLS), DFID); + SUNNonlinSolSetPrintLevel_Newton(LOCAL_NLS(NLS), 1); + } + + return NLS; +} diff --git a/benchmarks/advection_reaction_3D/backends.hpp b/benchmarks/advection_reaction_3D/raja/backends.hpp similarity index 100% rename from benchmarks/advection_reaction_3D/backends.hpp rename to benchmarks/advection_reaction_3D/raja/backends.hpp diff --git a/benchmarks/advection_reaction_3D/raja/check_retval.h b/benchmarks/advection_reaction_3D/raja/check_retval.h new file mode 100644 index 0000000000..887b7cea5d --- /dev/null +++ b/benchmarks/advection_reaction_3D/raja/check_retval.h @@ -0,0 +1,57 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. 
/* --------------------------------------------------------------
 * Function to check return values:
 *
 * opt == 0 means the function allocates memory and returns a
 *          pointer so check if a NULL pointer was returned
 * opt == 1 means the function returns an integer where a
 *          value < 0 indicates an error occurred; returnvalue
 *          must point to that integer
 *
 * Any other opt value is not checked and reports success.
 *
 * Messages are printed to stderr only when myid == 0.
 * Returns 1 if a failure was detected and 0 otherwise.
 * --------------------------------------------------------------*/
static int check_retval(void *returnvalue, const char *funcname, int opt, int myid)
{
  if (opt == 0)
  {
    if (returnvalue == NULL)
    {
      /* A NULL pointer was returned - no memory allocated */
      if (myid == 0)
        fprintf(stderr, "\nERROR: %s() failed - returned NULL pointer\n\n",
                funcname);
      return(1);
    }
  }
  else if (opt == 1)
  {
    const int* errvalue = (const int *) returnvalue;

    /* Without a flag to inspect the call cannot be verified; report it as a
       failure instead of dereferencing a NULL pointer */
    if (errvalue == NULL)
    {
      if (myid == 0)
        fprintf(stderr, "\nERROR: %s() failed - no return flag to check\n\n",
                funcname);
      return(1);
    }

    /* A value < 0 was returned - function failed */
    if (*errvalue < 0)
    {
      if (myid == 0)
        fprintf(stderr, "\nERROR: %s() returned %d\n\n", funcname, *errvalue);
      return(1);
    }
  }

  /* return success */
  return(0);
}
+ * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * ---------------------------------------------------------------------------*/ + +#include "cvode/cvode.h" +#include "sunlinsol/sunlinsol_spgmr.h" +#include "sunnonlinsol/sunnonlinsol_newton.h" +#include "sunnonlinsol/sunnonlinsol_fixedpoint.h" +#include "advection_reaction_3D.hpp" +#include "rhs3D.hpp" + + +/* Setup CVODE and evolve problem in time with BDF method */ +int EvolveProblemBDF(N_Vector y, UserData* udata, UserOptions* uopt) +{ + void* cvode_mem = NULL; /* empty CVODE memory structure */ + SUNNonlinearSolver NLS = NULL; /* empty nonlinear solver structure */ + SUNLinearSolver LS = NULL; /* empty linear solver structure */ + + realtype t, dtout, tout; /* current/output time data */ + int retval; /* reusable error-checking flag */ + int iout; /* output counter */ + long int nst, netf; /* step stats */ + long int nfi; /* RHS stats */ + long int nni, ncnf; /* nonlinear solver stats */ + long int nli, npsol; /* linear solver stats */ + + /* Additively split methods should not add the advection and reaction terms */ + udata->add_reactions = true; + + /* Create CVode */ + cvode_mem = CVodeCreate(CV_BDF, udata->ctx); + if (check_retval((void*)cvode_mem, "CVodeCreate", 0, udata->myid)) return 1; + + /* Initialize CVode */ + retval = CVodeInit(cvode_mem, AdvectionReaction, uopt->t0, y); + if (check_retval((void*)cvode_mem, "CVodeInit", 0, udata->myid)) return 1; + + /* Attach user data */ + retval = CVodeSetUserData(cvode_mem, (void*) udata); + if (check_retval(&retval, "CVodeSetUserData*", 1, udata->myid)) return 1; + + /* Specify tolerances */ + retval = CVodeSStolerances(cvode_mem, uopt->rtol, uopt->atol); + if (check_retval(&retval, "CVodeSStolerances", 1, udata->myid)) return 1; + + /* Increase the max number of steps allowed between outputs */ + retval = CVodeSetMaxNumSteps(cvode_mem, 100000); + if (check_retval(&retval, "CVodeSetMaxNumSteps", 1, udata->myid)) return 1; + + /* 
Create the (non)linear solver */ + if (uopt->nls == "newton") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_Newton(y, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = CVodeSetNonlinearSolver(cvode_mem, NLS); + if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1; + + /* Create linear solver */ + LS = uopt->precond ? SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx); + if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1; + + /* Attach linear solver */ + retval = CVodeSetLinearSolver(cvode_mem, LS, NULL); + if (check_retval(&retval, "CVodeSetLinearSolver", 1, udata->myid)) return 1; + + /* Attach preconditioner */ + retval = CVodeSetPreconditioner(cvode_mem, NULL, PSolve); + if (check_retval(&retval, "CVodeSetPreconditioner", 1, udata->myid)) return 1; + } + else if (uopt->nls == "fixedpoint") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = CVodeSetNonlinearSolver(cvode_mem, NLS); + if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1; + } + else + { + fprintf(stderr, "\nERROR: CV-BDF method is not compatible with the nls option provided\n"); + return 1; + } + + /* Output initial condition */ + if (uopt->nout > 0) + { + if (udata->myid == 0) + { + printf("\n t ||u||_rms ||v||_rms ||w||_rms\n"); + printf(" ----------------------------------------------------\n"); + } + WriteOutput(uopt->t0, y, udata, uopt); + } + + /* Integrate to final time */ + t = uopt->t0; + dtout = (uopt->tf - uopt->t0); + if (uopt->nout != 0) + dtout /= uopt->nout; + tout = t + dtout; + iout = 0; + + do + { + /* Integrate to output time */ + retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL); + if 
(check_retval(&retval, "CVode", 1, udata->myid)) break; + + /* Output state */ + if (uopt->nout > 0) WriteOutput(t, y, udata, uopt); + + /* Update output time */ + tout += dtout; + tout = (tout > uopt->tf) ? uopt->tf : tout; + + iout++; + } while (iout < uopt->nout); + + /* Get final statistics */ + retval = CVodeGetNumSteps(cvode_mem, &nst); + check_retval(&retval, "CVodeGetNumSteps", 1, udata->myid); + retval = CVodeGetNumRhsEvals(cvode_mem, &nfi); + check_retval(&retval, "CVodeGetNumRhsEvals", 1, udata->myid); + retval = CVodeGetNumErrTestFails(cvode_mem, &netf); + check_retval(&retval, "CVodeGetNumErrTestFails", 1, udata->myid); + retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni); + check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1, udata->myid); + retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncnf); + check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1, udata->myid); + if (uopt->nls == "newton") + { + retval = CVodeGetNumLinIters(cvode_mem, &nli); + check_retval(&retval, "CVodeGetNumLinIters", 1, udata->myid); + retval = CVodeGetNumPrecSolves(cvode_mem, &npsol); + check_retval(&retval, "CVodeGetNumPrecSolves", 1, udata->myid); + } + + /* Print final statistics */ + if (udata->myid == 0) + { + printf("\nFinal Solver Statistics (for processor 0):\n"); + printf(" Internal solver steps = %li\n", nst); + printf(" Total RHS evals: %li\n", nfi + udata->nnlfi); + printf(" Total number of error test failures = %li\n", netf); + printf(" Total number of nonlinear solver convergence failures = %li\n", + ncnf); + printf(" Total number of nonlinear iterations = %li\n", nni); + if (uopt->nls == "newton") + { + printf(" Total number of linear iterations = %li\n", nli); + printf(" Total number of preconditioner solves = %li\n", npsol); + } + } + + /* Clean up */ + CVodeFree(&cvode_mem); + if (NLS) SUNNonlinSolFree(NLS); + if (LS) SUNLinSolFree(LS); + + /* Return success */ + return(0); +} + + +/* Setup CVODE and evolve problem in time with Adams method */ 
+int EvolveProblemAdams(N_Vector y, UserData* udata, UserOptions* uopt) +{ + void* cvode_mem = NULL; /* empty CVODE memory structure */ + SUNNonlinearSolver NLS = NULL; /* empty nonlinear solver structure */ + + realtype t, dtout, tout; /* current/output time data */ + int retval; /* reusable error-checking flag */ + int iout; /* output counter */ + long int nst, netf; /* step stats */ + long int nfi; /* RHS stats */ + long int nni, ncnf; /* nonlinear solver stats */ + + /* Additively split methods should not add the advection and reaction terms */ + udata->add_reactions = true; + + /* Create CVode */ + cvode_mem = CVodeCreate(CV_ADAMS, udata->ctx); + if (check_retval((void*)cvode_mem, "CVodeCreate", 0, udata->myid)) return 1; + + /* Initialize CVode */ + retval = CVodeInit(cvode_mem, AdvectionReaction, uopt->t0, y); + if (check_retval((void*)cvode_mem, "CVodeInit", 0, udata->myid)) return 1; + + /* Attach user data */ + retval = CVodeSetUserData(cvode_mem, (void*) udata); + if (check_retval(&retval, "CVodeSetUserData*", 1, udata->myid)) return 1; + + /* Specify tolerances */ + retval = CVodeSStolerances(cvode_mem, uopt->rtol, uopt->atol); + if (check_retval(&retval, "CVodeSStolerances", 1, udata->myid)) return 1; + + /* Increase the max number of steps allowed between outputs */ + retval = CVodeSetMaxNumSteps(cvode_mem, 100000); + if (check_retval(&retval, "CVodeSetMaxNumSteps", 1, udata->myid)) return 1; + + /* Create nonlinear solver */ + NLS = SUNNonlinSol_FixedPoint(y, uopt->fpaccel, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_FixedPoint", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = CVodeSetNonlinearSolver(cvode_mem, NLS); + if (check_retval(&retval, "CVodeSetNonlinearSolver", 1, udata->myid)) return 1; + + /* Output initial condition */ + if (uopt->nout > 0) + { + if (udata->myid == 0) + { + printf("\n t ||u||_rms ||v||_rms ||w||_rms\n"); + printf(" ----------------------------------------------------\n"); + } + 
WriteOutput(uopt->t0, y, udata, uopt); + } + + /* Integrate to final time */ + t = uopt->t0; + dtout = (uopt->tf - uopt->t0); + if (uopt->nout != 0) + dtout /= uopt->nout; + tout = t + dtout; + iout = 0; + + do + { + /* Integrate to output time */ + retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL); + if (check_retval(&retval, "CVode", 1, udata->myid)) break; + + /* Output state */ + if (uopt->nout > 0) WriteOutput(t, y, udata, uopt); + + /* Update output time */ + tout += dtout; + tout = (tout > uopt->tf) ? uopt->tf : tout; + + iout++; + } while (iout < uopt->nout); + + /* Get final statistics */ + retval = CVodeGetNumSteps(cvode_mem, &nst); + check_retval(&retval, "CVodeGetNumSteps", 1, udata->myid); + retval = CVodeGetNumRhsEvals(cvode_mem, &nfi); + check_retval(&retval, "CVodeGetNumRhsEvals", 1, udata->myid); + retval = CVodeGetNumErrTestFails(cvode_mem, &netf); + check_retval(&retval, "CVodeGetNumErrTestFails", 1, udata->myid); + retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni); + check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1, udata->myid); + retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncnf); + check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1, udata->myid); + + /* Print final statistics */ + if (udata->myid == 0) + { + printf("\nFinal Solver Statistics (for processor 0):\n"); + printf(" Internal solver steps = %li\n", nst); + printf(" Total RHS evals: %li\n", nfi + udata->nnlfi); + printf(" Total number of error test failures = %li\n", netf); + printf(" Total number of nonlinear solver convergence failures = %li\n", + ncnf); + } + + /* Clean up */ + CVodeFree(&cvode_mem); + SUNNonlinSolFree(NLS); + + /* Return success */ + return(0); +} diff --git a/benchmarks/advection_reaction_3D/raja/ida_driver.cpp b/benchmarks/advection_reaction_3D/raja/ida_driver.cpp new file mode 100644 index 0000000000..3ae28a43ca --- /dev/null +++ b/benchmarks/advection_reaction_3D/raja/ida_driver.cpp @@ -0,0 +1,195 @@ +/* 
----------------------------------------------------------------------------- + * Programmer(s): Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. + * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * ---------------------------------------------------------------------------*/ + +#include "ida/ida.h" +#include "sunlinsol/sunlinsol_spgmr.h" +#include "sunnonlinsol/sunnonlinsol_newton.h" +#include "sunnonlinsol/sunnonlinsol_fixedpoint.h" +#include "advection_reaction_3D.hpp" +#include "rhs3D.hpp" + + +/* Initial condition function */ +int SetICDot(N_Vector y, N_Vector yp, UserData* udata) +{ + int retval; + + retval = AdvectionReaction(0, y, yp, (void*)udata); + if (check_retval(&retval, "AdvectionReaction", 1, udata->myid)) return 1; + + /* Return success */ + return(0); +} + + +/* Setup IDA and evolve problem in time with BDF method */ +int EvolveDAEProblem(N_Vector y, UserData* udata, UserOptions* uopt) +{ + void* ida_mem = NULL; /* empty IDA memory structure */ + SUNNonlinearSolver NLS = NULL; /* empty nonlinear solver structure */ + SUNLinearSolver LS = NULL; /* empty linear solver structure */ + N_Vector yp = NULL; /* empty vector structure */ + + realtype t, dtout, tout; /* current/output time data */ + int retval; /* reusable error-checking flag */ + int iout; /* output counter */ + long int nst, netf; /* step stats */ + long int nfi; /* RHS stats */ + long int nni, ncnf; /* nonlinear solver stats */ + long int nli, npsol; /* linear solver stats */ + + /* Additively split methods should not add the advection and reaction terms */ + udata->add_reactions = true; + + /* Create ydot' vector */ + yp = N_VClone(y); + if (check_retval((void*)yp, "N_VClone", 0, 
udata->myid)) return 1; + + /* Create IDA */ + ida_mem = IDACreate(udata->ctx); + if (check_retval((void*)ida_mem, "IDACreate", 0, udata->myid)) return 1; + + /* Initialize IDA */ + retval = IDAInit(ida_mem, AdvectionReactionResidual, uopt->t0, y, yp); + if (check_retval(&retval, "IDAInit", 1, udata->myid)) return 1; + + /* Attach user data */ + retval = IDASetUserData(ida_mem, (void*) udata); + if (check_retval(&retval, "IDASetUserData*", 1, udata->myid)) return 1; + + /* Specify tolerances */ + retval = IDASStolerances(ida_mem, uopt->rtol, uopt->atol); + if (check_retval(&retval, "IDASStolerances", 1, udata->myid)) return 1; + + /* Increase the max number of steps allowed between outputs */ + retval = IDASetMaxNumSteps(ida_mem, 100000); + if (check_retval(&retval, "IDASetMaxNumSteps", 1, udata->myid)) return 1; + + /* Increase the max number of ETF allowed between outputs */ + retval = IDASetMaxErrTestFails(ida_mem, 25); + if (check_retval(&retval, "IDASetMaxErrTestFails", 1, udata->myid)) return 1; + + /* Create the (non)linear solver */ + if (uopt->nls == "newton") + { + /* Create nonlinear solver */ + NLS = SUNNonlinSol_Newton(y, udata->ctx); + if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0, udata->myid)) return 1; + + /* Attach nonlinear solver */ + retval = IDASetNonlinearSolver(ida_mem, NLS); + if (check_retval(&retval, "IDASetNonlinearSolver", 1, udata->myid)) return 1; + + /* Create linear solver */ + LS = uopt->precond ? 
SUNLinSol_SPGMR(y, PREC_LEFT, 0, udata->ctx) : SUNLinSol_SPGMR(y, PREC_NONE, 0, udata->ctx); + if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0, udata->myid)) return 1; + + /* Attach linear solver */ + retval = IDASetLinearSolver(ida_mem, LS, NULL); + if (check_retval(&retval, "IDASetLinearSolver", 1, udata->myid)) return 1; + + // /* Attach preconditioner */ + retval = IDASetPreconditioner(ida_mem, NULL, PSolveRes); + if (check_retval(&retval, "IDASetPreconditioner", 1, udata->myid)) return 1; + } + else + { + fprintf(stderr, "\nERROR: IDA method is not compatible with the nls option provided\n"); + return 1; + } + + /* Set ydot' initial condition */ + retval = SetICDot(y, yp, udata); + if (check_retval(&retval, "SetICDot", 1, udata->myid)) return 1; + + /* Output initial condition */ + if (uopt->nout > 0) + { + if (udata->myid == 0) + { + printf("\n t ||u||_rms ||v||_rms ||w||_rms\n"); + printf(" ----------------------------------------------------\n"); + } + WriteOutput(uopt->t0, y, udata, uopt); + } + + /* Integrate to final time */ + t = uopt->t0; + dtout = (uopt->tf - uopt->t0); + if (uopt->nout != 0) + dtout /= uopt->nout; + tout = t + dtout; + iout = 0; + + do + { + /* Integrate to output time */ + retval = IDASolve(ida_mem, tout, &t, y, yp, IDA_NORMAL); + if (check_retval(&retval, "IDA", 1, udata->myid)) break; + + /* Output state */ + if(uopt->nout > 0) WriteOutput(t, y, udata, uopt); + + /* Update output time */ + tout += dtout; + tout = (tout > uopt->tf) ? 
uopt->tf : tout; + + iout++; + } while (iout < uopt->nout); + + /* Get final statistics */ + retval = IDAGetNumSteps(ida_mem, &nst); + check_retval(&retval, "IDAGetNumSteps", 1, udata->myid); + retval = IDAGetNumResEvals(ida_mem, &nfi); + check_retval(&retval, "IDAGetNumResEvals", 1, udata->myid); + retval = IDAGetNumErrTestFails(ida_mem, &netf); + check_retval(&retval, "IDAGetNumErrTestFails", 1, udata->myid); + retval = IDAGetNumNonlinSolvIters(ida_mem, &nni); + check_retval(&retval, "IDAGetNumNonlinSolvIters", 1, udata->myid); + retval = IDAGetNumNonlinSolvConvFails(ida_mem, &ncnf); + check_retval(&retval, "IDAGetNumNonlinSolvConvFails", 1, udata->myid); + if (uopt->nls == "newton") + { + retval = IDAGetNumLinIters(ida_mem, &nli); + check_retval(&retval, "IDAGetNumLinIters", 1, udata->myid); + retval = IDAGetNumPrecSolves(ida_mem, &npsol); + check_retval(&retval, "IDAGetNumPrecSolves", 1, udata->myid); + } + + /* Print final statistics */ + if (udata->myid == 0) + { + printf("\nFinal Solver Statistics (for processor 0):\n"); + printf(" Internal solver steps = %li\n", nst); + printf(" Total RHS evals: %li\n", nfi + udata->nnlfi); + printf(" Total number of error test failures = %li\n", netf); + printf(" Total number of nonlinear solver convergence failures = %li\n", + ncnf); + printf(" Total number of nonlinear iterations = %li\n", nni); + if (uopt->nls == "newton") + { + printf(" Total number of linear iterations = %li\n", nli); + printf(" Total number of preconditioner solves = %li\n", npsol); + } + } + + /* Clean up */ + IDAFree(&ida_mem); + if (yp) N_VDestroy(yp); + if (NLS) SUNNonlinSolFree(NLS); + if (LS) SUNLinSolFree(LS); + + /* Return success */ + return(0); +} diff --git a/benchmarks/advection_reaction_3D/raja/rhs3D.hpp b/benchmarks/advection_reaction_3D/raja/rhs3D.hpp new file mode 100644 index 0000000000..1bb2b6f105 --- /dev/null +++ b/benchmarks/advection_reaction_3D/raja/rhs3D.hpp @@ -0,0 +1,598 @@ +/* 
----------------------------------------------------------------------------- + * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL + * Daniel R. Reynolds @ SMU + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. + * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * -----------------------------------------------------------------------------*/ + +#ifndef ADVECTION_REACTION_3D_RHS_HPP +#define ADVECTION_REACTION_3D_RHS_HPP + +#include "advection_reaction_3D.hpp" + +/* -------------------------------------------------------------- + * Right hand side (RHS) and residual functions + * --------------------------------------------------------------*/ + +/* Compute the advection term f(t,y) = -c (grad * y). This is done using + upwind 1st order finite differences. 
*/ +static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data) +{ + /* access problem data */ + UserData* udata = (UserData*) user_data; + + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* set variable shortcuts */ + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + const realtype c = udata->c; + const realtype cx = -c / udata->grid->dx; + const realtype cy = -c / udata->grid->dy; + const realtype cz = -c / udata->grid->dz; + + /* local variables */ + int retval; + + /* fill send buffers and begin exchanging boundary information */ + SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange"); + retval = FillSendBuffers(y, udata); + if (check_retval(&retval, "FillSendBuffers", 1, udata->myid)) + return(-1); + retval = udata->grid->ExchangeStart(); + if (check_retval(&retval, "ExchangeStart", 1, udata->myid)) + return(-1); + SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange"); + + /* set output to zero */ + N_VConst(0.0, ydot); + + /* create views of the state and RHS vectors */ + RAJA::View > Yview(GetVecData(y), nxl, nyl, nzl, dof); + RAJA::View > dYview(GetVecData(ydot), nxl, nyl, nzl, dof); + + /* iterate over domain interior, computing advection */ + if (c > 0.0) + { + /* flow moving in the positive x,y,z direction */ + auto range = RAJA::make_tuple(RAJA::RangeSegment(1, nxl), + RAJA::RangeSegment(1, nyl), + RAJA::RangeSegment(1, nzl)); + RAJA::kernel(range, + [=] DEVICE_FUNC (int i, int j, int k) { + const realtype u_ijk = Yview(i,j,k,0); + const realtype v_ijk = Yview(i,j,k,1); + const realtype w_ijk = Yview(i,j,k,2); + + // grad * u + dYview(i,j,k,0) = cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz + dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy + dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx + + // grad * v + dYview(i,j,k,1) = cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz + dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // 
dv/dy + dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx + + // grad * w + dYview(i,j,k,2) = cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz + dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy + dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx + }); + + } + else if (c < 0.0) + { + /* flow moving in the negative x,y,z direction */ + auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1), + RAJA::RangeSegment(0, nyl-1), + RAJA::RangeSegment(0, nzl-1)); + RAJA::kernel(range, + [=] DEVICE_FUNC (int i, int j, int k) { + const realtype u_ijk = Yview(i,j,k,0); + const realtype v_ijk = Yview(i,j,k,1); + const realtype w_ijk = Yview(i,j,k,2); + + // grad * u + dYview(i,j,k,0) = cz * (Yview(i,j,k+1,0) - u_ijk); // du/dz + dYview(i,j,k,0) += cy * (Yview(i,j+1,k,0) - u_ijk); // du/dy + dYview(i,j,k,0) += cx * (Yview(i+1,j,k,0) - u_ijk); // du/dx + + // grad * v + dYview(i,j,k,1) = cz * (Yview(i,j,k+1,1) - v_ijk); // dv/dz + dYview(i,j,k,1) += cy * (Yview(i,j+1,k,1) - v_ijk); // dv/dy + dYview(i,j,k,1) += cx * (Yview(i+1,j,k,1) - v_ijk); // dv/dx + + // grad * w + dYview(i,j,k,2) = cz * (Yview(i,j,k+1,2) - w_ijk); // dw/dz + dYview(i,j,k,2) += cy * (Yview(i,j+1,k,2) - w_ijk); // dw/dy + dYview(i,j,k,2) += cx * (Yview(i+1,j,k,2) - w_ijk); // dw/dx + }); + + } + + /* finish exchanging boundary information */ + SUNDIALS_MARK_BEGIN(udata->prof, "Neighbor Exchange"); + retval = udata->grid->ExchangeEnd(); + if (check_retval(&retval, "ExchangeEnd", 1, udata->myid)) + return(-1); + SUNDIALS_MARK_END(udata->prof, "Neighbor Exchange"); + + + /* compute advection at process boundaries */ + if (c > 0.0) + { + /* Flow moving in the positive x,y,z direction: + * boundaries are west face, south face, back face */ + + /* Perform calculations on each "lower" face */ + RAJA::View> + Wrecv(udata->grid->getRecvBuffer("WEST"), nyl, nzl, dof); + RAJA::View> + Srecv(udata->grid->getRecvBuffer("SOUTH"), nxl, nzl, dof); + RAJA::View> + 
Brecv(udata->grid->getRecvBuffer("BACK"), nxl, nyl, dof); + + auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(west_face, + [=] DEVICE_FUNC (int j, int k, int l) { + const int i = 0; + const realtype Yijkl = Yview(i,j,k,l); + const realtype YSouth = (j > 0) ? Yview(i,j-1,k,l) : Srecv(i,k,l); + const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,l); + dYview(i,j,k,l) = cx * (Yijkl - Wrecv(j,k,l)); // d/dx + dYview(i,j,k,l) += cy * (Yijkl - YSouth); // d/dy + dYview(i,j,k,l) += cz * (Yijkl - YBack); // d/dz + }); + + auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(south_face, + [=] DEVICE_FUNC (int i, int k, int l) { + const int j = 0; + const realtype Yijkl = Yview(i,j,k,l); + const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(j,k,l); + const realtype YBack = (k > 0) ? Yview(i,j,k-1,l) : Brecv(i,j,l); + dYview(i,j,k,l) = cx * (Yijkl - YWest); // d/dx + dYview(i,j,k,l) += cy * (Yijkl - Srecv(i,k,l)); // d/dy + dYview(i,j,k,l) += cz * (Yijkl - YBack); // d/dz + }); + + auto back_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(back_face, + [=] DEVICE_FUNC (int i, int j, int l) { + const int k = 0; + const realtype Yijkl = Yview(i,j,k,l); + const realtype YWest = (i > 0) ? Yview(i-1,j,k,l) : Wrecv(j,k,l); + const realtype YSouth = (j > 0) ? 
Yview(i,j-1,k,l) : Srecv(i,k,l); + dYview(i,j,k,l) = cx * (Yijkl - YWest); // d/dx + dYview(i,j,k,l) += cy * (Yijkl - YSouth); // d/dy + dYview(i,j,k,l) += cz * (Yijkl - Brecv(i,j,l)); // d/dz + }); + + } + else if (c < 0.0) + { + + /* Flow moving in the negative x,y,z direction: + * boundaries are east face, north face, and front face */ + + /* Perform calculations on each "upper" face */ + RAJA::View > + Erecv(udata->grid->getRecvBuffer("EAST"), nyl, nzl, dof); + RAJA::View > + Nrecv(udata->grid->getRecvBuffer("NORTH"), nxl, nzl, dof); + RAJA::View > + Frecv(udata->grid->getRecvBuffer("FRONT"), nxl, nyl, dof); + + auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(east_face, + [=] DEVICE_FUNC (int j, int k, int l) { + const int i = nxl-1; + const realtype Yijkl = Yview(i,j,k,l); + const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,k,l); + const realtype YFront = (k < nzl-1) ? Yview(i,j,k+1,l) : Frecv(i,j,l); + dYview(i,j,k,l) = cx * (Erecv(j,k,l) - Yijkl); // d/dx + dYview(i,j,k,l) += cy * (YNorth - Yijkl); // d/dy + dYview(i,j,k,l) += cz * (YFront - Yijkl); // d/dz + }); + + auto north_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nzl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(north_face, + [=] DEVICE_FUNC (int i, int k, int l) { + const int j = nyl-1; + const realtype Yijkl = Yview(i,j,k,l); + const realtype YEast = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(j,k,l); + const realtype YFront = (k < nzl-1) ? 
Yview(i,j,k+1,l) : Frecv(i,j,l); + dYview(i,j,k,l) = cx * (YEast - Yijkl); // d/dx + dYview(i,j,k,l) += cy * (Nrecv(i,k,l) - Yijkl); // d/dy + dYview(i,j,k,l) += cz * (YFront - Yijkl); // d/dz + }); + + auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, dof)); + RAJA::kernel(front_face, + [=] DEVICE_FUNC (int i, int j, int l) { + const int k = nzl-1; + const realtype Yijkl = Yview(i,j,k,l); + const realtype YEast = (i < nxl-1) ? Yview(i+1,j,k,l) : Erecv(j,k,l); + const realtype YNorth = (j < nyl-1) ? Yview(i,j+1,k,l) : Nrecv(i,k,l); + dYview(i,j,k,l) = cx * (YEast - Yijkl); // d/dx + dYview(i,j,k,l) += cy * (YNorth - Yijkl); // d/dy + dYview(i,j,k,l) += cz * (Frecv(i,j,l) - Yijkl); // d/dz + }); + } + + /* return success */ + return(0); +} + + +/* Compute the reaction term g(t,y). */ +static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data) +{ + /* access problem data */ + UserData* udata = (UserData*) user_data; + + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* set variable shortcuts */ + const realtype A = udata->A; + const realtype B = udata->B; + const realtype k1 = udata->k1; + const realtype k2 = udata->k2; + const realtype k3 = udata->k3; + const realtype k4 = udata->k4; + const realtype k5 = udata->k5; + const realtype k6 = udata->k6; + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const int dof = udata->grid->dof; + + /* Zero output if not adding reactions to existing RHS */ + if (!udata->add_reactions) + N_VConst(0.0, ydot); + + /* access data arrays */ + realtype* Ydata = NULL; + Ydata = GetVecData(y); + if (check_retval((void *)Ydata, "GetVecData", 0, udata->myid)) + return(-1); + realtype* dYdata = NULL; + dYdata = GetVecData(ydot); + if (check_retval((void *)dYdata, "GetVecData", 0, udata->myid)) + return(-1); + + /* create 4D views of state and RHS vectors */ + RAJA::View > Yview(GetVecData(y), nxl, nyl, 
nzl, dof); + RAJA::View > dYview(GetVecData(ydot), nxl, nyl, nzl, dof); + + /* add reaction terms to RHS */ + auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl)); + RAJA::kernel(range, + [=] DEVICE_FUNC (int i, int j, int k) { + const realtype u = Yview(i,j,k,0); + const realtype v = Yview(i,j,k,1); + const realtype w = Yview(i,j,k,2); + dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u; + dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v; + dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w; + }); + + /* return success */ + return(0); +} + + +/* Compute the RHS as h(t,y) = f(t,y) + g(t,y). */ +static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot, + void *user_data) +{ + /* access problem data */ + UserData* udata = (UserData*) user_data; + int retval; + + /* NOTE: The order in which Advection and Reaction are called + is critical here. Advection must be computed first. */ + retval = Advection(t, y, ydot, user_data); + if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1); + + retval = Reaction(t, y, ydot, user_data); + if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1); + + /* return success */ + return(0); +} + +/* Compute the residual F(t,y,y') = ydot - h(t,y) = 0. */ +static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot, + N_Vector F, void *user_data) +{ + /* access problem data */ + UserData* udata = (UserData*) user_data; + int retval; + + /* NOTE: The order in which Advection and Reaction are called + is critical here. Advection must be computed first. 
*/ + retval = Advection(t, y, F, user_data); /* F = -c y_x */ + if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1); + + retval = Reaction(t, y, F, user_data); /* F = -c y_x + g(t,y) */ + if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1); + + /* F = ydot - h(t,y) = ydot + c y_x - g(t,y) */ + N_VLinearSum(1.0, ydot, -1.0, F, F); + + /* return success */ + return(0); +} + +/* -------------------------------------------------------------- + * Linear system and Jacobian functions + * --------------------------------------------------------------*/ + +/* Solve the linear systems Ax = b where A = I - gamma*dg/dy. + When using a fully implicit method, we are approximating + dh/dy as dg/dy. */ +static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b, + realtype gamma, UserData* udata) +{ + /* set variable shortcuts */ + const int dof = udata->grid->dof; + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const realtype k2 = udata->k2; + const realtype k3 = udata->k3; + const realtype k4 = udata->k4; + const realtype k6 = udata->k6; + + /* create 4D views of state, RHS and solution vectors */ + RAJA::View> Yview(GetVecData(y), nxl, nyl, nzl, dof); + RAJA::View> Bview(GetVecData(b), nxl, nyl, nzl, dof); + RAJA::View> Xview(GetVecData(x), nxl, nyl, nzl, dof); + + /* solve reaction linear system */ + auto blocks = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl)); + RAJA::kernel(blocks, + [=] DEVICE_FUNC (int i, int j, int k) { + + /* shortcuts to u, v, w for the block */ + const realtype u = Yview(i,j,k,0); + const realtype v = Yview(i,j,k,1); + const realtype w = Yview(i,j,k,2); + + // + // compute J = dg/dy + // + + /* 1st row: u, v, w */ + realtype A0 = -k2 * w + 2.0 * k3 * u * v - k4; + realtype A1 = k3 * u * u; + realtype A2 = -k2 * u; + + /* 2nd row: u, v, w */ + realtype A3 = k2 * w - 2.0 * k3 * u * v; 
+ realtype A4 = -k3 * u * u; + realtype A5 = k2 * u; + + /* 3rd row: u, v, w */ + realtype A6 = -k2 * w; + realtype A7 = 0.0; + realtype A8 = -k2 * u - k6; + + // + // compute A = I - gamma*J + // + + A0 = 1. - (gamma * A0); + A1 = -gamma * A1; + A2 = -gamma * A2; + A3 = -gamma * A3; + A4 = 1. - (gamma * A4); + A5 = -gamma * A5; + A6 = -gamma * A6; + A7 = -gamma * A7; + A8 = 1. - (gamma * A8); + + // + // compute x = A^{-1}b + // + + realtype scratch_0 = A4*A8; + realtype scratch_1 = A1*A5; + realtype scratch_2 = A2*A7; + realtype scratch_3 = A5*A7; + realtype scratch_4 = A1*A8; + realtype scratch_5 = A2*A4; + realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5); + realtype scratch_7 = A2*A3; + realtype scratch_8 = A6*Bview(i,j,k,0); + realtype scratch_9 = A2*A6; + realtype scratch_10 = A3*Bview(i,j,k,0); + realtype scratch_11 = 1.0/A0; + realtype scratch_12 = A1*scratch_11; + realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4); + + Xview(i,j,k,0) = scratch_6*( Bview(i,j,k,0)*(scratch_0 - scratch_3) + + Bview(i,j,k,1)*(scratch_2 - scratch_4) + + Bview(i,j,k,2)*(scratch_1 - scratch_5)); + Xview(i,j,k,1) = scratch_6*( Bview(i,j,k,2)*(scratch_7 - A0*A5) + + Bview(i,j,k,1)*(A0*A8 - scratch_9) + + A5*scratch_8 - A8*scratch_10 ); + Xview(i,j,k,2) = ( -Bview(i,j,k,2) + scratch_11*scratch_8 + + scratch_13*(Bview(i,j,k,1) - scratch_10*scratch_11)) / + (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7)); + + }); + + return(0); +} + +/* Solve the linear systems Ax = b where A = -dg/dy + gamma. + We are approximating dh/dy as dg/dy. 
*/ +static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b, + realtype gamma, UserData* udata) +{ + /* set variable shortcuts */ + const int dof = udata->grid->dof; + const int nxl = udata->grid->nxl; + const int nyl = udata->grid->nyl; + const int nzl = udata->grid->nzl; + const realtype k2 = udata->k2; + const realtype k3 = udata->k3; + const realtype k4 = udata->k4; + const realtype k6 = udata->k6; + + /* create 4D views of state, RHS and solution vectors */ + RAJA::View> Yview(GetVecData(y), nxl, nyl, nzl, dof); + RAJA::View> Bview(GetVecData(b), nxl, nyl, nzl, dof); + RAJA::View> Xview(GetVecData(x), nxl, nyl, nzl, dof); + + /* solve reaction linear system */ + auto blocks = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), + RAJA::RangeSegment(0, nyl), + RAJA::RangeSegment(0, nzl)); + RAJA::kernel(blocks, + [=] DEVICE_FUNC (int i, int j, int k) { + + /* shortcuts to u, v, w for the block */ + const realtype u = Yview(i,j,k,0); + const realtype v = Yview(i,j,k,1); + const realtype w = Yview(i,j,k,2); + + // + // compute dg/dy + // + + /* 1st row: u, v, w */ + realtype A0 = -k2 * w + 2.0 * k3 * u * v - k4; + realtype A1 = k3 * u * u; + realtype A2 = -k2 * u; + + /* 2nd row: u, v, w */ + realtype A3 = k2 * w - 2.0 * k3 * u * v; + realtype A4 = -k3 * u * u; + realtype A5 = k2 * u; + + /* 3rd row: u, v, w */ + realtype A6 = -k2 * w; + realtype A7 = 0.0; + realtype A8 = -k2 * u - k6; + + // + // compute A = -dg/dy + gamma*diag(df/dydot) + // where diag(df/dydot) is approximated as + // diag([udot, vdot, wdot]) + // + + A0 = -A0 + gamma; + A1 = -A1; + A2 = -A2; + A3 = -A3; + A4 = -A4 + gamma; + A5 = -A5; + A6 = -A6; + A7 = -A7; + A8 = -A8 + gamma; + + // + // compute x = A^{-1}b + // + + realtype scratch_0 = A4*A8; + realtype scratch_1 = A1*A5; + realtype scratch_2 = A2*A7; + realtype scratch_3 = A5*A7; + realtype scratch_4 = A1*A8; + realtype scratch_5 = A2*A4; + realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + 
A6*scratch_1 - A6*scratch_5); + realtype scratch_7 = A2*A3; + realtype scratch_8 = A6*Bview(i,j,k,0); + realtype scratch_9 = A2*A6; + realtype scratch_10 = A3*Bview(i,j,k,0); + realtype scratch_11 = 1.0/A0; + realtype scratch_12 = A1*scratch_11; + realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4); + + Xview(i,j,k,0) = scratch_6*( Bview(i,j,k,0)*(scratch_0 - scratch_3) + + Bview(i,j,k,1)*(scratch_2 - scratch_4) + + Bview(i,j,k,2)*(scratch_1 - scratch_5)); + Xview(i,j,k,1) = scratch_6*( Bview(i,j,k,2)*(scratch_7 - A0*A5) + + Bview(i,j,k,1)*(A0*A8 - scratch_9) + + A5*scratch_8 - A8*scratch_10 ); + Xview(i,j,k,2) = ( -Bview(i,j,k,2) + scratch_11*scratch_8 + + scratch_13*(Bview(i,j,k,1) - scratch_10*scratch_11)) / + (-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7)); + + }); + + return(0); +} + + +/* -------------------------------------------------------------- + * Preconditioner functions + * --------------------------------------------------------------*/ + +/* Solves Pz = r where P = I - gamma * dg/dy */ +static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r, + N_Vector z, realtype gamma, realtype delta, int lr, + void *user_data) +{ + /* local variables */ + UserData* udata = (UserData*) user_data; + int retval; + + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* solve the task-local linear system Pz = r */ + retval = SolveReactionLinSys(y, z, r, gamma, udata); + + return(retval); +} + +/* Solves Pz = r where P = -dg/dy + gamma */ +static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F, + N_Vector r, N_Vector z, realtype cj, realtype delta, + void *user_data) +{ + /* local variables */ + UserData* udata = (UserData*) user_data; + int retval; + + SUNDIALS_CXX_MARK_FUNCTION(udata->prof); + + /* solve the task-local linear system Pz = r */ + retval = SolveReactionLinSysRes(y, z, r, cj, udata); + + return(retval); +} + + +#endif diff --git a/benchmarks/advection_reaction_3D/rhs3D.hpp 
b/benchmarks/advection_reaction_3D/rhs3D.hpp deleted file mode 100644 index 874e5cb8bb..0000000000 --- a/benchmarks/advection_reaction_3D/rhs3D.hpp +++ /dev/null @@ -1,700 +0,0 @@ -/* ----------------------------------------------------------------------------- - * Programmer(s): David J. Gardner, Cody J. Balos @ LLNL - * ----------------------------------------------------------------------------- - * SUNDIALS Copyright Start - * Copyright (c) 2002-2023, Lawrence Livermore National Security - * and Southern Methodist University. - * All rights reserved. - * - * See the top-level LICENSE and NOTICE files for details. - * - * SPDX-License-Identifier: BSD-3-Clause - * SUNDIALS Copyright End - * -----------------------------------------------------------------------------*/ - -#ifndef ADVECTION_REACTION_3D_RHS_HPP -#define ADVECTION_REACTION_3D_RHS_HPP - -#include "advection_reaction_3D.hpp" - -using raja_xyz_tuple = camp::tuple; - -/* -------------------------------------------------------------- - * Right hand side (RHS) and residual functions - * --------------------------------------------------------------*/ - -/* Compute the advection term f(t,y) = -c (grad * y). This is done using - upwind 1st order finite differences. 
*/ -static int Advection(realtype t, N_Vector y, N_Vector ydot, void* user_data) -{ - /* access problem data */ - UserData* udata = (UserData*) user_data; - - SUNDIALS_CXX_MARK_FUNCTION(udata->prof); - - /* set variable shortcuts */ - const int nxl = udata->grid->nxl; - const int nyl = udata->grid->nyl; - const int nzl = udata->grid->nzl; - const int dof = udata->grid->dof; - const realtype c = udata->c; - const realtype cx = -c / udata->grid->dx; - const realtype cy = -c / udata->grid->dy; - const realtype cz = -c / udata->grid->dz; - - /* local variables */ - int retval; - - /* begin exchanging boundary information */ - if (udata->grid->nprocs() > 1) - { - retval = ExchangeAllStart(y, udata); - if (check_retval(&retval, "ExchangeAllStart", 1, udata->myid)) - return(-1); - } - - /* set output to zero */ - N_VConst(0.0, ydot); - - /* create views of the data */ - RAJA::View > Yview(GetVecData(y), - nxl, nyl, nzl, dof); - RAJA::View > dYview(GetVecData(ydot), - nxl, nyl, nzl, dof); - - /* iterate over domain interior, computing advection */ - if (c > 0.0) - { - /* flow moving in the positive x,y,z direction */ - auto range = RAJA::make_tuple(RAJA::RangeSegment(1, nxl), - RAJA::RangeSegment(1, nyl), - RAJA::RangeSegment(1, nzl)); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u_ijk = Yview(i,j,k,0); - const realtype v_ijk = Yview(i,j,k,1); - const realtype w_ijk = Yview(i,j,k,2); - - // grad * u - dYview(i,j,k,0) = cz * (u_ijk - Yview(i,j,k-1,0)); // du/dz - dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j-1,k,0)); // du/dy - dYview(i,j,k,0) += cx * (u_ijk - Yview(i-1,j,k,0)); // du/dx - - // grad * v - dYview(i,j,k,1) = cz * (v_ijk - Yview(i,j,k-1,1)); // dv/dz - dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j-1,k,1)); // dv/dy - dYview(i,j,k,1) += cx * (v_ijk - Yview(i-1,j,k,1)); // dv/dx - - // grad * w - dYview(i,j,k,2) = cz * (w_ijk - Yview(i,j,k-1,2)); // dw/dz - dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j-1,k,2)); // dw/dy - 
dYview(i,j,k,2) += cx * (w_ijk - Yview(i-1,j,k,2)); // dw/dx - }); - } - else if (c < 0.0) - { - /* flow moving in the negative x,y,z direction */ - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1), - RAJA::RangeSegment(0, nyl-1), - RAJA::RangeSegment(0, nzl-1)); - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u_ijk = Yview(i,j,k,0); - const realtype v_ijk = Yview(i,j,k,1); - const realtype w_ijk = Yview(i,j,k,2); - - // grad * u - dYview(i,j,k,0) = cz * (u_ijk - Yview(i,j,k+1,0)); // du/dz - dYview(i,j,k,0) += cy * (u_ijk - Yview(i,j+1,k,0)); // du/dy - dYview(i,j,k,0) += cx * (u_ijk - Yview(i+1,j,k,0)); // du/dx - - // grad * v - dYview(i,j,k,1) = cz * (v_ijk - Yview(i,j,k+1,1)); // dv/dz - dYview(i,j,k,1) += cy * (v_ijk - Yview(i,j+1,k,1)); // dv/dy - dYview(i,j,k,1) += cx * (v_ijk - Yview(i+1,j,k,1)); // dv/dx - - // grad * w - dYview(i,j,k,2) = cz * (w_ijk - Yview(i,j,k+1,2)); // dw/dz - dYview(i,j,k,2) += cy * (w_ijk - Yview(i,j+1,k,2)); // dw/dy - dYview(i,j,k,2) += cx * (w_ijk - Yview(i+1,j,k,2)); // dw/dx - }); - } - - /* finish exchanging boundary information */ - if (udata->grid->nprocs() > 1) - { - retval = ExchangeAllEnd(udata); - if (check_retval(&retval, "ExchangeAllEnd", 1, udata->myid)) - return(-1); - } - - /* compute advection at process boundaries */ - if (c > 0.0) - { - if (udata->grid->npx > 1) - { - /* Flow moving in the positive x,y,z direction: - * boundaries are west face, south face, front face */ - - RAJA::View > - Yim1jk(udata->grid->getRecvBuffer("WEST"), nyl, nzl, dof); // Wrecv should have data that was sent from East - - auto west_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl), - RAJA::RangeSegment(0, nzl), - RAJA::RangeSegment(0, dof)); - - RAJA::kernel(west_face, - [=] DEVICE_FUNC (int j, int k, int l) { - dYview(0,j,k,l) += cx * (Yview(0,j,k,l) - Yim1jk(j,k,l)); // d/dx - }); - } - else - { - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1), - RAJA::RangeSegment(0, 1), - 
RAJA::RangeSegment(0, 1)); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u_ijk = Yview(i,j,k,0); - const realtype v_ijk = Yview(i,j,k,1); - const realtype w_ijk = Yview(i,j,k,2); - - dYview(i,j,k,0) = cx * (u_ijk - Yview(nxl-1,j,k,0)); // du/dx - dYview(i,j,k,1) = cx * (v_ijk - Yview(nxl-1,j,k,1)); // dv/dx - dYview(i,j,k,2) = cx * (w_ijk - Yview(nxl-1,j,k,2)); // dw/dx - }); - - } - - if (udata->grid->npy > 1) - { - RAJA::View > - Yijm1k(udata->grid->getRecvBuffer("SOUTH"), nxl, nzl, dof); // Nrecv should have data that was sent from North - - auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), - RAJA::RangeSegment(0, nzl), - RAJA::RangeSegment(0, dof)); - - RAJA::kernel(south_face, - [=] DEVICE_FUNC (int i, int k, int l) { - dYview(i,0,k,l) += cy * (Yview(i,0,k,l) - Yijm1k(i,k,l)); // d/dy - }); - } - else - { - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1), - RAJA::RangeSegment(0, 1), - RAJA::RangeSegment(0, 1)); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u_ijk = Yview(i,j,k,0); - const realtype v_ijk = Yview(i,j,k,1); - const realtype w_ijk = Yview(i,j,k,2); - - dYview(i,j,k,0) += cy * (u_ijk - Yview(i,nyl-1,k,0)); // du/dy - dYview(i,j,k,1) += cy * (v_ijk - Yview(i,nyl-1,k,1)); // dv/dy - dYview(i,j,k,2) += cy * (w_ijk - Yview(i,nyl-1,k,2)); // dw/dy - }); - } - - if (udata->grid->npz > 1) - { - RAJA::View > - Yijkm1(udata->grid->getRecvBuffer("FRONT"), nxl, nyl, dof); // Frecv should have data that was sent from Back - - auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl), - RAJA::RangeSegment(0, nyl), - RAJA::RangeSegment(0, dof)); - - RAJA::kernel(front_face, - [=] DEVICE_FUNC (int i, int j, int l) { - dYview(i,j,0,l) += cz * (Yview(i,j,0,l) - Yijkm1(i,j,l)); // d/dz - }); - - } - else - { - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, 1), - RAJA::RangeSegment(0, 1), - RAJA::RangeSegment(0, 1)); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, 
int j, int k) { - const realtype u_ijk = Yview(i,j,k,0); - const realtype v_ijk = Yview(i,j,k,1); - const realtype w_ijk = Yview(i,j,k,2); - - dYview(i,j,k,0) += cz * (u_ijk - Yview(i,j,nzl-1,0)); // du/dz - dYview(i,j,k,1) += cz * (v_ijk - Yview(i,j,nzl-1,1)); // dv/dz - dYview(i,j,k,2) += cz * (w_ijk - Yview(i,j,nzl-1,2)); // dw/dz - }); - } - } - else if (c < 0.0) - { - if (udata->grid->nprocs() != 1) - { - /* Flow moving in the negative x,y,z direction: - * boundaries are west face, south face, and front face */ - - RAJA::View > - Yip1jk(udata->grid->getRecvBuffer("EAST"), nyl, nzl, dof); - RAJA::View > - Yijp1k(udata->grid->getRecvBuffer("NORTH"), nxl, nzl, dof); - RAJA::View > - Yijkp1(udata->grid->getRecvBuffer("BACK"), nxl, nyl, dof); - - auto front_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1), - RAJA::RangeSegment(0, nyl-1), - RAJA::RangeSegment(0, dof)); - RAJA::kernel(front_face, - [=] DEVICE_FUNC (int i, int j, int l) { - dYview(i,j,0,l) = cz * (Yview(i,j,0,l) - Yijkp1(i,nzl+1,l)); // d/dz - dYview(i,j,0,l) += cy * (Yview(i,j,0,l) - Yijp1k(0,j+1,l)); // d/dy - dYview(i,j,0,l) += cx * (Yview(i,j,0,l) - Yip1jk(i+1,0,l)); // d/dx - }); - - auto south_face = RAJA::make_tuple(RAJA::RangeSegment(0, nxl-1), - RAJA::RangeSegment(0, nzl-1), - RAJA::RangeSegment(0, dof)); - RAJA::kernel(south_face, - [=] DEVICE_FUNC (int i, int k, int l) { - dYview(i,0,k,l) = cz * (Yview(i,0,k,l) - Yijkp1(i,k+1,l)); // d/dz - dYview(i,0,k,l) += cy * (Yview(i,0,k,l) - Yijp1k(0,nyl+1,l)); // d/dy - dYview(i,0,k,l) += cx * (Yview(i,0,k,l) - Yip1jk(i+1,0,l)); // d/dx - }); - - auto east_face = RAJA::make_tuple(RAJA::RangeSegment(0, nyl-1), - RAJA::RangeSegment(0, nzl-1), - RAJA::RangeSegment(0, dof)); - RAJA::kernel(east_face, - [=] DEVICE_FUNC (int j, int k, int l) { - dYview(0,j,k,l) = cz * (Yview(0,j,k,l) - Yijkp1(0,k+1,l)); // d/dz - dYview(0,j,k,l) += cy * (Yview(0,j,k,l) - Yijp1k(0,j+1,l)); // d/dy - dYview(0,j,k,l) += cx * (Yview(0,j,k,l) - Yip1jk(nxl+1,0,l)); // d/dx 
- }); - } - else - { - auto range = RAJA::make_tuple(RAJA::RangeSegment(nxl-2, nxl), - RAJA::RangeSegment(nyl-2, nyl), - RAJA::RangeSegment(nzl-2, nzl)); - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u_ijk = Yview(i,j,k,0); - const realtype v_ijk = Yview(i,j,k,1); - const realtype w_ijk = Yview(i,j,k,2); - - // grad * u - dYview(i,j,k,0) = cz * (u_ijk - Yview(i,j,0,0)); // du/dz - dYview(i,j,k,0) += cy * (u_ijk - Yview(i,0,k,0)); // du/dy - dYview(i,j,k,0) += cx * (u_ijk - Yview(0,j,k,0)); // du/dx - - // grad * v - dYview(i,j,k,1) = cz * (v_ijk - Yview(i,j,0,1)); // dv/dz - dYview(i,j,k,1) += cy * (v_ijk - Yview(i,0,k,1)); // dv/dy - dYview(i,j,k,1) += cx * (v_ijk - Yview(0,j,k,1)); // dv/dx - - // grad * w - dYview(i,j,k,2) = cz * (w_ijk - Yview(i,j,0,2)); // dw/dz - dYview(i,j,k,2) += cy * (w_ijk - Yview(i,0,k,2)); // dw/dy - dYview(i,j,k,2) += cx * (w_ijk - Yview(0,j,k,2)); // dw/dx - }); - } - } - - /* return success */ - return(0); -} - - -/* Compute the reaction term g(t,y). 
*/ -static int Reaction(realtype t, N_Vector y, N_Vector ydot, void* user_data) -{ - /* access problem data */ - UserData* udata = (UserData*) user_data; - - SUNDIALS_CXX_MARK_FUNCTION(udata->prof); - - /* set variable shortcuts */ - const realtype A = udata->A; - const realtype B = udata->B; - const realtype k1 = udata->k1; - const realtype k2 = udata->k2; - const realtype k3 = udata->k3; - const realtype k4 = udata->k4; - const realtype k5 = udata->k5; - const realtype k6 = udata->k6; - - /* local variables */ - realtype* Ydata = NULL; - realtype* dYdata = NULL; - - /* access data arrays */ - Ydata = GetVecData(y); - if (check_retval((void *)Ydata, "GetVecData", 0, udata->myid)) - return(-1); - - dYdata = GetVecData(ydot); - if (check_retval((void *)dYdata, "GetVecData", 0, udata->myid)) - return(-1); - - RAJA::View > Yview(GetVecData(y), - udata->grid->nxl, - udata->grid->nyl, - udata->grid->nzl, - udata->grid->dof); - - RAJA::View > dYview(GetVecData(ydot), - udata->grid->nxl, - udata->grid->nyl, - udata->grid->nzl, - udata->grid->dof); - - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl), - RAJA::RangeSegment(0, udata->grid->nyl), - RAJA::RangeSegment(0, udata->grid->nzl)); - - /* iterate over domain, computing reactions */ - if (udata->add_reactions) - { - /* when we are not additively splitting the rhs, we add to ydot - as we expect it to hold the advection term already */ - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u = Yview(i,j,k,0); - const realtype v = Yview(i,j,k,1); - const realtype w = Yview(i,j,k,2); - dYview(i,j,k,0) += k1 * A - k2 * w * u + k3 * u * u * v - k4 * u; - dYview(i,j,k,1) += k2 * w * u - k3 * u * u * v; - dYview(i,j,k,2) += -k2 * w * u + k5 * B - k6 * w; - }); - } - else - { - /* set output to zero */ - N_VConst(0.0, ydot); - - RAJA::kernel(range, - [=] DEVICE_FUNC (int i, int j, int k) { - const realtype u = Yview(i,j,k,0); - const realtype v = Yview(i,j,k,1); - const realtype w 
= Yview(i,j,k,2); - dYview(i,j,k,0) = k1 * A - k2 * w * u + k3 * u * u * v - k4 * u; - dYview(i,j,k,1) = k2 * w * u - k3 * u * u * v; - dYview(i,j,k,2) = -k2 * w * u + k5 * B - k6 * w; - }); - } - - /* return success */ - return(0); -} - - -/* Compute the RHS as h(t,y) = f(t,y) + g(t,y). */ -static int AdvectionReaction(realtype t, N_Vector y, N_Vector ydot, - void *user_data) -{ - /* access problem data */ - UserData* udata = (UserData*) user_data; - int retval; - - /* NOTE: The order in which Advection and Reaction are - called is critical here. Advection must be - computed first. */ - retval = Advection(t, y, ydot, user_data); - if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1); - - retval = Reaction(t, y, ydot, user_data); - if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1); - - /* return success */ - return(0); -} - -/* Compute the residual F(t,y,y') = ydot - h(t,y) = 0. */ -static int AdvectionReactionResidual(realtype t, N_Vector y, N_Vector ydot, - N_Vector F, void *user_data) -{ - /* access problem data */ - UserData* udata = (UserData*) user_data; - int retval; - - /* NOTE: The order in which Advection and Reaction are - called is critical here. Advection must be - computed first. */ - retval = Advection(t, y, F, user_data); /* F = -c y_x */ - if (check_retval((void *)&retval, "Advection", 1, udata->myid)) return(-1); - - retval = Reaction(t, y, F, user_data); /* F = -c y_x + g(t,y) */ - if (check_retval((void *)&retval, "Reaction", 1, udata->myid)) return(-1); - - /* F = ydot - h(t,y) = ydot + c y_x - g(t,y) */ - N_VLinearSum(1.0, ydot, -1.0, F, F); - - /* return success */ - return(0); -} - -/* -------------------------------------------------------------- - * Linear system and Jacobian functions - * --------------------------------------------------------------*/ - -/* Solve the linear systems Ax = b where A = I - gamma*dg/dy. - When using a fully implicit method, we are approximating - dh/dy as dg/dy. 
*/ -static int SolveReactionLinSys(N_Vector y, N_Vector x, N_Vector b, - realtype gamma, raja_xyz_tuple blocks, - UserData* udata) -{ - /* shortcuts */ - int dof, nxl, nyl, nzl; - realtype k2, k3, k4, k6; - - /* set shortcuts */ - dof = udata->grid->dof; - nxl = udata->grid->nxl; - nyl = udata->grid->nyl; - nzl = udata->grid->nzl; - k2 = udata->k2; - k3 = udata->k3; - k4 = udata->k4; - k6 = udata->k6; - - /* create views of the data */ - RAJA::View > Yview(GetVecData(y), - nxl, nyl, nzl, dof); - RAJA::View > Bview(GetVecData(b), - nxl, nyl, nzl, dof); - RAJA::View > Xview(GetVecData(x), - nxl, nyl, nzl, dof); - - RAJA::kernel(blocks, - [=] DEVICE_FUNC (int i, int j, int k) { - - /* and the corresponding vectors */ - realtype *b = &(Bview(i,j,k,0)); - realtype *x = &(Xview(i,j,k,0)); - - /* shortcuts to u, v, w for the block */ - realtype u = Yview(i,j,k,0); - realtype v = Yview(i,j,k,1); - realtype w = Yview(i,j,k,2); - - realtype A0, A1, A2, A3, A4, A5, A6, A7, A8; - - // - // compute J = dg/dy - // - - /* 1st row: u, v, w */ - A0 = -k2 * w + 2.0 * k3 * u * v - k4; - A1 = k3 * u * u; - A2 = -k2 * u; - - /* 2nd row: u, v, w */ - A3 = k2 * w - 2.0 * k3 * u * v; - A4 = -k3 * u * u; - A5 = k2 * u; - - /* 3rd row: u, v, w */ - A6 = -k2 * w; - A7 = 0.0; - A8 = -k2 * u - k6; - - // - // compute A = I - gamma*J - // - - A0 = 1. - (gamma * A0); - A1 = -gamma * A1; - A2 = -gamma * A2; - A3 = -gamma * A3; - A4 = 1. - (gamma * A4); - A5 = -gamma * A5; - A6 = -gamma * A6; - A7 = -gamma * A7; - A8 = 1. 
- (gamma * A8); - - // - // compute x = A^{-1}b - // - - realtype scratch_0 = A4*A8; - realtype scratch_1 = A1*A5; - realtype scratch_2 = A2*A7; - realtype scratch_3 = A5*A7; - realtype scratch_4 = A1*A8; - realtype scratch_5 = A2*A4; - realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5); - realtype scratch_7 = A2*A3; - realtype scratch_8 = A6*b[0]; - realtype scratch_9 = A2*A6; - realtype scratch_10 = A3*b[0]; - realtype scratch_11 = 1.0/A0; - realtype scratch_12 = A1*scratch_11; - realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4); - - x[0] = scratch_6*(b[0]*scratch_0 - b[0]*scratch_3 + b[1]*scratch_2 - b[1]*scratch_4 + b[2]*scratch_1 - b[2]*scratch_5); - x[1] = scratch_6*(-A0*A5*b[2] + A0*A8*b[1] + A5*scratch_8 - A8*scratch_10 - b[1]*scratch_9 + b[2]*scratch_7); - x[2] = (-b[2] + scratch_11*scratch_8 + scratch_13*(b[1] - scratch_10*scratch_11))/(-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7)); - }); - - return(0); -} - -/* Solve the linear systems Ax = b where A = -dg/dy + gamma. - We are approximating dh/dy as dg/dy. 
*/ -static int SolveReactionLinSysRes(N_Vector y, N_Vector x, N_Vector b, - realtype gamma, raja_xyz_tuple blocks, - UserData* udata) -{ - /* shortcuts */ - int dof, nxl, nyl, nzl; - realtype k2, k3, k4, k6; - - /* set shortcuts */ - dof = udata->grid->dof; - nxl = udata->grid->nxl; - nyl = udata->grid->nyl; - nzl = udata->grid->nzl; - k2 = udata->k2; - k3 = udata->k3; - k4 = udata->k4; - k6 = udata->k6; - - /* create views of the data */ - RAJA::View > Yview(GetVecData(y), - nxl, nyl, nzl, dof); - RAJA::View > Bview(GetVecData(b), - nxl, nyl, nzl, dof); - RAJA::View > Xview(GetVecData(x), - nxl, nyl, nzl, dof); - - RAJA::kernel(blocks, - [=] DEVICE_FUNC (int i, int j, int k) { - - /* and the corresponding vectors */ - realtype *b = &(Bview(i,j,k,0)); - realtype *x = &(Xview(i,j,k,0)); - - /* shortcuts to u, v, w for the block */ - realtype u = Yview(i,j,k,0); - realtype v = Yview(i,j,k,1); - realtype w = Yview(i,j,k,2); - - realtype A0, A1, A2, A3, A4, A5, A6, A7, A8; - - // - // compute dg/dy - // - - /* 1st row: u, v, w */ - A0 = -k2 * w + 2.0 * k3 * u * v - k4; - A1 = k3 * u * u; - A2 = -k2 * u; - - /* 2nd row: u, v, w */ - A3 = k2 * w - 2.0 * k3 * u * v; - A4 = -k3 * u * u; - A5 = k2 * u; - - /* 3rd row: u, v, w */ - A6 = -k2 * w; - A7 = 0.0; - A8 = -k2 * u - k6; - - // - // compute A = -dg/dy + gamma*diag(df/dydot) - // where diag(df/dydot) is approximated as - // diag([udot, vdot, wdot]) - // - - A0 = -A0 + gamma; - A1 = -A1; - A2 = -A2; - A3 = -A3; - A4 = -A4 + gamma; - A5 = -A5; - A6 = -A6; - A7 = -A7; - A8 = -A8 + gamma; - - // - // compute x = A^{-1}b - // - - realtype scratch_0 = A4*A8; - realtype scratch_1 = A1*A5; - realtype scratch_2 = A2*A7; - realtype scratch_3 = A5*A7; - realtype scratch_4 = A1*A8; - realtype scratch_5 = A2*A4; - realtype scratch_6 = 1.0/(A0*scratch_0 - A0*scratch_3 + A3*scratch_2 - A3*scratch_4 + A6*scratch_1 - A6*scratch_5); - realtype scratch_7 = A2*A3; - realtype scratch_8 = A6*b[0]; - realtype scratch_9 = A2*A6; - realtype 
scratch_10 = A3*b[0]; - realtype scratch_11 = 1.0/A0; - realtype scratch_12 = A1*scratch_11; - realtype scratch_13 = (-A6*scratch_12 + A7)/(-A3*scratch_12 + A4); - - x[0] = scratch_6*(b[0]*scratch_0 - b[0]*scratch_3 + b[1]*scratch_2 - b[1]*scratch_4 + b[2]*scratch_1 - b[2]*scratch_5); - x[1] = scratch_6*(-A0*A5*b[2] + A0*A8*b[1] + A5*scratch_8 - A8*scratch_10 - b[1]*scratch_9 + b[2]*scratch_7); - x[2] = (-b[2] + scratch_11*scratch_8 + scratch_13*(b[1] - scratch_10*scratch_11))/(-A8 + scratch_11*scratch_9 + scratch_13*(A5 - scratch_11*scratch_7)); - }); - - return(0); -} - - -/* -------------------------------------------------------------- - * Preconditioner functions - * --------------------------------------------------------------*/ - -/* Solves Pz = r where P = I - gamma * dg/dy */ -static int PSolve(realtype t, N_Vector y, N_Vector ydot, N_Vector r, - N_Vector z, realtype gamma, realtype delta, int lr, - - void *user_data) -{ - /* local variables */ - UserData* udata = (UserData*) user_data; - int retval; - - SUNDIALS_CXX_MARK_FUNCTION(udata->prof); - - /* solve the task-local linear system Pz = r */ - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl), - RAJA::RangeSegment(0, udata->grid->nyl), - RAJA::RangeSegment(0, udata->grid->nzl)); - retval = SolveReactionLinSys(y, z, r, gamma, range, udata); - - return(retval); -} - -/* Solves Pz = r where P = -dg/dy + gamma */ -static int PSolveRes(realtype t, N_Vector y, N_Vector ydot, N_Vector F, - N_Vector r, N_Vector z, realtype cj, realtype delta, - void *user_data) -{ - /* local variables */ - UserData* udata = (UserData*) user_data; - int retval; - - SUNDIALS_CXX_MARK_FUNCTION(udata->prof); - - /* solve the task-local linear system Pz = r */ - auto range = RAJA::make_tuple(RAJA::RangeSegment(0, udata->grid->nxl), - RAJA::RangeSegment(0, udata->grid->nyl), - RAJA::RangeSegment(0, udata->grid->nzl)); - retval = SolveReactionLinSysRes(y, z, r, cj, range, udata); - - return(retval); -} - - 
-#endif diff --git a/benchmarks/advection_reaction_3D/scripts/make_plots.py b/benchmarks/advection_reaction_3D/scripts/make_plots.py new file mode 100755 index 0000000000..7728562510 --- /dev/null +++ b/benchmarks/advection_reaction_3D/scripts/make_plots.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------------ +# Programmer(s): Daniel R. Reynolds @ SMU +# ------------------------------------------------------------------------------ +# SUNDIALS Copyright Start +# Copyright (c) 2002-2023, Lawrence Livermore National Security +# and Southern Methodist University. +# All rights reserved. +# +# See the top-level LICENSE and NOTICE files for details. +# +# SPDX-License-Identifier: BSD-3-Clause +# SUNDIALS Copyright End +# ------------------------------------------------------------------------------ +# matplotlib-based plotting script for the advection_reaction_3D benchmark codes +# ------------------------------------------------------------------------------ + +# imports +from os.path import exists +import numpy as np +import matplotlib.pyplot as plt + +# ------------------------------------------------------------------------------ + +# utility functions +def parallel_coords(rank): + if (rank == 0): + return [0, 0, 0] + if (rank == 1): + return [0, 0, 1] + if (rank == 2): + return [0, 1, 0] + if (rank == 3): + return [0, 1, 1] + if (rank == 4): + return [1, 0, 0] + if (rank == 5): + return [1, 0, 1] + if (rank == 6): + return [1, 1, 0] + if (rank == 7): + return [1, 1, 1] + +def xslice(u,it,ix): + return u[it,ix,:,:] + +def yslice(u,it,iy): + return u[it,:,iy,:] + +def zslice(u,it,iz): + return u[it,:,:,iz] + +def xproj(u,it): + return np.average(u[it,:,:,:], axis=0) + +def yproj(u,it): + return np.average(u[it,:,:,:], axis=1) + +def zproj(u,it): + return np.average(u[it,:,:,:], axis=2) + +def myplot(axis, X, Y, Z, xlabel='none', ylabel='none'): + frame = axis.contourf(X, Y, Z) + plt.colorbar(frame, 
ax=axis) + if (xlabel != 'none'): + axis.set_xlabel(xlabel) + if (ylabel != 'none'): + axis.set_ylabel(ylabel) + + + +# read time mesh +times = np.loadtxt("t.000000.txt") +nt = times.size + +# read spatial mesh +mesh = np.loadtxt("mesh.txt", dtype=float) +x = mesh[0,:] +y = mesh[1,:] +z = mesh[2,:] +nx = x.size +ny = y.size +nz = z.size + +# ensure that the run used exactly 1 or 8 MPI ranks +for i in range(9): + if (exists("u.00000" + str(i) + ".txt" ) and + not exists("u.00000" + str(i+1) + ".txt" )): + nprocs = i+1 +if ((nprocs != 1) and (nprocs != 8)): + print("make_plots.py error: run must have used either 1 or 8 MPI ranks") + exit() + +# load data for run +if (nprocs == 1): + u = np.zeros((nt,nx,ny,nz), dtype=float) + v = np.zeros((nt,nx,ny,nz), dtype=float) + w = np.zeros((nt,nx,ny,nz), dtype=float) + udata = np.loadtxt("u.000000.txt") + vdata = np.loadtxt("v.000000.txt") + wdata = np.loadtxt("w.000000.txt") + if (nt != udata.shape[0]): + print("make_plots.py error: mesh and data have incompatible sizes") + exit() + if (nx*ny*nz != udata.shape[1]): + print("make_plots.py error: mesh and data have incompatible sizes") + exit() + for it in range(nt): + u[it,:,:,:] = np.reshape(udata[it,:], (nx,ny,nz), order='C') + v[it,:,:,:] = np.reshape(vdata[it,:], (nx,ny,nz), order='C') + w[it,:,:,:] = np.reshape(wdata[it,:], (nx,ny,nz), order='C') +else: + u = np.zeros((nt,nx,ny,nz), dtype=float) + v = np.zeros((nt,nx,ny,nz), dtype=float) + w = np.zeros((nt,nx,ny,nz), dtype=float) + nxl = nx//2 + nyl = ny//2 + nzl = nz//2 + for ip in range(8): + udata = np.loadtxt("u.00000" + str(ip) + ".txt") + vdata = np.loadtxt("v.00000" + str(ip) + ".txt") + wdata = np.loadtxt("w.00000" + str(ip) + ".txt") + if (nt != udata.shape[0]): + print("make_plots.py error: mesh and data have incompatible sizes") + exit() + if (nxl*nyl*nzl != udata.shape[1]): + print("make_plots.py error: mesh and data have incompatible sizes") + exit() + coords = parallel_coords(ip) + ilo = coords[0]*nxl + ihi 
= (coords[0]+1)*nxl + jlo = coords[1]*nyl + jhi = (coords[1]+1)*nyl + klo = coords[2]*nzl + khi = (coords[2]+1)*nzl + for it in range(nt): + u[it,ilo:ihi,jlo:jhi,klo:khi] = np.reshape(udata[it,:], (nxl,nyl,nzl), order='C') + v[it,ilo:ihi,jlo:jhi,klo:khi] = np.reshape(vdata[it,:], (nxl,nyl,nzl), order='C') + w[it,ilo:ihi,jlo:jhi,klo:khi] = np.reshape(wdata[it,:], (nxl,nyl,nzl), order='C') + + +# set meshgrid objects +xy0,xy1 = np.meshgrid(x, y) +yz0,yz1 = np.meshgrid(y, z) +xz0,xz1 = np.meshgrid(x, z) + +# generate plots +sliceidx = 25 +tslice = [0, 5, 10] +figsize = (9,7) + +# xy slices at various times +plt.figure(1) +fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize) +myplot(ax1, xy0, xy1, zslice(u,tslice[0],sliceidx), ylabel = 'u') +myplot(ax2, xy0, xy1, zslice(u,tslice[1],sliceidx)) +myplot(ax3, xy0, xy1, zslice(u,tslice[2],sliceidx)) +myplot(ax4, xy0, xy1, zslice(v,tslice[0],sliceidx), ylabel = 'v') +myplot(ax5, xy0, xy1, zslice(v,tslice[1],sliceidx)) +myplot(ax6, xy0, xy1, zslice(v,tslice[2],sliceidx)) +myplot(ax7, xy0, xy1, zslice(w,tslice[0],sliceidx), ylabel = 'w', xlabel = 't = ' + str(times[0])) +myplot(ax8, xy0, xy1, zslice(w,tslice[1],sliceidx), xlabel = 't = ' + str(times[1])) +myplot(ax9, xy0, xy1, zslice(w,tslice[2],sliceidx), xlabel = 't = ' + str(times[2])) +plt.savefig('xy-slices.png') + +# yz slices at various times +plt.figure(2) +fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize) +myplot(ax1, yz0, yz1, xslice(u,tslice[0],sliceidx), ylabel = 'u') +myplot(ax2, yz0, yz1, xslice(u,tslice[1],sliceidx)) +myplot(ax3, yz0, yz1, xslice(u,tslice[2],sliceidx)) +myplot(ax4, yz0, yz1, xslice(v,tslice[0],sliceidx), ylabel = 'v') +myplot(ax5, yz0, yz1, xslice(v,tslice[1],sliceidx)) +myplot(ax6, yz0, yz1, xslice(v,tslice[2],sliceidx)) +myplot(ax7, yz0, yz1, xslice(w,tslice[0],sliceidx), ylabel = 'w', xlabel = 't = ' + str(times[0])) 
+myplot(ax8, yz0, yz1, xslice(w,tslice[1],sliceidx), xlabel = 't = ' + str(times[1])) +myplot(ax9, yz0, yz1, xslice(w,tslice[2],sliceidx), xlabel = 't = ' + str(times[2])) +plt.savefig('yz-slices.png') + +# xz slices at various times +plt.figure(3) +fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize) +myplot(ax1, xz0, xz1, yslice(u,tslice[0],sliceidx), ylabel ='u') +myplot(ax2, xz0, xz1, yslice(u,tslice[1],sliceidx)) +myplot(ax3, xz0, xz1, yslice(u,tslice[2],sliceidx)) +myplot(ax4, xz0, xz1, yslice(v,tslice[0],sliceidx), ylabel = 'v') +myplot(ax5, xz0, xz1, yslice(v,tslice[1],sliceidx)) +myplot(ax6, xz0, xz1, yslice(v,tslice[2],sliceidx)) +myplot(ax7, xz0, xz1, yslice(w,tslice[0],sliceidx), ylabel= 'w', xlabel = 't = ' + str(times[0])) +myplot(ax8, xz0, xz1, yslice(w,tslice[1],sliceidx), xlabel ='t = ' + str(times[1])) +myplot(ax9, xz0, xz1, yslice(w,tslice[2],sliceidx), xlabel = 't = ' + str(times[2])) +plt.savefig('xz-slices.png') + +# xy projection at various times +plt.figure(4) +fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize) +myplot(ax1, xy0, xy1, zproj(u,tslice[0]), ylabel = 'u') +myplot(ax2, xy0, xy1, zproj(u,tslice[1])) +myplot(ax3, xy0, xy1, zproj(u,tslice[2])) +myplot(ax4, xy0, xy1, zproj(v,tslice[0]), ylabel = 'v') +myplot(ax5, xy0, xy1, zproj(v,tslice[1])) +myplot(ax6, xy0, xy1, zproj(v,tslice[2])) +myplot(ax7, xy0, xy1, zproj(w,tslice[0]), ylabel = 'w', xlabel = 't = ' + str(times[0])) +myplot(ax8, xy0, xy1, zproj(w,tslice[1]), xlabel = 't = ' + str(times[1])) +myplot(ax9, xy0, xy1, zproj(w,tslice[2]), xlabel = 't = ' + str(times[2])) +plt.savefig('xy-projections.png') + +# yz projection at various times +fig = plt.figure(5) +fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize) +myplot(ax1, yz0, yz1, xproj(u,tslice[0]), ylabel = 'u') +myplot(ax2, yz0, yz1, 
xproj(u,tslice[1])) +myplot(ax3, yz0, yz1, xproj(u,tslice[2])) +myplot(ax4, yz0, yz1, xproj(v,tslice[0]), ylabel = 'v') +myplot(ax5, yz0, yz1, xproj(v,tslice[1])) +myplot(ax6, yz0, yz1, xproj(v,tslice[2])) +myplot(ax7, yz0, yz1, xproj(w,tslice[0]), ylabel = 'w', xlabel = 't = ' + str(times[0])) +myplot(ax8, yz0, yz1, xproj(w,tslice[1]), xlabel = 't = ' + str(times[1])) +myplot(ax9, yz0, yz1, xproj(w,tslice[2]), xlabel = 't = ' + str(times[2])) +plt.savefig('yz-projections.png') + +# xz projection at various times +fig = plt.figure(6) +fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True, figsize=figsize) +myplot(ax1, xz0, xz1, yproj(u,tslice[0]), ylabel = 'u') +myplot(ax2, xz0, xz1, yproj(u,tslice[1])) +myplot(ax3, xz0, xz1, yproj(u,tslice[2])) +myplot(ax4, xz0, xz1, yproj(v,tslice[0]), ylabel = 'v') +myplot(ax5, xz0, xz1, yproj(v,tslice[1])) +myplot(ax6, xz0, xz1, yproj(v,tslice[2])) +myplot(ax7, xz0, xz1, yproj(w,tslice[0]), ylabel = 'w', xlabel = 't = ' + str(times[0])) +myplot(ax8, xz0, xz1, yproj(w,tslice[1]), xlabel = 't = ' + str(times[1])) +myplot(ax9, xz0, xz1, yproj(w,tslice[2]), xlabel = 't = ' + str(times[2])) +plt.savefig('xz-projections.png') + +#plt.show() +plt.close() + +##### end of script ##### diff --git a/benchmarks/diffusion_2D/README.md b/benchmarks/diffusion_2D/README.md index d60869a8d8..453879953f 100644 --- a/benchmarks/diffusion_2D/README.md +++ b/benchmarks/diffusion_2D/README.md @@ -8,24 +8,26 @@ required. This code simulates the anisotropic 2D heat equation, -$$\frac{\partial u}{\partial t} = k_x \frac{\partial^2 u}{\partial x^2} + k_y \frac{\partial^2 u}{\partial y^2} + b,$$ +$$\frac{\partial u}{\partial t} = \nabla \cdot (D \nabla u) + b(t, \mathbf{x})$$ -where $k_x$ and $k_y$ are the diffusion coefficients. The system is evolved for -$t$ in $[0, t_f]$ and $(x,y) = X$ in $[0, X_{max}]^2$ with the initial condition +where $D$ is a diagonal matrix with entries $k_x$ and $k_y$. 
The system is +evolved for $t \in [0, t_f]$ on the rectangular domain +$(x,y) \equiv \mathbf{x} \in [\mathbf{0}, \mathbf{x}_{\text{max}}]^2$, with the +initial condition -$$u(0,X) = \sin^2(\pi x) \sin^2(\pi y),$$ +$$u(0,\mathbf{x}) = \sin^2(\pi x) \sin^2(\pi y),$$ and stationary boundary conditions -$$\frac{\partial u}{\partial t}(t,0,y) = \frac{\partial u}{\partial t}(t,x_{max},y) = \frac{\partial u}{\partial t}(t,x,0) = \frac{\partial u}{\partial t}(t,x,y_{max}) = 0.$$ +$$\frac{\partial u}{\partial t}(t,0,y) = \frac{\partial u}{\partial t}(t,x_{\text{max}},y) = \frac{\partial u}{\partial t}(t,x,0) = \frac{\partial u}{\partial t}(t,x,y_{\text{max}}) = 0.$$ The source term is given by -$$b(t,X) = -2 \pi \sin^2(\pi x) \sin^2(\pi y) \sin(\pi t) \cos(\pi t) - k_x 2 \pi^2 (\cos^2(\pi x) - \sin^2(\pi x)) \sin^2(\pi y) \cos^2(\pi t) - k_y 2 \pi^2 (\cos^2(\pi y) - \sin^2(\pi y)) \sin^2(\pi x) \cos^2(\pi t).$$ +$$b(t,\mathbf{x}) = -2 \pi \sin^2(\pi x) \sin^2(\pi y) \sin(\pi t) \cos(\pi t) - k_x 2 \pi^2 (\cos^2(\pi x) - \sin^2(\pi x)) \sin^2(\pi y) \cos^2(\pi t) - k_y 2 \pi^2 (\cos^2(\pi y) - \sin^2(\pi y)) \sin^2(\pi x) \cos^2(\pi t).$$ Under this setup, the problem has the analytical solution -$$u(t,X) = \sin^2(\pi x) \sin^2(\pi y) \cos^2(\pi t).$$ +$$u(t,\mathbf{x}) = \sin^2(\pi x) \sin^2(\pi y) \cos^2(\pi t).$$ Spatial derivatives are computed using second-order centered differences on a uniform spatial grid. The problem can be evolved in time with ARKODE, CVODE, or @@ -33,11 +35,12 @@ IDA. With ARKODE, an adaptive step diagonally implicit Runge-Kutta (DIRK) method is applied. When using CVODE or IDA, adaptive order and step BDF methods are used. -In all cases, the nonlinear system(s) in each time step are solved using an -inexact Newton method paired with a matrix-free PCG or GMRES linear solver and a -Jacobi preconditioner. 
If SUNDIALS is built with the SuperLU_DIST interface enabled -a modified Newton method with SuperLU_DIST as the direct linear solver may also be -selected at run time. +By default, the nonlinear system(s) in each time step are solved using an +inexact Newton method paired with a matrix-free CG linear solver and a Jacobi +preconditioner. A matrix-free GMRES linear solver may be selected at run time. +If SUNDIALS is built with the SuperLU_DIST interface enabled a modified Newton +method with SuperLU_DIST as the direct linear solver may also be selected at run +time. ## Options @@ -53,10 +56,10 @@ listed below. | `--npy ` | Number of MPI tasks in the y-direction (0 forces MPI to decide) | 0 | | `--nx ` | Number of mesh points in the x-direction | 32 | | `--ny ` | Number of mesh points in the y-direction | 32 | -| `--ux ` | The domain upper bound in the x-direction `x_max` | 1.0 | -| `--uy ` | The domain upper bound in the y-direction `y_max` | 1.0 | -| `--kx ` | Diffusion coefficient in the x-direction `kx` | 1.0 | -| `--ky ` | Diffusion coefficient in the y-direction `ky` | 1.0 | +| `--xu ` | The domain upper bound in the x-direction $x_{\text{max}}$ | 1.0 | +| `--yu ` | The domain upper bound in the y-direction $y_{\text{max}}$ | 1.0 | +| `--kx ` | Diffusion coefficient in the x-direction $k_x$ | 1.0 | +| `--ky ` | Diffusion coefficient in the y-direction $k_y$ | 1.0 | | `--tf ` | The final time `tf` | 1.0 | | `--noforcing` | Disable the forcing term | Enabled | | Output Options | | | @@ -94,9 +97,11 @@ Based on the configuration, executables for each integrator and backend option are built and installed in `/diffusion_2D`. The executables follow the naming convention `_diffusion_2D_` where `` is `arkode`, `cvode`, or `ida` and `` is `mpi` for -MPI only parallelism, `mpicuda` for MPI + CUDA, and `mpihip` for MPI + HIP. 
Note -when using the SuperLU_DIST linear solver computations will be offloaded to the -GPU in the MPI only executables if CUDA or ROCM support is enabled in SuperLU_DIST. +MPI only parallelism, `mpicuda` for MPI + CUDA, and `mpihip` for MPI + HIP. + +**Note:** When using the SuperLU_DIST linear solver computations will be +offloaded to the GPU in the MPI only executables if CUDA or ROCM support is +enabled in SuperLU_DIST. On Summit, with the default environment ``` diff --git a/cmake/SundialsBuildOptionsPre.cmake b/cmake/SundialsBuildOptionsPre.cmake index b1532d2ce3..a28b3485ea 100644 --- a/cmake/SundialsBuildOptionsPre.cmake +++ b/cmake/SundialsBuildOptionsPre.cmake @@ -294,3 +294,5 @@ sundials_option(SUNDIALS_TEST_DEVTESTS BOOL # Include unit tests in regression tests sundials_option(SUNDIALS_TEST_UNITTESTS BOOL "Include unit tests in make test" OFF ADVANCED) + +sundials_option(SUNDIALS_TEST_MPIRUN_COMMAND STRING "Job scheduler or mpirun command used to launch SUNDIALS MPI tests." "" ADVANCED) diff --git a/cmake/SundialsTPLOptions.cmake b/cmake/SundialsTPLOptions.cmake index f01a0ac14d..11e39d0f99 100644 --- a/cmake/SundialsTPLOptions.cmake +++ b/cmake/SundialsTPLOptions.cmake @@ -61,6 +61,11 @@ sundials_option(ENABLE_HIP BOOL "Enable HIP support" OFF) # ------------------------------------------------------------- sundials_option(ENABLE_SYCL BOOL "Enable SYCL support" OFF) +sundials_option(SUNDIALS_SYCL_2020_UNSUPPORTED BOOL + "Disable the use of some SYCL 2020 features in SUNDIALS libraries and examples" OFF + DEPENDS_ON ENABLE_SYCL + ADVANCED) + # --------------------------------------------------------------- # Enable LAPACK support? 
# --------------------------------------------------------------- @@ -288,6 +293,16 @@ sundials_option(ONEMKL_WORKS BOOL "Set to ON to force CMake to accept a given on DEPENDS_ON ENABLE_ONEMKL ADVANCED) +sundials_option(SUNDIALS_ONEMKL_USE_GETRF_LOOP BOOL + "Replace batched getrf call with loop over getrf" OFF + DEPENDS_ON ENABLE_ONEMKL + ADVANCED) + +sundials_option(SUNDIALS_ONEMKL_USE_GETRS_LOOP BOOL + "Replace batched getrs call with loop over getrs" OFF + DEPENDS_ON ENABLE_ONEMKL + ADVANCED) + # --------------------------------------------------------------- # Enable Caliper support? # --------------------------------------------------------------- diff --git a/cmake/macros/SundialsAddTest.cmake b/cmake/macros/SundialsAddTest.cmake index 45bf3d8ebd..b93027a017 100644 --- a/cmake/macros/SundialsAddTest.cmake +++ b/cmake/macros/SundialsAddTest.cmake @@ -135,8 +135,13 @@ macro(SUNDIALS_ADD_TEST NAME EXECUTABLE) endif() # check if this test is run with MPI and set the MPI run command - if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND (MPIEXEC_EXECUTABLE)) - set(RUN_COMMAND "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${MPIEXEC_PREFLAGS}") + if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND ((MPIEXEC_EXECUTABLE) OR (SUNDIALS_TEST_MPIRUN_COMMAND))) + if (SUNDIALS_TEST_MPIRUN_COMMAND) + set(RUN_COMMAND "${SUNDIALS_TEST_MPIRUN_COMMAND} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${MPIEXEC_PREFLAGS}") + elseif(MPIEXEC_EXECUTABLE) + set(RUN_COMMAND "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${MPIEXEC_PREFLAGS}") + endif() + # remove trailing white space (empty MPIEXEC_PREFLAGS) as it can cause # erroneous test failures with some MPI implementations string(STRIP "${RUN_COMMAND}" RUN_COMMAND) @@ -174,11 +179,16 @@ macro(SUNDIALS_ADD_TEST NAME EXECUTABLE) endif() # check if this test is run with MPI and add the test run command - if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND (MPIEXEC_EXECUTABLE)) + 
if((SUNDIALS_ADD_TEST_MPI_NPROCS) AND ((MPIEXEC_EXECUTABLE) OR (SUNDIALS_TEST_MPIRUN_COMMAND))) if(MPIEXEC_PREFLAGS) string(REPLACE " " ";" PREFLAGS "${MPIEXEC_PREFLAGS}") endif() - add_test(NAME ${NAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${PREFLAGS} $ ${TEST_ARGS}) + if (SUNDIALS_TEST_MPIRUN_COMMAND) + string(REPLACE " " ";" MPI_EXEC_ARGS "${SUNDIALS_TEST_MPIRUN_COMMAND}") + add_test(NAME ${NAME} COMMAND ${MPI_EXEC_ARGS} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${PREFLAGS} $ ${TEST_ARGS}) + else() + add_test(NAME ${NAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${SUNDIALS_ADD_TEST_MPI_NPROCS} ${PREFLAGS} $ ${TEST_ARGS}) + endif() else() add_test(NAME ${NAME} COMMAND $ ${TEST_ARGS}) endif() diff --git a/cmake/tpl/SundialsONEMKL.cmake b/cmake/tpl/SundialsONEMKL.cmake index 34177ff0fe..a807a2e7f4 100644 --- a/cmake/tpl/SundialsONEMKL.cmake +++ b/cmake/tpl/SundialsONEMKL.cmake @@ -64,6 +64,7 @@ find_package(MKL CONFIG NO_DEFAULT_PATH REQUIRED) +message(STATUS "MKL Version: ${MKL_VERSION}") message(STATUS "MKL Targets: ${MKL_IMPORTED_TARGETS}") # ----------------------------------------------------------------------------- diff --git a/doc/arkode/guide/source/Butcher.rst b/doc/arkode/guide/source/Butcher.rst index 5bfdd6169c..6666a4f7dd 100644 --- a/doc/arkode/guide/source/Butcher.rst +++ b/doc/arkode/guide/source/Butcher.rst @@ -180,6 +180,41 @@ This is the default 2nd order explicit method. region is outlined in blue; the embedding's region is in red. +.. _Butcher.ARK2_ERK: + +ARK2-ERK-3-1-2 +^^^^^^^^^^^^^^ + +.. index:: ARK2-ERK-3-1-2 + +Accessible via the constant ``ARKODE_ARK2_ERK_3_1_2`` to +:c:func:`ARKStepSetTableNum()`, :c:func:`ERKStepSetTableNum()` or +:c:func:`ARKodeButcherTable_LoadERK()`. +Accessible via the string ``"ARKODE_ARK2_ERK_3_1_2"`` to +:c:func:`ARKStepSetTableName()`, :c:func:`ERKStepSetTableName()` or +:c:func:`ARKodeButcherTable_LoadERKByName()`. 
+This is the explicit portion of the default 2nd order additive method (the +explicit portion of the ARK2 method from :cite:p:`giraldo2013implicit`). + +.. math:: + + \renewcommand{\arraystretch}{1.5} + \begin{array}{r|ccc} + 0 & 0 & 0 & 0 \\ + 2 - \sqrt{2} & 2 - \sqrt{2} & 0 & 0 \\ + 1 & 1 - \frac{3 + 2\sqrt{2}}{6} & \frac{3 + 2\sqrt{2}}{6} & 0 \\ + \hline + 2 & \frac{1}{2\sqrt{2}} & \frac{1}{2\sqrt{2}} & 1 - \frac{1}{\sqrt{2}} \\ + 1 & \frac{4 - \sqrt{2}}{8} & \frac{4 - \sqrt{2}}{8} & \frac{1}{2\sqrt{2}} \\ + \end{array} + +.. figure:: /figs/arkode/ark2_erk_stab_region.png + :scale: 65 % + :align: center + + Linear stability region for the ARK2-ERK method. The method's + region is outlined in blue; the embedding's region is in red. + .. _Butcher.Bogacki_Shampine: @@ -816,6 +851,41 @@ are A- and B-stable. region is outlined in blue; the embedding's region is in red. +.. _Butcher.ARK2_DIRK: + +ARK2-DIRK-3-1-2 +^^^^^^^^^^^^^^^ + +.. index:: ARK2-DIRK-3-1-2 + +Accessible via the constant ``ARKODE_ARK2_DIRK_3_1_2`` to +:c:func:`ARKStepSetTableNum()`, or +:c:func:`ARKodeButcherTable_LoadDIRK()`. +Accessible via the string ``"ARKODE_ARK2_DIRK_3_1_2"`` to +:c:func:`ARKStepSetTableName()`, or +:c:func:`ARKodeButcherTable_LoadDIRKByName()`. +This is the implicit portion of the default 2nd order additive method (the +implicit portion of the ARK2 method from :cite:p:`giraldo2013implicit`). + +.. math:: + + \renewcommand{\arraystretch}{1.5} + \begin{array}{r|ccc} + 0 & 0 & 0 & 0 \\ + 2 - \sqrt{2} & 1 - \frac{1}{\sqrt{2}} & 1 - \frac{1}{\sqrt{2}} & 0 \\ + 1 & \frac{1}{2\sqrt{2}} & \frac{1}{2\sqrt{2}} & 1 - \frac{1}{\sqrt{2}} \\ + \hline + 2 & \frac{1}{2\sqrt{2}} & \frac{1}{2\sqrt{2}} & 1 - \frac{1}{\sqrt{2}} \\ + 1 & \frac{4 - \sqrt{2}}{8} & \frac{4 - \sqrt{2}}{8} & \frac{1}{2\sqrt{2}} \\ + \end{array} + +.. figure:: /figs/arkode/ark2_dirk_stab_region.png + :scale: 65 % + :align: center + + Linear stability region for the ARK2-DIRK method. 
The method's + region is outlined in blue; the embedding's region is in red. + .. _Butcher.Billington: @@ -1590,10 +1660,16 @@ Additive Butcher tables --------------------------- In the category of additive Runge--Kutta methods for split implicit and -explicit calculations, ARKODE includes methods that have orders 3 -through 5, with embeddings that are of orders 2 through 4. These +explicit calculations, ARKODE includes methods that have orders 2 +through 5, with embeddings that are of orders 1 through 4. These Butcher table pairs are as follows: +* :index:`2nd-order pair `: + :numref:`Butcher.ARK2_ERK` with :numref:`Butcher.ARK2_DIRK`, + corresponding to Butcher tables ``ARKODE_ARK2_ERK_3_1_2`` and + ``ARKODE_ARK2_DIRK_3_1_2`` for :c:func:`ARKStepSetTableNum()` + or :c:func:`ARKStepSetTableName()`. + * :index:`3rd-order pair `: :numref:`Butcher.ARK_4_2_3_E` with :numref:`Butcher.ARK_4_2_3_I`, corresponding to Butcher tables ``ARKODE_ARK324L2SA_ERK_4_2_3`` and diff --git a/doc/arkode/guide/source/Introduction.rst b/doc/arkode/guide/source/Introduction.rst index 10174ccd8c..7e8d6d6fa6 100644 --- a/doc/arkode/guide/source/Introduction.rst +++ b/doc/arkode/guide/source/Introduction.rst @@ -118,6 +118,18 @@ provided with SUNDIALS, or again may utilize a user-supplied module. Changes from previous versions ============================== +Changes in v5.6.0 +----------------- + +Added the second order IMEX method from :cite:p:`giraldo2013implicit` as the +default second order IMEX method in ARKStep. The explicit table is given by +``ARKODE_ARK2_ERK_3_1_2`` (see :numref:`Butcher.ARK2_ERK`) and the implicit +table by ``ARKODE_ARK2_DIRK_3_1_2`` (see :numref:`Butcher.ARK2_DIRK`). + +Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose` +to support user specification of ``stdout`` and ``stderr`` strings for the output +file names. 
+ Changes in v5.5.1 ----------------- diff --git a/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst b/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst index cd4acec555..97fa502b7e 100644 --- a/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst +++ b/doc/arkode/guide/source/Usage/ARKStep_c_interface/User_callable.rst @@ -1454,7 +1454,7 @@ Set additive RK tables via their names :c:func:`ARKStepSetTableName()` int For explicit methods, the allowed values are :math:`2 \le` *ord* :math:`\le 8`. For implicit methods, the allowed values are :math:`2\le` *ord* :math:`\le 5`, and for ImEx methods the allowed - values are :math:`3 \le` *ord* :math:`\le 5`. Any illegal input + values are :math:`2 \le` *ord* :math:`\le 5`. Any illegal input will result in the default value of 4. Since *ord* affects the memory requirements for the internal diff --git a/doc/cvode/guide/source/Introduction.rst b/doc/cvode/guide/source/Introduction.rst index 496dfa5be6..debed4bd24 100644 --- a/doc/cvode/guide/source/Introduction.rst +++ b/doc/cvode/guide/source/Introduction.rst @@ -111,6 +111,13 @@ implementations. Changes from previous versions ============================== +Changes in v6.6.0 +----------------- + +Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose` +to support user specification of ``stdout`` and ``stderr`` strings for the output +file names. + Changes in v6.5.1 ----------------- @@ -145,7 +152,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``, ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions. Compiling and running code that uses the SUNDIALS Fortran interfaces with -bounds checking will now work. +bounds checking will now work. 
Changes in v6.4.1 ----------------- diff --git a/doc/cvodes/guide/source/Introduction.rst b/doc/cvodes/guide/source/Introduction.rst index aec32d3649..dd0384c250 100644 --- a/doc/cvodes/guide/source/Introduction.rst +++ b/doc/cvodes/guide/source/Introduction.rst @@ -111,6 +111,13 @@ Fortran. Changes from previous versions ============================== +Changes in v6.6.0 +----------------- + +Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose` +to support user specification of ``stdout`` and ``stderr`` strings for the output +file names. + Changes in v6.5.1 ----------------- @@ -146,7 +153,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``, ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions. Compiling and running code that uses the SUNDIALS Fortran interfaces with -bounds checking will now work. +bounds checking will now work. Changes in v6.4.1 ----------------- diff --git a/doc/ida/guide/source/Introduction.rst b/doc/ida/guide/source/Introduction.rst index 76458544b9..b2a5a15671 100644 --- a/doc/ida/guide/source/Introduction.rst +++ b/doc/ida/guide/source/Introduction.rst @@ -72,6 +72,13 @@ systems. Changes from previous versions ============================== +Changes in v6.6.0 +----------------- + +Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose` +to support user specification of ``stdout`` and ``stderr`` strings for the output +file names. + Changes in v6.5.1 ----------------- @@ -107,7 +114,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``, ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions. 
Compiling and running code that uses the SUNDIALS Fortran interfaces with -bounds checking will now work. +bounds checking will now work. Changes in v6.4.1 ----------------- diff --git a/doc/idas/guide/source/Introduction.rst b/doc/idas/guide/source/Introduction.rst index f5b828966d..a2fd55c464 100644 --- a/doc/idas/guide/source/Introduction.rst +++ b/doc/idas/guide/source/Introduction.rst @@ -86,6 +86,14 @@ integrate any final-condition ODE dependent on the solution of the original IVP Changes from previous versions ============================== +Changes in v5.6.0 +----------------- + +Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose` +to support user specification of ``stdout`` and ``stderr`` strings for the output +file names. + + Changes in v5.5.1 ----------------- @@ -121,7 +129,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``, ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions. Compiling and running code that uses the SUNDIALS Fortran interfaces with -bounds checking will now work. +bounds checking will now work. Changes in v5.4.1 ----------------- diff --git a/doc/kinsol/guide/source/Introduction.rst b/doc/kinsol/guide/source/Introduction.rst index 6e6edcffcf..3b8f5a1f21 100644 --- a/doc/kinsol/guide/source/Introduction.rst +++ b/doc/kinsol/guide/source/Introduction.rst @@ -88,6 +88,14 @@ applications written in Fortran. Changes from previous versions ============================== +Changes in v6.6.0 +----------------- + +Updated the F2003 utility routines :c:func:`SUNDIALSFileOpen` and :c:func:`SUNDIALSFileClose` +to support user specification of ``stdout`` and ``stderr`` strings for the output +file names. 
+ + Changes in v6.5.1 ----------------- @@ -115,7 +123,7 @@ Fixed the shape of the arrays returned by ``FN_VGetArrayPointer`` functions as w as the ``FSUNDenseMatrix_Data``, ``FSUNBandMatrix_Data``, ``FSUNSparseMatrix_Data``, ``FSUNSparseMatrix_IndexValues``, and ``FSUNSparseMatrix_IndexPointers`` functions. Compiling and running code that uses the SUNDIALS Fortran interfaces with -bounds checking will now work. +bounds checking will now work. Changes in v6.4.1 ----------------- diff --git a/doc/shared/Install.rst b/doc/shared/Install.rst index 7ad8c3bcd1..4682ba1906 100644 --- a/doc/shared/Install.rst +++ b/doc/shared/Install.rst @@ -776,6 +776,20 @@ illustration only. Default: none +.. cmakeoption:: SUNDIALS_ONEMKL_USE_GETRF_LOOP + + This advanced debugging option replaces the batched LU factorization with a + loop over each system in the batch and a non-batched LU factorization. + + Default: OFF + +.. cmakeoption:: SUNDIALS_ONEMKL_USE_GETRS_LOOP + + This advanced debugging option replaces the batched LU solve with a loop over + each system in the batch and a non-batched solve. + + Default: OFF + .. cmakeoption:: ENABLE_OPENMP Enable OpenMP support (build the OpenMP NVector) @@ -944,6 +958,14 @@ illustration only. ``dpcpp`` and ``icpx``. When using ``icpx`` the ``-fsycl`` flag and any ahead of time compilation flags must be added to ``CMAKE_CXX_FLAGS``. +.. cmakeoption:: SUNDIALS_SYCL_2020_UNSUPPORTED + + This advanced option disables the use of *some* features from the SYCL 2020 + standard in SUNDIALS libraries and examples. This can be used to work around + some cases of incomplete compiler support for SYCL 2020. + + Default: OFF + .. 
cmakeoption:: SUNDIALS_LOGGING_LEVEL diff --git a/doc/shared/figs/arkode/ark2_dirk_stab_region.png b/doc/shared/figs/arkode/ark2_dirk_stab_region.png new file mode 100644 index 0000000000..83929af11e Binary files /dev/null and b/doc/shared/figs/arkode/ark2_dirk_stab_region.png differ diff --git a/doc/shared/figs/arkode/ark2_erk_stab_region.png b/doc/shared/figs/arkode/ark2_erk_stab_region.png new file mode 100644 index 0000000000..45b125d708 Binary files /dev/null and b/doc/shared/figs/arkode/ark2_erk_stab_region.png differ diff --git a/doc/shared/sundials.bib b/doc/shared/sundials.bib index 2851d968bd..d62f0d9c22 100644 --- a/doc/shared/sundials.bib +++ b/doc/shared/sundials.bib @@ -1784,6 +1784,19 @@ @techreport{Fehlberg:69 year = {1969} } + +@article{giraldo2013implicit, + title = {Implicit-explicit formulations of a three-dimensional nonhydrostatic unified model of the atmosphere (NUMA)}, + author = {Giraldo, F. X. and Kelly, J. F. and Constantinescu, E. M.}, + journal = {SIAM Journal on Scientific Computing}, + volume = {35}, + number = {5}, + pages = {B1162--B1194}, + year = {2013}, + publisher = {SIAM}, + doi = {10.1137/120876034} +} + @article{Gust:91, author = {Gustafsson, K.}, title = {Control theoretic techniques for stepsize selection in explicit {Runge-Kutta} methods}, diff --git a/doc/shared/sundials/Fortran.rst b/doc/shared/sundials/Fortran.rst index 20246ce8f8..bef4eb5aca 100644 --- a/doc/shared/sundials/Fortran.rst +++ b/doc/shared/sundials/Fortran.rst @@ -490,8 +490,10 @@ a C file pointer, SUNDIALS provides two utility functions for creating a the provided filename and I/O mode. **Arguments:** - * ``filename`` -- the full path to the file, that should have Fortran - type ``character(kind=C_CHAR, len=*)``. + * ``filename`` -- the path to the file, that should have Fortran + type ``character(kind=C_CHAR, len=*)``. 
There are two special filenames: + ``stdout`` and ``stderr`` -- these two filenames will result in output + going to the standard output file and standard error file, respectively. * ``mode`` -- the I/O mode to use for the file. This should have the Fortran type ``character(kind=C_CHAR, len=*)``. The string begins with one of the following characters: @@ -517,7 +519,9 @@ a C file pointer, SUNDIALS provides two utility functions for creating a **Arguments:** * ``fp`` -- the C ``FILE*`` that was previously obtained from ``fopen``. - This should have the Fortran type ``type(c_ptr)``. + This should have the Fortran type ``type(c_ptr)``. Note that if either + ``stdout`` or ``stderr`` were opened using :c:func:`SUNDIALSFileOpen()` + then that stream *will not be closed* by this function. .. _SUNDIALS.Fortran.Portability: diff --git a/doc/sundials_developers/source/benchmarks/diffusion.rst b/doc/sundials_developers/source/benchmarks/diffusion.rst index 54b5b08e05..5c35ac4ec2 100644 --- a/doc/sundials_developers/source/benchmarks/diffusion.rst +++ b/doc/sundials_developers/source/benchmarks/diffusion.rst @@ -30,11 +30,11 @@ This code simulates the anisotropic 2D heat equation, .. math:: - u_t = \nabla \cdot (D \nabla u) + b(t,\mathbf{x}), + \frac{\partial u}{\partial t} = \nabla \cdot (D \nabla u) + b(t,\mathbf{x}), where :math:`D` is a diagonal matrix with entries :math:`k_x` and :math:`k_y`. The system is evolved for :math:`t \in [0, t_f]` on the rectangular domain -:math:`(x,y) \equiv \mathbf{x} \in [\mathbf{0}, \mathbf{x_{\text{max}}}]^2`, +:math:`(x,y) \equiv \mathbf{x} \in [\mathbf{0}, \mathbf{x}_{\text{max}}]^2`, with the initial condition .. math:: @@ -45,8 +45,8 @@ and stationary boundary conditions .. math:: - u_t(t,0,y) = u_t(t,x_{\text{max}},y) = - u_t(t,x,0) = u_t(t,x,y_{\text{max}}) = 0. 
+ \frac{\partial u}{\partial t}(t,0,y) = \frac{\partial u}{\partial t}(t,x_{\text{max}},y) = + \frac{\partial u}{\partial t}(t,x,0) = \frac{\partial u}{\partial t}(t,x,y_{\text{max}}) = 0. The source term is given by @@ -68,9 +68,12 @@ IDA. With ARKODE, an adaptive step diagonally implicit Runge-Kutta (DIRK) method is applied. When using CVODE or IDA, adaptive order and step BDF methods are used. -In all cases, the nonlinear system(s) in each time step are solved using an -inexact Newton method paired with a matrix-free PCG or GMRES linear solver and a -Jacobi preconditioner. +By default, the nonlinear system(s) in each time step are solved using an +inexact Newton method paired with a matrix-free CG linear solver and a Jacobi +preconditioner. A matrix-free GMRES linear solver may be selected at run time. +If SUNDIALS is built with the SuperLU_DIST interface enabled a modified Newton +method with SuperLU_DIST as the direct linear solver may also be selected at run +time. Options @@ -143,7 +146,8 @@ listed in :numref:`Benchmarks.Table.2D_diffusion_options`. | | ``ONE_STEP`` mode for debugging | | | | (0 uses ``NORMAL`` mode) | | +-------------------------------+--------------------------------------+---------------+ - | ``--gmres`` | Use GMRES rather than PCG | PCG | + | ``--ls`` | Linear solver: ``cg``, ``gmres``, | ``cg`` | + | | ``sludist`` | | +-------------------------------+--------------------------------------+---------------+ | ``--lsinfo`` | Output linear solver diagnostics | Off | +-------------------------------+--------------------------------------+---------------+ @@ -169,22 +173,33 @@ listed in :numref:`Benchmarks.Table.2D_diffusion_options`. +-------------------------------+--------------------------------------+---------------+ -Building and Running -^^^^^^^^^^^^^^^^^^^^ +Building +^^^^^^^^ To build the benchmark executables SUNDIALS should be configured with ARKODE, -CVODE, or IDA enabled and with MPI support on. 
Additionally, either CUDA or HIP -support must be on to build executables utilizing NVIDIA or AMD GPUs. See the -installation guide for more details on configuring, building, and installing -SUNDIALS. +CVODE, or IDA enabled, MPI support turned on, and benchmarks enabled. If +SUNDIALS is configured with SuperLU_DIST enabled, this linear solver can be +selected at run time and may utilize OpenMP, CUDA, or ROCM (HIP) for on-node +parallelism. If SUNDIALS is configured with CUDA or HIP support enabled, +additional executables utilizing CUDA or HIP will be built. See the SUNDIALS +installation guide for more details on configuring, building, and installing. + +Running +^^^^^^^ Based on the configuration, executables for each integrator and backend option -are built and installed in the ``/bin/benchmarks/diffusion_2D`` -directory. The executables follow the naming convention -``<package>_diffusion_2D_<parallelism>`` where ``<package>`` is ``arkode``, +are built and installed in ``<BENCHMARKS_INSTALL_PATH>/diffusion_2D``. The +executables follow the naming convention +``<package>_diffusion_2D_<parallelism>`` where ``<package>`` is ``arkode``, ``cvode``, or ``ida`` and ``<parallelism>`` is ``mpi`` for MPI only parallelism, ``mpicuda`` for MPI + CUDA, and ``mpihip`` for MPI + HIP. +.. note:: + + When using the SuperLU_DIST linear solver, computations will be offloaded to + the GPU in the MPI only executables if CUDA or ROCM support is enabled in + SuperLU_DIST. + On Summit, with the default environment * Compiler: xl/16.1.1-5 @@ -209,3 +224,15 @@ an example ``jsrun`` command using CUDA-aware MPI .. code-block:: none jsrun -n 2 -a 1 -c 1 -g 1 ./cvode_diffusion_2D_mpicuda + +On Crusher, with the environment + +* Compiler: clang/14.0.2 +* MPI: cray-mpich/8.1.17 +* ROCM: rocm/5.2.0 + +an example ``srun`` command is + +..
code-block:: none + + srun -N1 -n8 -c1 --gpus-per-node=8 --gpu-bind=closest ./cvode_diffusion_2D_mpi diff --git a/doc/sundials_developers/source/testing/CI.rst b/doc/sundials_developers/source/testing/CI.rst index 928564e70a..52626c5301 100644 --- a/doc/sundials_developers/source/testing/CI.rst +++ b/doc/sundials_developers/source/testing/CI.rst @@ -275,11 +275,11 @@ Caliper, as we need a newer version than in the Spack commit currently used. Updating Spack -------------- -To update the spack commit used for the CI, the first thing to do is update -the spack commit in the ``.uberenv_config.json`` file. Then, a pipeline -should be manually launched with the ``SHARED_SPACK`` CI variable set -to ``ON``. This will cause Spack to re-concretize the specs and ideally -update to newer packages. However, there is no guarantee that individual -dependencies will be updated, so due dilligence is required (i.e., ensure -that the output from the CI job shows that Spack has selected the versions -of dependencies that you expected). +To update the spack commit used for the CI: + +1. Update the spack commit in the ``.uberenv_config.json`` file. +2. Then, manually launch a pipeline from the GitLab UI with the ``SHARED_SPACK`` CI variable set +   to ``ON`` and the ``SPACK_PREFIX`` variable set to the version of spack specified in the ``.uberenv_config.json`` file. + +This will create a new spack installation and rebuild all of the specs. + diff --git a/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp b/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp index 6c176ee53e..45eae942b2 100644 --- a/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp +++ b/examples/cvode/CXX_onemkl/cvRoberts_blockdiag_onemkl.cpp @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) if (argc > 3) output = (atoi(argv[3])) ?
true : false; // Create an in-order GPU queue -#if SYCL_LANGUAGE_VERSION >= 2020 +#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED) sycl::queue myQueue(sycl::gpu_selector_v, sycl::property_list{sycl::property::queue::in_order{}}); #else diff --git a/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp b/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp index dc3286701e..44fe80d7b7 100644 --- a/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp +++ b/examples/cvode/CXX_sycl/cvAdvDiff_kry_sycl.cpp @@ -119,7 +119,7 @@ int main(int argc, char** argv) int retval; // Create an in-order GPU queue -#if SYCL_LANGUAGE_VERSION >= 2020 +#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED) sycl::queue myQueue(sycl::gpu_selector_v, sycl::property_list{sycl::property::queue::in_order{}}); #else diff --git a/examples/cvode/kokkos/CMakeLists.txt b/examples/cvode/kokkos/CMakeLists.txt index 84d0e1528f..3b20ec3cd4 100644 --- a/examples/cvode/kokkos/CMakeLists.txt +++ b/examples/cvode/kokkos/CMakeLists.txt @@ -16,6 +16,7 @@ # 'develop' for examples excluded from 'make test' in releases set(examples_list "cv_bruss_batched_kokkos.cpp\;\;develop" + "cv_bruss_batched_kokkos_2D.cpp\;\;develop" ) # Add the build targets for each example diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out new file mode 100644 index 0000000000..6f2c19c3a9 --- /dev/null +++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out @@ -0,0 +1,137 @@ + +Batch of independent 3-species kinetics problems + number of batches = 100 + linear solver = KokkosKernels + test type = 2 + execution space = Cuda + +At t = 0 + batch 0: y = 1.2 3.1 3 + batch 10: y = 1.2 3.1 3 + batch 20: y = 1.2 3.1 3 + batch 30: y = 1.2 3.1 3 + batch 40: y = 1.2 3.1 3 + batch 50: y = 1.2 3.1 3 + batch 60: y = 1.2 3.1 3 + batch 70: y = 1.2 3.1 3 + batch 80: y = 1.2 3.1 3 + batch 90: y = 1.2 3.1 3 +At t = 1 + batch 0: y = 
1.10389 3.01314 3.49998 + batch 10: y = 1.10389 3.01314 3.49998 + batch 20: y = 1.10389 3.01314 3.49998 + batch 30: y = 1.10389 3.01314 3.49998 + batch 40: y = 1.10389 3.01314 3.49998 + batch 50: y = 1.10389 3.01314 3.49998 + batch 60: y = 1.10389 3.01314 3.49998 + batch 70: y = 1.10389 3.01314 3.49998 + batch 80: y = 1.10389 3.01314 3.49998 + batch 90: y = 1.10389 3.01314 3.49998 +At t = 2 + batch 0: y = 0.688033 3.5213 3.49999 + batch 10: y = 0.688033 3.5213 3.49999 + batch 20: y = 0.688033 3.5213 3.49999 + batch 30: y = 0.688033 3.5213 3.49999 + batch 40: y = 0.688033 3.5213 3.49999 + batch 50: y = 0.688033 3.5213 3.49999 + batch 60: y = 0.688033 3.5213 3.49999 + batch 70: y = 0.688033 3.5213 3.49999 + batch 80: y = 0.688033 3.5213 3.49999 + batch 90: y = 0.688033 3.5213 3.49999 +At t = 3 + batch 0: y = 0.409472 4.27781 3.49999 + batch 10: y = 0.409472 4.27781 3.49999 + batch 20: y = 0.409472 4.27781 3.49999 + batch 30: y = 0.409472 4.27781 3.49999 + batch 40: y = 0.409472 4.27781 3.49999 + batch 50: y = 0.409472 4.27781 3.49999 + batch 60: y = 0.409472 4.27781 3.49999 + batch 70: y = 0.409472 4.27781 3.49999 + batch 80: y = 0.409472 4.27781 3.49999 + batch 90: y = 0.409472 4.27781 3.49999 +At t = 4 + batch 0: y = 0.36788 4.94194 3.49999 + batch 10: y = 0.36788 4.94194 3.49999 + batch 20: y = 0.36788 4.94194 3.49999 + batch 30: y = 0.36788 4.94194 3.49999 + batch 40: y = 0.36788 4.94194 3.49999 + batch 50: y = 0.36788 4.94194 3.49999 + batch 60: y = 0.36788 4.94194 3.49999 + batch 70: y = 0.36788 4.94194 3.49999 + batch 80: y = 0.36788 4.94194 3.49999 + batch 90: y = 0.36788 4.94194 3.49999 +At t = 5 + batch 0: y = 0.413842 5.51057 3.49999 + batch 10: y = 0.413842 5.51057 3.49999 + batch 20: y = 0.413842 5.51057 3.49999 + batch 30: y = 0.413842 5.51057 3.49999 + batch 40: y = 0.413842 5.51057 3.49999 + batch 50: y = 0.413842 5.51057 3.49999 + batch 60: y = 0.413842 5.51057 3.49999 + batch 70: y = 0.413842 5.51057 3.49999 + batch 80: y = 0.413842 5.51057 3.49999 
+ batch 90: y = 0.413842 5.51057 3.49999 +At t = 6 + batch 0: y = 0.589207 5.85566 3.49999 + batch 10: y = 0.589207 5.85566 3.49999 + batch 20: y = 0.589207 5.85566 3.49999 + batch 30: y = 0.589207 5.85566 3.49999 + batch 40: y = 0.589207 5.85566 3.49999 + batch 50: y = 0.589207 5.85566 3.49999 + batch 60: y = 0.589207 5.85566 3.49999 + batch 70: y = 0.589207 5.85566 3.49999 + batch 80: y = 0.589207 5.85566 3.49999 + batch 90: y = 0.589207 5.85566 3.49999 +At t = 7 + batch 0: y = 4.75675 0.735405 3.49992 + batch 10: y = 4.75675 0.735405 3.49992 + batch 20: y = 4.75675 0.735405 3.49992 + batch 30: y = 4.75675 0.735405 3.49992 + batch 40: y = 4.75675 0.735405 3.49992 + batch 50: y = 4.75675 0.735405 3.49992 + batch 60: y = 4.75675 0.735405 3.49992 + batch 70: y = 4.75675 0.735405 3.49992 + batch 80: y = 4.75675 0.735405 3.49992 + batch 90: y = 4.75675 0.735405 3.49992 +At t = 8 + batch 0: y = 1.81355 1.57573 3.49997 + batch 10: y = 1.81355 1.57573 3.49997 + batch 20: y = 1.81355 1.57573 3.49997 + batch 30: y = 1.81355 1.57573 3.49997 + batch 40: y = 1.81355 1.57573 3.49997 + batch 50: y = 1.81355 1.57573 3.49997 + batch 60: y = 1.81355 1.57573 3.49997 + batch 70: y = 1.81355 1.57573 3.49997 + batch 80: y = 1.81355 1.57573 3.49997 + batch 90: y = 1.81355 1.57573 3.49997 +At t = 9 + batch 0: y = 0.527935 2.80731 3.49999 + batch 10: y = 0.527935 2.80731 3.49999 + batch 20: y = 0.527935 2.80731 3.49999 + batch 30: y = 0.527935 2.80731 3.49999 + batch 40: y = 0.527935 2.80731 3.49999 + batch 50: y = 0.527935 2.80731 3.49999 + batch 60: y = 0.527935 2.80731 3.49999 + batch 70: y = 0.527935 2.80731 3.49999 + batch 80: y = 0.527935 2.80731 3.49999 + batch 90: y = 0.527935 2.80731 3.49999 +At t = 10 + batch 0: y = 0.305602 3.65734 3.49999 + batch 10: y = 0.305602 3.65734 3.49999 + batch 20: y = 0.305602 3.65734 3.49999 + batch 30: y = 0.305602 3.65734 3.49999 + batch 40: y = 0.305602 3.65734 3.49999 + batch 50: y = 0.305602 3.65734 3.49999 + batch 60: y = 0.305602 3.65734 
3.49999 + batch 70: y = 0.305602 3.65734 3.49999 + batch 80: y = 0.305602 3.65734 3.49999 + batch 90: y = 0.305602 3.65734 3.49999 + +Final Statistics: + Steps = 344 + RHS evals = 464 + LS setups = 59 + Jac evals = 7 + NLS iters = 461 + NLS fails = 1 + Error test fails = 20 diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out new file mode 100644 index 0000000000..69f0b74a18 --- /dev/null +++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out @@ -0,0 +1,137 @@ + +Batch of independent 3-species kinetics problems + number of batches = 100 + linear solver = KokkosKernels + test type = 2 + execution space = OpenMP + +At t = 0 + batch 0: y = 1.2 3.1 3 + batch 10: y = 1.2 3.1 3 + batch 20: y = 1.2 3.1 3 + batch 30: y = 1.2 3.1 3 + batch 40: y = 1.2 3.1 3 + batch 50: y = 1.2 3.1 3 + batch 60: y = 1.2 3.1 3 + batch 70: y = 1.2 3.1 3 + batch 80: y = 1.2 3.1 3 + batch 90: y = 1.2 3.1 3 +At t = 1 + batch 0: y = 1.10389 3.01314 3.49998 + batch 10: y = 1.10389 3.01314 3.49998 + batch 20: y = 1.10389 3.01314 3.49998 + batch 30: y = 1.10389 3.01314 3.49998 + batch 40: y = 1.10389 3.01314 3.49998 + batch 50: y = 1.10389 3.01314 3.49998 + batch 60: y = 1.10389 3.01314 3.49998 + batch 70: y = 1.10389 3.01314 3.49998 + batch 80: y = 1.10389 3.01314 3.49998 + batch 90: y = 1.10389 3.01314 3.49998 +At t = 2 + batch 0: y = 0.688033 3.5213 3.49999 + batch 10: y = 0.688033 3.5213 3.49999 + batch 20: y = 0.688033 3.5213 3.49999 + batch 30: y = 0.688033 3.5213 3.49999 + batch 40: y = 0.688033 3.5213 3.49999 + batch 50: y = 0.688033 3.5213 3.49999 + batch 60: y = 0.688033 3.5213 3.49999 + batch 70: y = 0.688033 3.5213 3.49999 + batch 80: y = 0.688033 3.5213 3.49999 + batch 90: y = 0.688033 3.5213 3.49999 +At t = 3 + batch 0: y = 0.409472 4.27781 3.49999 + batch 10: y = 0.409472 4.27781 3.49999 + batch 20: y = 0.409472 4.27781 3.49999 + batch 30: y = 0.409472 4.27781 3.49999 + batch 40: y = 0.409472 
4.27781 3.49999 + batch 50: y = 0.409472 4.27781 3.49999 + batch 60: y = 0.409472 4.27781 3.49999 + batch 70: y = 0.409472 4.27781 3.49999 + batch 80: y = 0.409472 4.27781 3.49999 + batch 90: y = 0.409472 4.27781 3.49999 +At t = 4 + batch 0: y = 0.36788 4.94194 3.49999 + batch 10: y = 0.36788 4.94194 3.49999 + batch 20: y = 0.36788 4.94194 3.49999 + batch 30: y = 0.36788 4.94194 3.49999 + batch 40: y = 0.36788 4.94194 3.49999 + batch 50: y = 0.36788 4.94194 3.49999 + batch 60: y = 0.36788 4.94194 3.49999 + batch 70: y = 0.36788 4.94194 3.49999 + batch 80: y = 0.36788 4.94194 3.49999 + batch 90: y = 0.36788 4.94194 3.49999 +At t = 5 + batch 0: y = 0.413842 5.51057 3.49999 + batch 10: y = 0.413842 5.51057 3.49999 + batch 20: y = 0.413842 5.51057 3.49999 + batch 30: y = 0.413842 5.51057 3.49999 + batch 40: y = 0.413842 5.51057 3.49999 + batch 50: y = 0.413842 5.51057 3.49999 + batch 60: y = 0.413842 5.51057 3.49999 + batch 70: y = 0.413842 5.51057 3.49999 + batch 80: y = 0.413842 5.51057 3.49999 + batch 90: y = 0.413842 5.51057 3.49999 +At t = 6 + batch 0: y = 0.589207 5.85566 3.49999 + batch 10: y = 0.589207 5.85566 3.49999 + batch 20: y = 0.589207 5.85566 3.49999 + batch 30: y = 0.589207 5.85566 3.49999 + batch 40: y = 0.589207 5.85566 3.49999 + batch 50: y = 0.589207 5.85566 3.49999 + batch 60: y = 0.589207 5.85566 3.49999 + batch 70: y = 0.589207 5.85566 3.49999 + batch 80: y = 0.589207 5.85566 3.49999 + batch 90: y = 0.589207 5.85566 3.49999 +At t = 7 + batch 0: y = 4.75675 0.735405 3.49992 + batch 10: y = 4.75675 0.735405 3.49992 + batch 20: y = 4.75675 0.735405 3.49992 + batch 30: y = 4.75675 0.735405 3.49992 + batch 40: y = 4.75675 0.735405 3.49992 + batch 50: y = 4.75675 0.735405 3.49992 + batch 60: y = 4.75675 0.735405 3.49992 + batch 70: y = 4.75675 0.735405 3.49992 + batch 80: y = 4.75675 0.735405 3.49992 + batch 90: y = 4.75675 0.735405 3.49992 +At t = 8 + batch 0: y = 1.81355 1.57573 3.49997 + batch 10: y = 1.81355 1.57573 3.49997 + batch 20: y = 1.81355 
1.57573 3.49997 + batch 30: y = 1.81355 1.57573 3.49997 + batch 40: y = 1.81355 1.57573 3.49997 + batch 50: y = 1.81355 1.57573 3.49997 + batch 60: y = 1.81355 1.57573 3.49997 + batch 70: y = 1.81355 1.57573 3.49997 + batch 80: y = 1.81355 1.57573 3.49997 + batch 90: y = 1.81355 1.57573 3.49997 +At t = 9 + batch 0: y = 0.527935 2.80731 3.49999 + batch 10: y = 0.527935 2.80731 3.49999 + batch 20: y = 0.527935 2.80731 3.49999 + batch 30: y = 0.527935 2.80731 3.49999 + batch 40: y = 0.527935 2.80731 3.49999 + batch 50: y = 0.527935 2.80731 3.49999 + batch 60: y = 0.527935 2.80731 3.49999 + batch 70: y = 0.527935 2.80731 3.49999 + batch 80: y = 0.527935 2.80731 3.49999 + batch 90: y = 0.527935 2.80731 3.49999 +At t = 10 + batch 0: y = 0.305602 3.65734 3.49999 + batch 10: y = 0.305602 3.65734 3.49999 + batch 20: y = 0.305602 3.65734 3.49999 + batch 30: y = 0.305602 3.65734 3.49999 + batch 40: y = 0.305602 3.65734 3.49999 + batch 50: y = 0.305602 3.65734 3.49999 + batch 60: y = 0.305602 3.65734 3.49999 + batch 70: y = 0.305602 3.65734 3.49999 + batch 80: y = 0.305602 3.65734 3.49999 + batch 90: y = 0.305602 3.65734 3.49999 + +Final Statistics: + Steps = 344 + RHS evals = 464 + LS setups = 59 + Jac evals = 7 + NLS iters = 461 + NLS fails = 1 + Error test fails = 20 diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out new file mode 100644 index 0000000000..6cabd0d57d --- /dev/null +++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out @@ -0,0 +1,137 @@ + +Batch of independent 3-species kinetics problems + number of batches = 100 + linear solver = KokkosKernels + test type = 2 + execution space = Serial + +At t = 0 + batch 0: y = 1.2 3.1 3 + batch 10: y = 1.2 3.1 3 + batch 20: y = 1.2 3.1 3 + batch 30: y = 1.2 3.1 3 + batch 40: y = 1.2 3.1 3 + batch 50: y = 1.2 3.1 3 + batch 60: y = 1.2 3.1 3 + batch 70: y = 1.2 3.1 3 + batch 80: y = 1.2 3.1 3 + batch 90: y = 1.2 3.1 3 +At t = 1 + 
batch 0: y = 1.10389 3.01314 3.49998 + batch 10: y = 1.10389 3.01314 3.49998 + batch 20: y = 1.10389 3.01314 3.49998 + batch 30: y = 1.10389 3.01314 3.49998 + batch 40: y = 1.10389 3.01314 3.49998 + batch 50: y = 1.10389 3.01314 3.49998 + batch 60: y = 1.10389 3.01314 3.49998 + batch 70: y = 1.10389 3.01314 3.49998 + batch 80: y = 1.10389 3.01314 3.49998 + batch 90: y = 1.10389 3.01314 3.49998 +At t = 2 + batch 0: y = 0.688033 3.5213 3.49999 + batch 10: y = 0.688033 3.5213 3.49999 + batch 20: y = 0.688033 3.5213 3.49999 + batch 30: y = 0.688033 3.5213 3.49999 + batch 40: y = 0.688033 3.5213 3.49999 + batch 50: y = 0.688033 3.5213 3.49999 + batch 60: y = 0.688033 3.5213 3.49999 + batch 70: y = 0.688033 3.5213 3.49999 + batch 80: y = 0.688033 3.5213 3.49999 + batch 90: y = 0.688033 3.5213 3.49999 +At t = 3 + batch 0: y = 0.409472 4.27781 3.49999 + batch 10: y = 0.409472 4.27781 3.49999 + batch 20: y = 0.409472 4.27781 3.49999 + batch 30: y = 0.409472 4.27781 3.49999 + batch 40: y = 0.409472 4.27781 3.49999 + batch 50: y = 0.409472 4.27781 3.49999 + batch 60: y = 0.409472 4.27781 3.49999 + batch 70: y = 0.409472 4.27781 3.49999 + batch 80: y = 0.409472 4.27781 3.49999 + batch 90: y = 0.409472 4.27781 3.49999 +At t = 4 + batch 0: y = 0.36788 4.94194 3.49999 + batch 10: y = 0.36788 4.94194 3.49999 + batch 20: y = 0.36788 4.94194 3.49999 + batch 30: y = 0.36788 4.94194 3.49999 + batch 40: y = 0.36788 4.94194 3.49999 + batch 50: y = 0.36788 4.94194 3.49999 + batch 60: y = 0.36788 4.94194 3.49999 + batch 70: y = 0.36788 4.94194 3.49999 + batch 80: y = 0.36788 4.94194 3.49999 + batch 90: y = 0.36788 4.94194 3.49999 +At t = 5 + batch 0: y = 0.413842 5.51057 3.49999 + batch 10: y = 0.413842 5.51057 3.49999 + batch 20: y = 0.413842 5.51057 3.49999 + batch 30: y = 0.413842 5.51057 3.49999 + batch 40: y = 0.413842 5.51057 3.49999 + batch 50: y = 0.413842 5.51057 3.49999 + batch 60: y = 0.413842 5.51057 3.49999 + batch 70: y = 0.413842 5.51057 3.49999 + batch 80: y = 0.413842 
5.51057 3.49999 + batch 90: y = 0.413842 5.51057 3.49999 +At t = 6 + batch 0: y = 0.589207 5.85566 3.49999 + batch 10: y = 0.589207 5.85566 3.49999 + batch 20: y = 0.589207 5.85566 3.49999 + batch 30: y = 0.589207 5.85566 3.49999 + batch 40: y = 0.589207 5.85566 3.49999 + batch 50: y = 0.589207 5.85566 3.49999 + batch 60: y = 0.589207 5.85566 3.49999 + batch 70: y = 0.589207 5.85566 3.49999 + batch 80: y = 0.589207 5.85566 3.49999 + batch 90: y = 0.589207 5.85566 3.49999 +At t = 7 + batch 0: y = 4.75675 0.735405 3.49992 + batch 10: y = 4.75675 0.735405 3.49992 + batch 20: y = 4.75675 0.735405 3.49992 + batch 30: y = 4.75675 0.735405 3.49992 + batch 40: y = 4.75675 0.735405 3.49992 + batch 50: y = 4.75675 0.735405 3.49992 + batch 60: y = 4.75675 0.735405 3.49992 + batch 70: y = 4.75675 0.735405 3.49992 + batch 80: y = 4.75675 0.735405 3.49992 + batch 90: y = 4.75675 0.735405 3.49992 +At t = 8 + batch 0: y = 1.81355 1.57573 3.49997 + batch 10: y = 1.81355 1.57573 3.49997 + batch 20: y = 1.81355 1.57573 3.49997 + batch 30: y = 1.81355 1.57573 3.49997 + batch 40: y = 1.81355 1.57573 3.49997 + batch 50: y = 1.81355 1.57573 3.49997 + batch 60: y = 1.81355 1.57573 3.49997 + batch 70: y = 1.81355 1.57573 3.49997 + batch 80: y = 1.81355 1.57573 3.49997 + batch 90: y = 1.81355 1.57573 3.49997 +At t = 9 + batch 0: y = 0.527935 2.80731 3.49999 + batch 10: y = 0.527935 2.80731 3.49999 + batch 20: y = 0.527935 2.80731 3.49999 + batch 30: y = 0.527935 2.80731 3.49999 + batch 40: y = 0.527935 2.80731 3.49999 + batch 50: y = 0.527935 2.80731 3.49999 + batch 60: y = 0.527935 2.80731 3.49999 + batch 70: y = 0.527935 2.80731 3.49999 + batch 80: y = 0.527935 2.80731 3.49999 + batch 90: y = 0.527935 2.80731 3.49999 +At t = 10 + batch 0: y = 0.305602 3.65734 3.49999 + batch 10: y = 0.305602 3.65734 3.49999 + batch 20: y = 0.305602 3.65734 3.49999 + batch 30: y = 0.305602 3.65734 3.49999 + batch 40: y = 0.305602 3.65734 3.49999 + batch 50: y = 0.305602 3.65734 3.49999 + batch 60: y = 
0.305602 3.65734 3.49999 + batch 70: y = 0.305602 3.65734 3.49999 + batch 80: y = 0.305602 3.65734 3.49999 + batch 90: y = 0.305602 3.65734 3.49999 + +Final Statistics: + Steps = 344 + RHS evals = 464 + LS setups = 59 + Jac evals = 7 + NLS iters = 461 + NLS fails = 1 + Error test fails = 20 diff --git a/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp new file mode 100644 index 0000000000..58a136a74f --- /dev/null +++ b/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp @@ -0,0 +1,425 @@ +/* ----------------------------------------------------------------------------- + * Programmer(s): Daniel R. Reynolds @ SMU + * David J. Gardner and Cody J. Balos @ LLNL + * ----------------------------------------------------------------------------- + * SUNDIALS Copyright Start + * Copyright (c) 2002-2023, Lawrence Livermore National Security + * and Southern Methodist University. + * All rights reserved. + * + * See the top-level LICENSE and NOTICE files for details. + * + * SPDX-License-Identifier: BSD-3-Clause + * SUNDIALS Copyright End + * ----------------------------------------------------------------------------- + * The following is a simple example problem based off of ark_brusselator.c. + * + * We simulate a scenario where a set of independent ODEs are batched together + * to form a larger system. Each independent ODE system has 3 components, + * Y = [u, v, w], satisfying the equations, + * + * du/dt = a - (w + 1) * u + v * u^2 + * dv/dt = w * u - v * u^2 + * dw/dt = (b - w) / ep - w * u + * + * for t in the interval [0, 10], with initial conditions Y0 = [u0, v0, w0]. + * The problem is stiff and there are 3 testing scenarios: + * + * Reactor 0: u0 = 3.9, v0 = 1.1, w0 = 2.8, a = 1.2, b = 2.5, ep = 1.0e-5 + * Here, all three components exhibit a rapid transient change during the + * first 0.2 time units, followed by a slow and smooth evolution. 
+ * + * Reactor 1: u0 = 3, v0 = 3, w0 = 3.5, a = 0.5, b = 3, ep = 5.0e-4 + * Here, all components undergo very rapid initial transients during the first + * 0.3 time units, and all then proceed very smoothly for the remainder of the + * simulation. + * + * Reactor 2: u0 = 1.2, v0 = 3.1, w0 = 3, a = 1, b = 3.5, ep = 5.0e-6 + * Here, w experiences a fast initial transient, jumping 0.5 within a few + * steps. All values proceed smoothly until around t=6.5, when both u and v + * undergo a sharp transition, with u increasing from around 0.5 to 5 and v + * decreasing from around 6 to 1 in less than 0.5 time units. After this + * transition, both u and v continue to evolve somewhat rapidly for another + * 1.4 time units, and finish off smoothly. + * + * This program solves the problem with the BDF method, Newton iteration, a + * user-supplied Jacobian routine, and, since the grouping of the independent + * systems results in a block diagonal linear system, the dense KOKKOS + * SUNLinearSolver which supports batched systems. The solution for every tenth batch is printed at 10 + * equally spaced output times, and run statistics are printed at the end. + * + * Unlike the example cv_bruss_batched_kokkos.cpp, this example utilizes Kokkos' + * multi-dimensional view functionality to consider a 2D grouping, y(i,j), where + * i corresponds with the batch index, and j corresponds to the component (u,v,w). + * + * The program takes three optional arguments: the number of independent ODE + * systems (i.e., number of batches), the linear solver type (KOKKOS batched LU + * or non-batched GMRES with the Jacobian computed by difference quotients), and + * the test type (uniform_0, uniform_1, or uniform_2).
+ * + * ./cv_bruss_batched_kokkos [num_batches] [solver_type] [test_type] + * + * Options: + * num_batches + * solver_type: + * 0 - KOKKOS batched LU (default) + * 1 - SUNDIALS non-batched GMRES with difference quotients Jacobian + * test_type: + * 0 - uniform_0, all batches are Reactor 0 + * 1 - uniform 1, all batches are Reactor 1 + * 2 - uniform 2, all batches are Reactor 2 (default) + * ---------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +// Common utility functions +#include + +// Execution space +#if defined(USE_CUDA) +using ExecSpace = Kokkos::Cuda; +using MemSpace = Kokkos::CudaSpace; +#elif defined(USE_HIP) +#if KOKKOS_VERSION / 10000 > 3 +using ExecSpace = Kokkos::HIP; +using MemSpace = Kokkos::HIPSpace; +#else +using ExecSpace = Kokkos::Experimental::HIP; +using MemSpace = Kokkos::Experimental::HIPSpace; +#endif +#elif defined(USE_OPENMP) +using ExecSpace = Kokkos::OpenMP; +using MemSpace = Kokkos::HostSpace; +#else +using ExecSpace = Kokkos::Serial; +using MemSpace = Kokkos::HostSpace; +#endif + +using Vec1D = Kokkos::View; +using Vec2D = Kokkos::View; +using Vec2DHost = Vec2D::HostMirror; +using VecType = sundials::kokkos::Vector; +using MatType = sundials::kokkos::DenseMatrix; +using LSType = sundials::kokkos::DenseLinearSolver; +using SizeType = VecType::size_type; + +// Constants +#define ZERO SUN_RCONST(0.0) +#define ONE SUN_RCONST(1.0) +#define TWO SUN_RCONST(2.0) + +// User-supplied functions called by CVODE +static int f(sunrealtype t, N_Vector y, N_Vector ydot, void* user_data); + +static int Jac(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, + void* user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3); + +// User data structure available in user-supplied callback functions +struct UserData +{ + int nbatches = 100; // number of chemical networks + int batchSize = 3; // size of each network + sunrealtype a, b; // chemical 
concentrations that are constant + sunrealtype ep; // stiffness parameter +}; + +/* ----------------------------------------------------------------------------- + * Main Program + * ---------------------------------------------------------------------------*/ + +int main(int argc, char* argv[]) +{ + // Create the SUNDIALS context + sundials::Context sunctx; + + Kokkos::initialize(argc, argv); + { + // Create UserData + UserData udata; + + // Parse command line options + int argi = 0; + + // Total number of batch systems + if (argc > 1) udata.nbatches = atoi(argv[++argi]); + + // Linear solver type + int solver_type = 0; + if (argc > 2) solver_type = atoi(argv[++argi]); + + // Problem setup + int test_type = 2; + if (argc > 3) test_type = atoi(argv[++argi]); + + // Shortcuts + int nbatches = udata.nbatches; + int batchSize = udata.batchSize; + + std::cout << "\nBatch of independent 3-species kinetics problems\n" + << " number of batches = " << nbatches << "\n" + << " linear solver = " + << (solver_type ? 
"GMRES" : "KokkosKernels") << "\n" + << " test type = " << test_type << "\n" + << " execution space = " << ExecSpace().name() << "\n\n"; + + sunrealtype u0, v0, w0; + if (test_type == 0) + { + u0 = SUN_RCONST(3.9); + v0 = SUN_RCONST(1.1); + w0 = SUN_RCONST(2.8); + + udata.a = SUN_RCONST(1.2); + udata.b = SUN_RCONST(2.5); + udata.ep = SUN_RCONST(1.0e-5); + } + else if (test_type == 1) + { + u0 = SUN_RCONST(3.0); + v0 = SUN_RCONST(3.0); + w0 = SUN_RCONST(3.5); + + udata.a = SUN_RCONST(0.5); + udata.b = SUN_RCONST(3.0); + udata.ep = SUN_RCONST(5.0e-4); + } + else if (test_type == 2) + { + u0 = SUN_RCONST(1.2); + v0 = SUN_RCONST(3.1); + w0 = SUN_RCONST(3.0); + + udata.a = SUN_RCONST(1.0); + udata.b = SUN_RCONST(3.5); + udata.ep = SUN_RCONST(5.0e-6); + } + else + { + std::cerr << "ERROR: Invalid test type option\n"; + return -1; + } + + // Create vector with the initial condition + const sunrealtype T0 = SUN_RCONST(0.0); + + SizeType length{static_cast(batchSize * nbatches)}; + VecType y{length, sunctx}; + Vec2D y2d((y.View()).data(), nbatches, batchSize); + + Kokkos::parallel_for( + "fill_y", Kokkos::RangePolicy(0, nbatches), + KOKKOS_LAMBDA(const SizeType i) { + y2d(i,0) = u0; + y2d(i,1) = v0; + y2d(i,2) = w0; + }); + + // Create vector of absolute tolerances + VecType abstol{length, sunctx}; + N_VConst(SUN_RCONST(1.0e-10), abstol); + + // Create CVODE using Backward Differentiation Formula methods + void* cvode_mem = CVodeCreate(CV_BDF, sunctx); + if (check_ptr(cvode_mem, "CVodeCreate")) { return 1; } + + // Initialize the integrator and set the ODE right-hand side function + int retval = CVodeInit(cvode_mem, f, T0, y); + if (check_flag(retval, "CVodeInit")) { return 1; } + + // Attach the user data structure + retval = CVodeSetUserData(cvode_mem, &udata); + if (check_flag(retval, "CVodeSetUserData")) { return 1; } + + // Specify the scalar relative tolerance and vector absolute tolerances + retval = CVodeSVtolerances(cvode_mem, SUN_RCONST(1.0e-6), abstol); + if 
(check_flag(retval, "CVodeSVtolerances")) { return 1; } + + // Create the matrix and linear solver objects + std::unique_ptr> A; + std::unique_ptr> LS; + + if (solver_type == 0) + { + // Create Kokkos dense block diagonal matrix + A = std::make_unique(nbatches, batchSize, batchSize, sunctx); + + // Create Kokkos batched dense linear solver + LS = std::make_unique(sunctx); + + // Attach the matrix and linear solver to CVODE + retval = CVodeSetLinearSolver(cvode_mem, LS->Convert(), A->Convert()); + if (check_flag(retval, "CVodeSetLinearSolver")) return 1; + + // Set the user-supplied Jacobian function + retval = CVodeSetJacFn(cvode_mem, Jac); + if (check_flag(retval, "CVodeSetJacFn")) return 1; + } + else + { + // Create matrix-free GMRES linear solver + LS = std::make_unique( + SUNLinSol_SPGMR(y, SUN_PREC_NONE, 0, sunctx)); + + // Attach the linear solver to CVODE + retval = CVodeSetLinearSolver(cvode_mem, LS->Convert(), nullptr); + if (check_flag(retval, "CVodeSetLinearSolver")) return 1; + } + + // Final time and time between outputs + const sunrealtype Tf = SUN_RCONST(10.0); + const sunrealtype dTout = SUN_RCONST(1.0); + + // Number of output times + const int Nt = static_cast(ceil(Tf / dTout)); + + // Current time and first output time + sunrealtype t = T0; + sunrealtype tout = T0 + dTout; + + // Initial output + Vec2DHost y2d_h((y.HostView()).data(), nbatches, batchSize); + sundials::kokkos::CopyFromDevice(y); + Kokkos::fence(); + std::cout << "At t = " << t << std::endl; + for (int j = 0; j < nbatches; j += 10) + { + std::cout << " batch " << j << ": y = " << y2d_h(j,0) << " " + << y2d_h(j,1) << " " << y2d_h(j,2) << std::endl; + } + + // Loop over output times + for (int iout = 0; iout < Nt; iout++) + { + // Advance in time + retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL); + if (check_flag(retval, "CVode")) break; + + // Output solution from some batches + sundials::kokkos::CopyFromDevice(y); + Kokkos::fence(); + std::cout << "At t = " << t << std::endl; + 
for (int j = 0; j < nbatches; j += 10) + { + std::cout << " batch " << j << ": y = " << y2d_h(j,0) << " " + << y2d_h(j,1) << " " << y2d_h(j,2) << std::endl; + } + + tout += dTout; + tout = (tout > Tf) ? Tf : tout; + } + + // Print some final statistics + long int nst, nfe, nsetups, nje, nni, ncfn, netf; + + retval = CVodeGetNumSteps(cvode_mem, &nst); + check_flag(retval, "CVodeGetNumSteps"); + retval = CVodeGetNumRhsEvals(cvode_mem, &nfe); + check_flag(retval, "CVodeGetNumRhsEvals"); + retval = CVodeGetNumLinSolvSetups(cvode_mem, &nsetups); + check_flag(retval, "CVodeGetNumLinSolvSetups"); + retval = CVodeGetNumErrTestFails(cvode_mem, &netf); + check_flag(retval, "CVodeGetNumErrTestFails"); + retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni); + check_flag(retval, "CVodeGetNumNonlinSolvIters"); + retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncfn); + check_flag(retval, "CVodeGetNumNonlinSolvConvFails"); + retval = CVodeGetNumJacEvals(cvode_mem, &nje); + check_flag(retval, "CVodeGetNumJacEvals"); + + std::cout << "\nFinal Statistics:\n" + << " Steps = " << nst << "\n" + << " RHS evals = " << nfe << "\n" + << " LS setups = " << nsetups << "\n" + << " Jac evals = " << nje << "\n" + << " NLS iters = " << nni << "\n" + << " NLS fails = " << ncfn << "\n" + << " Error test fails = " << netf << "\n"; + + // Free objects + CVodeFree(&cvode_mem); + } + Kokkos::finalize(); + + return 0; +} + +/* ----------------------------------------------------------------------------- + * User-supplied functions called by CVODE + * ---------------------------------------------------------------------------*/ + +// Right hand side function dy/dt = f(t,y) +int f(sunrealtype t, N_Vector y, N_Vector ydot, void* user_data) +{ + auto udata = static_cast(user_data); + + const auto nbatches = udata->nbatches; + const auto batchSize = udata->batchSize; + + const auto a = udata->a; + const auto b = udata->b; + const auto ep = udata->ep; + + Vec2D y2d(N_VGetDeviceArrayPointer(y), nbatches, 
batchSize); + Vec2D ydot2d(N_VGetDeviceArrayPointer(ydot), nbatches, batchSize); + + Kokkos::parallel_for( + "RHS", Kokkos::RangePolicy(0, nbatches), + KOKKOS_LAMBDA(const SizeType i) { + auto u = y2d(i,0); + auto v = y2d(i,1); + auto w = y2d(i,2); + ydot2d(i,0) = a - (w + ONE) * u + v * u * u; + ydot2d(i,1) = w * u - v * u * u; + ydot2d(i,2) = (b - w) / ep - w * u; + }); + + return 0; +} + +// Jacobian of f(t,y) +int Jac(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, + N_Vector tmp1, N_Vector tmp2, N_Vector tmp3) +{ + auto udata = static_cast(user_data); + auto y_data = sundials::kokkos::GetVec(y)->View(); + auto J_data = sundials::kokkos::GetDenseMat(J)->View(); + + const auto nbatches = udata->nbatches; + const auto batchSize = udata->batchSize; + + const auto ep = udata->ep; + Vec2D y2d(N_VGetDeviceArrayPointer(y), nbatches, batchSize); + + Kokkos::parallel_for( + "Jac", Kokkos::RangePolicy(0, nbatches), + KOKKOS_LAMBDA(const SizeType i) { + // get y values + auto u = y2d(i,0); + auto v = y2d(i,1); + auto w = y2d(i,2); + + // first col of block + J_data(i, 0, 0) = -(w + ONE) + TWO * u * v; + J_data(i, 1, 0) = u * u; + J_data(i, 2, 0) = -u; + + // second col of block + J_data(i, 0, 1) = u * u; + J_data(i, 1, 1) = -u * u; + J_data(i, 2, 1) = u; + + // third col of block + J_data(i, 0, 2) = -w; + J_data(i, 1, 2) = ZERO; + J_data(i, 2, 2) = -ONE / ep - u; + }); + + return 0; +} diff --git a/examples/nvector/sycl/test_nvector_sycl.cpp b/examples/nvector/sycl/test_nvector_sycl.cpp index 4ce143451c..b3b6f5c8dc 100644 --- a/examples/nvector/sycl/test_nvector_sycl.cpp +++ b/examples/nvector/sycl/test_nvector_sycl.cpp @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) SetTiming(print_timing, 0); /* Create an in-order GPU queue */ -#if SYCL_LANGUAGE_VERSION >= 2020 +#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED) sycl::queue myQueue(sycl::gpu_selector_v, sycl::property_list{sycl::property::queue::in_order{}}); #else 
diff --git a/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp b/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp index 6c63f71e6e..c3adc0a08e 100644 --- a/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp +++ b/examples/sunlinsol/onemkldense/test_sunlinsol_onemkldense.cpp @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) (long int) cols, (long int) nblocks); // Create an in-order GPU queue -#if SYCL_LANGUAGE_VERSION >= 2020 +#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED) sycl::queue myQueue(sycl::gpu_selector_v, sycl::property_list{sycl::property::queue::in_order{}}); #else diff --git a/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp b/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp index 75d11de689..615e1ee9dd 100644 --- a/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp +++ b/examples/sunmatrix/onemkldense/test_sunmatrix_onemkldense.cpp @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) (long int) matrows, (long int) matcols); // Create an in-order GPU queue -#if SYCL_LANGUAGE_VERSION >= 2020 +#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED) sycl::queue myQueue(sycl::gpu_selector_v, sycl::property_list{sycl::property::queue::in_order{}}); #else diff --git a/include/arkode/arkode_arkstep.h b/include/arkode/arkode_arkstep.h index 7d624635c5..2b1ec96962 100644 --- a/include/arkode/arkode_arkstep.h +++ b/include/arkode/arkode_arkstep.h @@ -50,9 +50,11 @@ static const int ARKSTEP_DEFAULT_DIRK_4 = ARKODE_SDIRK_5_3_4; static const int ARKSTEP_DEFAULT_DIRK_5 = ARKODE_ARK548L2SA_DIRK_8_4_5; /* ImEx */ +static const int ARKSTEP_DEFAULT_ARK_ETABLE_2 = ARKODE_ARK2_ERK_3_1_2; static const int ARKSTEP_DEFAULT_ARK_ETABLE_3 = ARKODE_ARK324L2SA_ERK_4_2_3; static const int ARKSTEP_DEFAULT_ARK_ETABLE_4 = ARKODE_ARK436L2SA_ERK_6_3_4; static const int ARKSTEP_DEFAULT_ARK_ETABLE_5 = ARKODE_ARK548L2SA_ERK_8_4_5; +static const int 
ARKSTEP_DEFAULT_ARK_ITABLE_2 = ARKODE_ARK2_DIRK_3_1_2; static const int ARKSTEP_DEFAULT_ARK_ITABLE_3 = ARKODE_ARK324L2SA_DIRK_4_2_3; static const int ARKSTEP_DEFAULT_ARK_ITABLE_4 = ARKODE_ARK436L2SA_DIRK_6_3_4; static const int ARKSTEP_DEFAULT_ARK_ITABLE_5 = ARKODE_ARK548L2SA_DIRK_8_4_5; diff --git a/include/arkode/arkode_butcher_dirk.h b/include/arkode/arkode_butcher_dirk.h index 76f1d1cb7d..aecaf16a82 100644 --- a/include/arkode/arkode_butcher_dirk.h +++ b/include/arkode/arkode_butcher_dirk.h @@ -92,7 +92,8 @@ typedef enum { ARKODE_ESDIRK437L2SA_7_3_4, ARKODE_ESDIRK547L2SA_7_4_5, ARKODE_ESDIRK547L2SA2_7_4_5, - ARKODE_MAX_DIRK_NUM = ARKODE_ESDIRK547L2SA2_7_4_5 + ARKODE_ARK2_DIRK_3_1_2, + ARKODE_MAX_DIRK_NUM = ARKODE_ARK2_DIRK_3_1_2 } ARKODE_DIRKTableID; /* Accessor routine to load built-in DIRK table */ diff --git a/include/arkode/arkode_butcher_erk.h b/include/arkode/arkode_butcher_erk.h index acd1d613fb..6673acb119 100644 --- a/include/arkode/arkode_butcher_erk.h +++ b/include/arkode/arkode_butcher_erk.h @@ -84,7 +84,8 @@ typedef enum { ARKODE_KNOTH_WOLKE_3_3, ARKODE_ARK437L2SA_ERK_7_3_4, ARKODE_ARK548L2SAb_ERK_8_4_5, - ARKODE_MAX_ERK_NUM = ARKODE_ARK548L2SAb_ERK_8_4_5 + ARKODE_ARK2_ERK_3_1_2, + ARKODE_MAX_ERK_NUM = ARKODE_ARK2_ERK_3_1_2 } ARKODE_ERKTableID; /* Accessor routine to load built-in ERK table */ diff --git a/include/nvector/nvector_kokkos.hpp b/include/nvector/nvector_kokkos.hpp index 1b424a7d44..269d15dcef 100644 --- a/include/nvector/nvector_kokkos.hpp +++ b/include/nvector/nvector_kokkos.hpp @@ -650,6 +650,20 @@ void CopyFromDevice(VectorType& v) Kokkos::deep_copy(v.HostView(), v.View()); } +template +view_type GetView(N_Vector v) +{ + auto vec{GetVec(v)}; + return vec->View(); +} + +template +host_view_type GetHostView(N_Vector v) +{ + auto vec{GetVec(v)}; + return vec->HostView(); +} + } // namespace kokkos } // namespace sundials diff --git a/include/sundials/sundials_config.in b/include/sundials/sundials_config.in index 4c79b40040..f3f66ff89c 
100644 --- a/include/sundials/sundials_config.in +++ b/include/sundials/sundials_config.in @@ -130,12 +130,16 @@ */ #cmakedefine01 SUNDIALS_MPI_ENABLED - /* SUPERLUMT threading type */ +/* oneMKL interface options */ +#cmakedefine SUNDIALS_ONEMKL_USE_GETRF_LOOP +#cmakedefine SUNDIALS_ONEMKL_USE_GETRS_LOOP + +/* SUPERLUMT threading type */ #define SUNDIALS_SUPERLUMT_THREAD_TYPE "@SUPERLUMT_THREAD_TYPE@" - /* Trilinos with MPI is available, then - * #define SUNDIALS_TRILINOS_HAVE_MPI - */ +/* Trilinos with MPI is available, then + * #define SUNDIALS_TRILINOS_HAVE_MPI + */ #cmakedefine SUNDIALS_TRILINOS_HAVE_MPI /* RAJA backends */ @@ -143,6 +147,10 @@ #cmakedefine SUNDIALS_RAJA_BACKENDS_HIP #cmakedefine SUNDIALS_RAJA_BACKENDS_SYCL +/* SYCL options */ +#cmakedefine SUNDIALS_SYCL_2020_UNSUPPORTED + + /* ------------------------------------------------------------------ * SUNDIALS modules enabled * -----------------------------------------------------------------*/ diff --git a/scripts/cvode b/scripts/cvode index 57261482ba..af97c77f74 100755 --- a/scripts/cvode +++ b/scripts/cvode @@ -221,6 +221,10 @@ $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.cpp $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.CUDA.out $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.OPENMP.out $tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos.SERIAL.out +$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.cpp +$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.CUDA.out +$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.OPENMP.out +$tar $tarfile $distrobase/examples/cvode/kokkos/cv_bruss_batched_kokkos_2D.SERIAL.out $tar $tarfile $distrobase/examples/cvode/magma/README $tar $tarfile $distrobase/examples/cvode/magma/CMakeLists.txt diff --git a/src/arkode/arkode_arkstep.c b/src/arkode/arkode_arkstep.c index ef38649f90..8b5bde3a1b 100644 
--- a/src/arkode/arkode_arkstep.c +++ b/src/arkode/arkode_arkstep.c @@ -1924,6 +1924,9 @@ int arkStep_SetButcherTables(ARKodeMem ark_mem) switch (step_mem->q) { case(2): + etable = ARKSTEP_DEFAULT_ARK_ETABLE_2; + itable = ARKSTEP_DEFAULT_ARK_ITABLE_2; + break; case(3): etable = ARKSTEP_DEFAULT_ARK_ETABLE_3; itable = ARKSTEP_DEFAULT_ARK_ITABLE_3; diff --git a/src/arkode/arkode_butcher_dirk.c b/src/arkode/arkode_butcher_dirk.c index d1346acdd4..b74a543a52 100644 --- a/src/arkode/arkode_butcher_dirk.c +++ b/src/arkode/arkode_butcher_dirk.c @@ -66,8 +66,8 @@ ARKODE_DIRKTableID arkButcherTableDIRKNameToID(const char *imethod) { #undef ARK_BUTCHER_TABLE arkProcessError(NULL, ARK_ILL_INPUT, "ARKODE", - "arkButcherTableDIRKNameToID", - "Unknown Butcher table"); + "arkButcherTableDIRKNameToID", + "Unknown Butcher table"); return ARKODE_DIRK_NONE; } diff --git a/src/arkode/arkode_butcher_dirk.def b/src/arkode/arkode_butcher_dirk.def index f9e63023ab..a3cca75f66 100644 --- a/src/arkode/arkode_butcher_dirk.def +++ b/src/arkode/arkode_butcher_dirk.def @@ -57,7 +57,8 @@ ARKODE_ARK548L2SAb_DIRK_8_4_5* ESDIRK Y Y N ARKODE_ESDIRK547L2SA_7_4_5 ESDIRK Y Y N ARKODE_ESDIRK547L2SA2_7_4_5 ESDIRK Y Y N - ----------------------------------------------------------------- + ARKODE_ARK2_DIRK_3_1_2 ESDIRK Y Y Y + ----------------------------------------------------------------- */ ARK_BUTCHER_TABLE(ARKODE_DIRK_NONE, { @@ -68,22 +69,56 @@ ARK_BUTCHER_TABLE(ARKODE_SDIRK_2_1_2, { /* SDIRK-2-1 (A,B stable) */ ARKodeButcherTable B = ARKodeButcherTable_Alloc(2, SUNTRUE); B->q = 2; B->p = 1; - + B->A[0][0] = RCONST(1.0); B->A[1][0] = RCONST(-1.0); B->A[1][1] = RCONST(1.0); - + B->b[0] = RCONST(0.5); B->b[1] = RCONST(0.5); - + B->d[0] = RCONST(1.0); - B->c[0] = RCONST(1.0); B->c[1] = RCONST(0.0); return B; }) +ARK_BUTCHER_TABLE(ARKODE_ARK2_DIRK_3_1_2, { /* ARK2 Implicit Table (A,L stable) */ + ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE); + + /* 1 - 1 / sqrt(2) */ + const sunrealtype 
gamma = SUN_RCONST(1.0) - SUN_RCONST(1.0) / SUNRsqrt(SUN_RCONST(2.0)); + /* 1 / (2 sqrt(2)) */ + const sunrealtype delta = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0))); + /* 2 - sqrt(2) */ + const sunrealtype twogamma = SUN_RCONST(2.0) - SUNRsqrt(SUN_RCONST(2.0)); + /* (4 - sqrt(2)) / 8 */ + const sunrealtype beta = (SUN_RCONST(4.0) - SUNRsqrt(SUN_RCONST(2.0))) / SUN_RCONST(8.0); + + B->q = 2; + B->p = 1; + + B->A[1][0] = gamma; + B->A[1][1] = gamma; + B->A[2][0] = delta; + B->A[2][1] = delta; + B->A[2][2] = gamma; + + B->b[0] = delta; + B->b[1] = delta; + B->b[2] = gamma; + + B->d[0] = beta; + B->d[1] = beta; + B->d[2] = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0))); + + B->c[1] = twogamma; + B->c[2] = SUN_RCONST(1.0); + + return B; + }) + ARK_BUTCHER_TABLE(ARKODE_BILLINGTON_3_3_2, { /* Billington-SDIRK */ ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE); @@ -620,7 +655,7 @@ ARK_BUTCHER_TABLE(ARKODE_ESDIRK324L2SA_4_2_3, { /* ESDIRK3(2)4L[2]SA (A,L stable const sunrealtype g4 = g3 * g; const sunrealtype g5 = g4 * g; const sunrealtype c3 = RCONST(0.6); - + ARKodeButcherTable B = ARKodeButcherTable_Alloc(4, SUNTRUE); B->q = 3; B->p = 2; diff --git a/src/arkode/arkode_butcher_erk.def b/src/arkode/arkode_butcher_erk.def index 43a8fe7210..f25fafb830 100644 --- a/src/arkode/arkode_butcher_erk.def +++ b/src/arkode/arkode_butcher_erk.def @@ -52,6 +52,7 @@ ARKODE_ARK548L2SAb_ERK_8_4_5* N ARKODE_VERNER_8_5_6 Y ARKODE_FEHLBERG_13_7_8 Y + ARKODE_ARK2_ERK_3_1_2 Y -------------------------------- ARKODE_KNOTH_WOLKE_3_3^ Y -------------------------------- @@ -75,7 +76,42 @@ ARK_BUTCHER_TABLE(ARKODE_HEUN_EULER_2_1_2, { /* Heun-Euler-ERK */ B->c[1] = RCONST(1.0); return B; - }) + }) + +ARK_BUTCHER_TABLE(ARKODE_ARK2_ERK_3_1_2, { /* ARK2 Explicit Table */ + ARKodeButcherTable B = ARKodeButcherTable_Alloc(3, SUNTRUE); + + /* 1 - 1 / sqrt(2) */ + const sunrealtype gamma = SUN_RCONST(1.0) - SUN_RCONST(1.0) / SUNRsqrt(SUN_RCONST(2.0)); + /* 
(3 + 2 sqrt(2)) / 6 */ + const sunrealtype alpha = (SUN_RCONST(3.0) + SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0))) / SUN_RCONST(6.0); + /* 1 / (2 sqrt(2)) */ + const sunrealtype delta = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0))); + /* 2 - sqrt(2) */ + const sunrealtype twogamma = SUN_RCONST(2.0) - SUNRsqrt(SUN_RCONST(2.0)); + /* (4 - sqrt(2)) / 8 */ + const sunrealtype beta = (SUN_RCONST(4.0) - SUNRsqrt(SUN_RCONST(2.0))) / SUN_RCONST(8.0); + + B->q = 2; + B->p = 1; + + B->A[1][0] = twogamma; + B->A[2][0] = SUN_RCONST(1.0) - alpha; + B->A[2][1] = alpha; + + B->b[0] = delta; + B->b[1] = delta; + B->b[2] = gamma; + + B->d[0] = beta; + B->d[1] = beta; + B->d[2] = SUN_RCONST(1.0) / (SUN_RCONST(2.0) * SUNRsqrt(SUN_RCONST(2.0))); + + B->c[1] = twogamma; + B->c[2] = SUN_RCONST(1.0); + + return B; + }) ARK_BUTCHER_TABLE(ARKODE_BOGACKI_SHAMPINE_4_2_3, { /* Bogacki-Shampine-ERK */ ARKodeButcherTable B = ARKodeButcherTable_Alloc(4, SUNTRUE); @@ -684,4 +720,3 @@ ARK_BUTCHER_TABLE(ARKODE_KNOTH_WOLKE_3_3, { /* Knoth-Wolke-ERK */ B->c[2] = RCONST(3.0)/RCONST(4.0); return B; }) - diff --git a/src/cvode/cvode_ls.c b/src/cvode/cvode_ls.c index a32679ab3c..a9d986c193 100644 --- a/src/cvode/cvode_ls.c +++ b/src/cvode/cvode_ls.c @@ -1636,7 +1636,8 @@ int cvLsSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight, N_Vector ynow, N_Vector fnow) { CVLsMem cvls_mem; - realtype bnorm, deltar, delta, w_mean; + realtype bnorm = ZERO; + realtype deltar, delta, w_mean; int curiter, nli_inc, retval; #if SUNDIALS_LOGGING_LEVEL >= SUNDIALS_LOGGING_DEBUG realtype resnorm; diff --git a/src/cvodes/cvodes_ls.c b/src/cvodes/cvodes_ls.c index b7477bebca..14d12a9bc6 100644 --- a/src/cvodes/cvodes_ls.c +++ b/src/cvodes/cvodes_ls.c @@ -1723,7 +1723,8 @@ int cvLsSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight, N_Vector ynow, N_Vector fnow) { CVLsMem cvls_mem; - realtype bnorm, deltar, delta, w_mean; + realtype bnorm = ZERO; + realtype deltar, delta, w_mean; int curiter, nli_inc, retval; 
booleantype do_sensi_sim, do_sensi_stg, do_sensi_stg1; #if SUNDIALS_LOGGING_LEVEL >= SUNDIALS_LOGGING_DEBUG diff --git a/src/sundials/sundials_futils.c b/src/sundials/sundials_futils.c index 194a917ae2..7d32c597db 100644 --- a/src/sundials/sundials_futils.c +++ b/src/sundials/sundials_futils.c @@ -15,15 +15,37 @@ * -----------------------------------------------------------------*/ #include +#include /* Create a file pointer with the given file name and mode. */ FILE* SUNDIALSFileOpen(const char* filename, const char* mode) { - return fopen(filename, mode); + FILE* fp = NULL; + + if (filename) + { + if (!strcmp(filename, "stdout")) + { + fp = stdout; + } + else if (!strcmp(filename, "stderr")) + { + fp = stderr; + } + else + { + fp = fopen(filename, mode); + } + } + + return fp; } /* Close a file pointer with the given file name. */ void SUNDIALSFileClose(FILE* fp) { - fclose(fp); + if (fp && (fp != stdout) && (fp != stderr)) + { + fclose(fp); + } } diff --git a/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp b/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp index d1c7165ed0..72e87f53f9 100644 --- a/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp +++ b/src/sunlinsol/onemkldense/sunlinsol_onemkldense.cpp @@ -180,6 +180,13 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun if (num_blocks > 1) { +#ifdef SUNDIALS_ONEMKL_USE_GETRF_LOOP + LS_F_SCRATCH_SIZE(S) = + getrf_scratchpad_size(*queue, // device queue + M, // rows in A_i + N, // columns in A_i + M); // leading dimension +#else LS_F_SCRATCH_SIZE(S) = getrf_batch_scratchpad_size(*queue, // device queue M, // rows in A_i @@ -188,8 +195,17 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun M * N, // stride between A_i M, // stride in P_i num_blocks); // number of blocks +#endif -#ifdef SUNDIALS_ONEMKL_USE_GETRS_BATCHED +#ifdef SUNDIALS_ONEMKL_USE_GETRS_LOOP + LS_S_SCRATCH_SIZE(S) = + getrs_scratchpad_size(*queue, // device queue + 
oneapi::mkl::transpose::nontrans, + M, // number of rows in A + 1, // number of right-hand sides + M, // leading dimension of A + M); // leading dimension of B +#else LS_S_SCRATCH_SIZE(S)= getrs_batch_scratchpad_size(*queue, // device queue oneapi::mkl::transpose::nontrans, @@ -201,14 +217,6 @@ SUNLinearSolver SUNLinSol_OneMklDense(N_Vector y, SUNMatrix Amat, SUNContext sun M, // leading dimension of B_i M, // stride between B_i num_blocks); // number of blocks -#else - LS_S_SCRATCH_SIZE(S) = - getrs_scratchpad_size(*queue, // device queue - oneapi::mkl::transpose::nontrans, - M, // number of rows in A - 1, // number of right-hand sizes - M, // leading dimension of A - M); // leading dimension of B #endif } else @@ -326,6 +334,36 @@ int SUNLinSolSetup_OneMklDense(SUNLinearSolver S, SUNMatrix A) if (num_blocks > 1) { +#ifdef SUNDIALS_ONEMKL_USE_GETRF_LOOP + try + { + for (sunindextype i = 0; i < num_blocks; i++) + { + getrf(*queue, // device queue + M, // number of rows + N, // number of columns + Adata + i * M * N, // matrix data + M, // leading dimension of A + pivots + i * M, // array of pivots + scratchpad, // scratchpad memory + scratch_size); // scratchpad size + } + } + catch(oneapi::mkl::lapack::exception const& e) + { + SUNDIALS_DEBUG_ERROR("An exception occured in getrf\n"); + if (e.info()) + { + // An illegal value was provided or the scratch pad is too small + ier = -1; + } + else + { + // The diagonal element of some of U_i is zero + ier = 1; + } + } +#else try { getrf_batch(*queue, // device queue @@ -354,6 +392,7 @@ int SUNLinSolSetup_OneMklDense(SUNLinearSolver S, SUNMatrix A) ier = 1; } } +#endif } else { @@ -467,7 +506,30 @@ int SUNLinSolSolve_OneMklDense(SUNLinearSolver S, SUNMatrix A, N_Vector x, if (num_blocks > 1) { -#ifdef SUNDIALS_ONEMKL_USE_GETRS_BATCHED +#ifdef SUNDIALS_ONEMKL_USE_GETRS_LOOP + try + { + for (sunindextype i = 0; i < num_blocks; i++) + { + getrs(*queue, // device queue + oneapi::mkl::transpose::nontrans, + M, // number of
rows + 1, // number of right-hand sides + Adata + i * M * N, // factorized matrix data + M, // leading dimension of A + pivots + i * M, // array of pivots + xdata + i * M, // right-hand side data + M, // leading dimension of B_i + scratchpad, // scratchpad memory + scratch_size); // scratchpad size + } + } + catch(oneapi::mkl::lapack::exception const& e) + { + SUNDIALS_DEBUG_ERROR("An exception occured in getrs\n"); + ier = -1; + } +#else try { getrs_batch(*queue, // device queue @@ -491,29 +553,6 @@ int SUNLinSolSolve_OneMklDense(SUNLinearSolver S, SUNMatrix A, N_Vector x, SUNDIALS_DEBUG_ERROR("An exception occured in getrs_batch\n"); ier = -1; } -#else - try - { - for (sunindextype i = 0; i < num_blocks; i++) - { - getrs(*queue, // device queue - oneapi::mkl::transpose::nontrans, - M, // number of rows - 1, // number of right-hand sides - Adata + i * M * N, // factorized matrix data - M, // leading dimension of A - pivots, // array of pivots - xdata + i * M, // right-hand side data - M, // leading dimension of B_i - scratchpad, // scratchpad memory - scratch_size); // scratchpad size - } - } - catch(oneapi::mkl::lapack::exception const& e) - { - SUNDIALS_DEBUG_ERROR("An exception occured in getrs\n"); - ier = -1; - } #endif } else diff --git a/test/answers b/test/answers index 72fd01e63e..96d6e170c1 160000 --- a/test/answers +++ b/test/answers @@ -1 +1 @@ -Subproject commit 72fd01e63edeffe39a800c820ac8aa8447270bf7 +Subproject commit 96d6e170c15f997d1e9062d4e6478e618d3f30ca diff --git a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp index d89ecacc39..8ad033603d 100644 --- a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp +++ b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.cpp @@ -29,13 +29,13 @@ int main() { // set vectors of individual tables to test - std::vector Tables_ERK = {"ARKODE_HEUN_EULER_2_1_2", + std::vector Tables_ERK = {"ARKODE_HEUN_EULER_2_1_2", "ARKODE_ARK2_ERK_3_1_2",
"ARKODE_BOGACKI_SHAMPINE_4_2_3", "ARKODE_ARK324L2SA_ERK_4_2_3", "ARKODE_ZONNEVELD_5_3_4", "ARKODE_ARK436L2SA_ERK_6_3_4", "ARKODE_SAYFY_ABURUB_6_3_4", "ARKODE_CASH_KARP_6_4_5", "ARKODE_FEHLBERG_6_4_5", "ARKODE_DORMAND_PRINCE_7_4_5", "ARKODE_ARK548L2SA_ERK_8_4_5", "ARKODE_VERNER_8_5_6", "ARKODE_FEHLBERG_13_7_8", "ARKODE_ARK437L2SA_ERK_7_3_4", "ARKODE_ARK548L2SAb_ERK_8_4_5"}; - std::vector Tables_DIRK = {"ARKODE_SDIRK_2_1_2", + std::vector Tables_DIRK = {"ARKODE_SDIRK_2_1_2", "ARKODE_ARK2_DIRK_3_1_2", "ARKODE_BILLINGTON_3_3_2", "ARKODE_TRBDF2_3_3_2", "ARKODE_KVAERNO_4_2_3", "ARKODE_ARK324L2SA_DIRK_4_2_3", "ARKODE_CASH_5_2_4", "ARKODE_CASH_5_3_4", "ARKODE_SDIRK_5_3_4", "ARKODE_KVAERNO_5_3_4", "ARKODE_ARK436L2SA_DIRK_6_3_4", @@ -44,15 +44,15 @@ int main() { "ARKODE_ESDIRK324L2SA_4_2_3", "ARKODE_ESDIRK325L2SA_5_2_3", "ARKODE_ESDIRK32I5L2SA_5_2_3", "ARKODE_ESDIRK436L2SA_6_3_4", "ARKODE_ESDIRK43I6L2SA_6_3_4", "ARKODE_QESDIRK436L2SA_6_3_4", "ARKODE_ESDIRK437L2SA_7_3_4", "ARKODE_ESDIRK547L2SA_7_4_5", "ARKODE_ESDIRK547L2SA2_7_4_5"}; - std::vector Tables_ARK_ERK = {ARKODE_ARK324L2SA_ERK_4_2_3, + std::vector Tables_ARK_ERK = {ARKODE_ARK2_ERK_3_1_2, ARKODE_ARK324L2SA_ERK_4_2_3, ARKODE_ARK436L2SA_ERK_6_3_4, ARKODE_ARK437L2SA_ERK_7_3_4, ARKODE_ARK548L2SA_ERK_8_4_5, ARKODE_ARK548L2SAb_ERK_8_4_5}; - std::vector Tables_ARK_DIRK = {ARKODE_ARK324L2SA_DIRK_4_2_3, + std::vector Tables_ARK_DIRK = {ARKODE_ARK2_DIRK_3_1_2, ARKODE_ARK324L2SA_DIRK_4_2_3, ARKODE_ARK436L2SA_DIRK_6_3_4, ARKODE_ARK437L2SA_DIRK_7_3_4, ARKODE_ARK548L2SA_DIRK_8_4_5, ARKODE_ARK548L2SAb_DIRK_8_4_5}; - std::vector STables_ARK = {"ARKODE_ARK324L2SA_4_2_3", "ARKODE_ARK436L2SA_6_3_4", - "ARKODE_ARK437L2SA_7_3_4", "ARKODE_ARK548L2SA_8_4_5", - "ARKODE_ARK548L2SAb_8_4_5"}; + std::vector STables_ARK = {"ARKODE_ARK2_3_1_2", "ARKODE_ARK324L2SA_4_2_3", + "ARKODE_ARK436L2SA_6_3_4", "ARKODE_ARK437L2SA_7_3_4", + "ARKODE_ARK548L2SA_8_4_5", "ARKODE_ARK548L2SAb_8_4_5"}; int numfails = 0; // loop over individual ERK tables diff --git 
a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out index 054defea65..8d7971338b 100644 --- a/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out +++ b/test/unit_tests/arkode/CXX_serial/ark_test_butcher.out @@ -2,6 +2,7 @@ Testing individual ERK methods: Testing method ARKODE_HEUN_EULER_2_1_2: table matches predicted method/embedding orders of 2/1 +Testing method ARKODE_ARK2_ERK_3_1_2: table matches predicted method/embedding orders of 2/1 Testing method ARKODE_BOGACKI_SHAMPINE_4_2_3: table matches predicted method/embedding orders of 3/2 Testing method ARKODE_ARK324L2SA_ERK_4_2_3: table matches predicted method/embedding orders of 3/2 Testing method ARKODE_ZONNEVELD_5_3_4: table matches predicted method/embedding orders of 4/3 @@ -25,6 +26,7 @@ Testing method ARKODE_ARK548L2SAb_ERK_8_4_5: table matches predicted method/emb Testing individual DIRK methods: Testing method ARKODE_SDIRK_2_1_2: table matches predicted method/embedding orders of 2/1 +Testing method ARKODE_ARK2_DIRK_3_1_2: table matches predicted method/embedding orders of 2/1 Testing method ARKODE_BILLINGTON_3_3_2: table matches predicted method/embedding orders of 2/3 Testing method ARKODE_TRBDF2_3_3_2: table matches predicted method/embedding orders of 2/3 Testing method ARKODE_KVAERNO_4_2_3: table matches predicted method/embedding orders of 3/2 @@ -50,6 +52,7 @@ Testing method ARKODE_ESDIRK547L2SA2_7_4_5: table matches predicted method/embe Testing ARK pairs: +Testing method ARKODE_ARK2_3_1_2: Method/embedding match predicted orders of 2/1 Testing method ARKODE_ARK324L2SA_4_2_3: Method/embedding match predicted orders of 3/2 Testing method ARKODE_ARK436L2SA_6_3_4: Method/embedding match predicted orders of 4/3 Testing method ARKODE_ARK437L2SA_7_3_4: Method/embedding match predicted orders of 4/3 diff --git a/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp b/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp index 
325d8654d1..52e1dab5dd 100644 --- a/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp +++ b/test/unit_tests/sunmemory/sycl/test_sunmemory_sycl.cpp @@ -20,7 +20,7 @@ int test_instance(SUNMemoryHelper helper, SUNMemoryType mem_type, bool print_test_status) { // Create an in-order GPU queue -#if SYCL_LANGUAGE_VERSION >= 2020 +#if SYCL_LANGUAGE_VERSION >= 2020 && !defined(SUNDIALS_SYCL_2020_UNSUPPORTED) sycl::queue myQueue(sycl::gpu_selector_v, sycl::property_list{sycl::property::queue::in_order{}}); #else