diff --git a/.github/_typos.toml b/.github/_typos.toml new file mode 100644 index 00000000000..4b9d9be6403 --- /dev/null +++ b/.github/_typos.toml @@ -0,0 +1,23 @@ +[files] +extend-exclude = ["third_party/*", "*.svg"] + +[default.extend-words] +dout = "dout" +nd = "nd" +tht = "tht" +automatical = "automatical" +strat = "strat" +entrie = "entrie" +agregate = "agregate" # since that script name is already in ginkgo-data repo + +[default.extend-identifiers] +set_complex_subpsace = "set_complex_subpsace" # remove when deprecated function is gone +HSA_HEADER = "HSA_HEADER" +conj_operaton = "conj_operaton" # considered interface break in range.hpp +imag_operaton = "imag_operaton" # considered interface break in range.hpp +real_operaton = "real_operaton" # considered interface break in range.hpp +one_operaton = "one_operaton" # considered interface break in range.hpp +abs_operaton = "abs_operaton" # considered interface break in range.hpp +max_operaton = "max_operaton" # considered interface break in range.hpp +min_operaton = "min_operaton" # considered interface break in range.hpp +squared_norm_operaton = "squared_norm_operaton" # considered interface break in range.hpp diff --git a/.github/workflows/bot-pr-updated.yml b/.github/workflows/bot-pr-updated.yml index ae357c9db96..8554ca3b1e9 100644 --- a/.github/workflows/bot-pr-updated.yml +++ b/.github/workflows/bot-pr-updated.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest if: github.event.pull_request.author_association == 'COLLABORATOR' || github.event.pull_request.author_association == 'MEMBER' || github.event.pull_request.author_association == 'OWNER' env: - CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=DEBUG -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF + CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=DEBUG 
-DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_SYCL=OFF steps: - name: Checkout the new code (shallow clone) uses: actions/checkout@v3 diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index e612c72b7e7..db18b510e21 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -21,7 +21,8 @@ jobs: fail-fast: false matrix: config: - - {build_type: "Release", name: "intel/release/shared", "mixed": "ON"} + - {compiler: "dpcpp", build_type: "Release", name: "intel/dpcpp/release/shared", mixed: "ON"} + - {compiler: "icpx", build_type: "Release", name: "intel/icpx/release/shared", mixed: "OFF"} name: ${{ matrix.config.name }} runs-on: [gpu_intel] @@ -35,9 +36,9 @@ jobs: spack find --loaded mkdir build cd build - cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON + cmake .. 
-DCMAKE_INSTALL_PREFIX=install_ginkgo -DGINKGO_COMPILER_FLAGS="-ffp-model=precise" -DCMAKE_CXX_COMPILER=${{ matrix.config.compiler }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON make -j8 - SYCL_DEVICE_FILTER=level_zero ctest -j10 --output-on-failure + ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure - name: install run: | diff --git a/.github/workflows/spell_check.yml b/.github/workflows/spell_check.yml new file mode 100644 index 00000000000..0049dce9180 --- /dev/null +++ b/.github/workflows/spell_check.yml @@ -0,0 +1,16 @@ +name: Test GitHub Action +on: + pull_request: + types: [opened, synchronize] + +jobs: + run: + name: Spell Check with Typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check for typos + uses: crate-ci/typos@master + with: + config: .github/_typos.toml + diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml index f7d73e2fd82..87f9214876d 100644 --- a/.github/workflows/windows-msvc-ref.yml +++ b/.github/workflows/windows-msvc-ref.yml @@ -27,8 +27,10 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} + # Debug shared exceeds symbol limit + # - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} - {shared: "OFF", build_type: "Release", name: "reference/release/static"} + - {shared: "ON", build_type: "Release", name: "reference/release/shared"} # Debug static needs too much storage # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} name: msvc/${{ matrix.config.name }} diff --git a/.gitignore b/.gitignore index af0a88ef513..827f4025a2e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ compile_commands.json CTestTestfile.cmake build +### Python +__pycache__ + ### IDE # Clion .idea diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f81e271288c..ab78943a409 100644 --- 
a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,6 +19,13 @@ include: - local: '.gitlab/rules.yml' - local: '.gitlab/scripts.yml' - local: '.gitlab/variables.yml' + # This is a workaround to conditionally make the branch pipelines + # interruptible, because the flag does not directly support rules [1]. + # + # [1] https://gitlab.com/gitlab-org/gitlab/-/issues/194023#note_1225906002 + - local: '.gitlab/add-interrupt.yml' + rules: + - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" && $CI_COMMIT_TAG !~ /^v\d+\.\d+\.\d+/ sync: stage: sync @@ -86,20 +93,6 @@ trigger_pipeline: # Build jobs # Job with example runs. -# cuda 9.2 and friends -build/cuda92/nompi/gcc/all/release/shared: - extends: - - .build_and_test_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_HIP: "ON" - BUILD_TYPE: "Release" - CUDA_ARCH: 61 - # cuda 10.1 and friends # Build CUDA NVIDIA without omp # Make sure that our jobs run when HWLOC is @@ -120,6 +113,7 @@ build/cuda101/nompi/clang/cuda_wo_omp/release/shared: CUDA_ARCH: 35 # Job with example runs. 
+# Also explicitly test PAPI SDE build/cuda101/openmpi/gcc/all/debug/shared: extends: - .build_template @@ -133,6 +127,7 @@ build/cuda101/openmpi/gcc/all/debug/shared: MPI_AS_ROOT: "ON" BUILD_HIP: "ON" BUILD_TYPE: "Debug" + BUILD_PAPI_SDE: "ON" RUN_EXAMPLES: "ON" CUDA_ARCH: 35 @@ -169,7 +164,6 @@ build/cuda101/nompi/clang/all/release/static: # MPI_AS_ROOT: "ON" # BUILD_HIP: "OFF" # BUILD_TYPE: "Release" -# CUDA_ARCH: 61 #build/clang-cuda101/nompi/clang/cuda/debug/static: @@ -187,7 +181,6 @@ build/cuda101/nompi/clang/all/release/static: # BUILD_TYPE: "Debug" # FAST_TESTS: "ON" # BUILD_SHARED_LIBS: "OFF" -# CUDA_ARCH: 61 # cuda 10.2 and friends @@ -272,6 +265,8 @@ test/cuda110/mvapich2/gcc/cuda/debug/shared: SLURM_GRES: "gpu:4" SLURM_TIME: "02:00:00" dependencies: null + # FIXME: current slurm always reports failure even if all tests are passed. + allow_failure: yes needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ] @@ -302,9 +297,11 @@ test/cuda110/nompi/clang/cuda/release/static: variables: USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}" SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:1" + SLURM_GRES: "gpu:4" SLURM_TIME: "01:30:00" dependencies: null + # FIXME: current slurm always reports failure even if all tests are passed. + allow_failure: yes needs: [ "build/cuda110/nompi/clang/cuda/release/static" ] @@ -336,9 +333,11 @@ test/cuda110/nompi/intel/cuda/debug/static: variables: USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}" SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:1" + SLURM_GRES: "gpu:4" SLURM_TIME: "02:00:00" dependencies: null + # FIXME: current slurm always reports failure even if all tests are passed. 
+ allow_failure: yes needs: [ "build/cuda110/nompi/intel/cuda/debug/static" ] @@ -348,7 +347,7 @@ build/cuda114/nompi/gcc/cuda/debug/shared: - .build_and_test_template - .default_variables - .quick_test_condition - - .use_gko_cuda114-openmpi-gnu11-llvm12 + - .use_gko_cuda114-openmpi-gnu10-llvm12 variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" @@ -358,7 +357,6 @@ build/cuda114/nompi/gcc/cuda/debug/shared: CXX_FLAGS: "-Wno-error=maybe-uninitialized" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - CUDA_ARCH: 61 # nvhpc and friends @@ -381,7 +379,6 @@ build/nvhpc233/cuda120/nompi/nvcpp/release/static: CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - CUDA_ARCH: 61 build/nvhpc227/cuda117/nompi/nvcpp/debug/shared: extends: @@ -401,7 +398,6 @@ build/nvhpc227/cuda117/nompi/nvcpp/debug/shared: CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced" # disable spurious unused argument warning EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" - CUDA_ARCH: 61 # ROCm 4.5 and friends build/amd/nompi/gcc/rocm45/release/shared: @@ -538,24 +534,13 @@ build/nocuda/openmpi/clang/omp/debug/static: FAST_TESTS: "ON" BUILD_SHARED_LIBS: "OFF" -test/nocuda/openmpi/clang/omp/debug/static: - extends: - - .build_and_test_template - - .default_variables - - .full_test_condition - - .use_gko-nocuda-openmpi-gnu9-llvm8 - variables: - USE_NAME: "nocuda-openmpi-clang-${CI_PIPELINE_ID}" - dependencies: null - needs: [ "build/nocuda/openmpi/clang/omp/debug/static" ] - # nocuda with the oldest supported compiler build/nocuda/nompi/gcc/omp/release/static: extends: - .build_and_test_template - .default_variables - .quick_test_condition - - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018 + - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019 variables: BUILD_OMP: "ON" BUILD_TYPE: "Release" 
@@ -566,7 +551,7 @@ build/nocuda-nomixed/nompi/clang/omp/release/static: - .build_and_test_template - .default_variables - .full_test_condition - - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018 + - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019 variables: C_COMPILER: "clang" CXX_COMPILER: "clang++" @@ -602,7 +587,7 @@ build/nocuda-nomixed/nompi/clang/omp/debug/static: BUILD_SHARED_LIBS: "OFF" MIXED_PRECISION: "OFF" -build/dpcpp/2022-1/cpu/release/static: +build/dpcpp/2022-1/cpu/release/shared: extends: - .build_and_test_template - .default_variables @@ -611,10 +596,11 @@ build/dpcpp/2022-1/cpu/release/static: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" - SYCL_DEVICE_TYPE: "CPU" + SYCL_DEVICE_FILTER: "*:cpu" SLURM_PARTITION: "cpu" SLURM_TIME: "2:00:00" # This job is not in exclusive mode @@ -629,11 +615,12 @@ build/dpcpp/igpu/release/shared: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_TYPE: "GPU" + ONEAPI_DEVICE_SELECTOR: "*:gpu" # TODO: Enable when debug shared library size issues are fixed # build/dpcpp/level_zero_igpu/debug/shared: @@ -645,11 +632,12 @@ build/dpcpp/igpu/release/shared: # variables: # C_COMPILER: "gcc" # CXX_COMPILER: "dpcpp" -# BUILD_DPCPP: "ON" +# BUILD_SYCL: "ON" +# GKO_COMPILER_FLAGS: "-ffp-model=precise" # BUILD_TYPE: "Debug" # BUILD_SHARED_LIBS: "ON" # DPCPP_SINGLE_MODE: "ON" -# SYCL_DEVICE_FILTER: "Level_Zero:GPU" +# ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" # It gives two available backends of GPU on tests build/dpcpp/dgpu/release/static: @@ -661,11 +649,12 @@ build/dpcpp/dgpu/release/static: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: 
"Release" BUILD_SHARED_LIBS: "OF" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_TYPE: "GPU" + ONEAPI_DEVICE_SELECTOR: "*:gpu" build/dpcpp/level_zero_dgpu/release/shared: extends: @@ -676,10 +665,26 @@ build/dpcpp/level_zero_dgpu/release/shared: variables: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" - BUILD_DPCPP: "ON" + BUILD_SYCL: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" + BUILD_TYPE: "Release" + DPCPP_SINGLE_MODE: "ON" + ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" + +build/icpx/level_zero_dgpu/release/shared: + extends: + - .build_and_test_template + - .default_variables + - .quick_test_condition + - .use_gko-oneapi-dgpu + variables: + C_COMPILER: "icx" + CXX_COMPILER: "icpx" + BUILD_SYCL: "ON" + GKO_COMPILER_FLAGS: "-ffp-model=precise" BUILD_TYPE: "Release" DPCPP_SINGLE_MODE: "ON" - SYCL_DEVICE_FILTER: "Level_Zero:GPU" + ONEAPI_DEVICE_SELECTOR: "level_zero:gpu" # Job with important warnings as error warnings: @@ -694,6 +699,7 @@ warnings: BUILD_CUDA: "ON" BUILD_HIP: "ON" CXX_FLAGS: "-Werror=pedantic -pedantic-errors" + GKO_COMPILER_FLAGS: "-Wpedantic" allow_failure: yes # Ensure kernel modules do not depend on core @@ -818,6 +824,7 @@ sonarqube_cov: # Deploy documentation to github-pages gh-pages: stage: deploy + interruptible: false extends: - .default_variables - .deploy_condition @@ -833,7 +840,7 @@ gh-pages: -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF - -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF + -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_SYCL=OFF -DGINKGO_BUILD_MPI=OFF -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON - make usr @@ -921,6 +928,7 @@ cudamemcheck: new-issue-on-failure: stage: on-failure + interruptible: false extends: - .default_variables - .use_status-job-settings diff --git a/.gitlab/add-interrupt.yml 
b/.gitlab/add-interrupt.yml new file mode 100644 index 00000000000..cf6fd95fe1e --- /dev/null +++ b/.gitlab/add-interrupt.yml @@ -0,0 +1,2 @@ +default: + interruptible: true diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 50dfbe9d2f8..eb1ab5128af 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -17,19 +17,13 @@ - cpu - amdci -.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018: - image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2018 +.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019: + image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2019 tags: - private_ci - cpu - controller -.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017: - image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017 - tags: - - private_ci - - nvidia-gpu - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019: image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019 tags: @@ -56,8 +50,8 @@ - private_ci - horeka -.use_gko_cuda114-openmpi-gnu11-llvm12: - image: ginkgohub/cuda:114-openmpi-gnu11-llvm12 +.use_gko_cuda114-openmpi-gnu10-llvm12: + image: ginkgohub/cuda:114-openmpi-gnu10-llvm12 tags: - private_ci - nvidia-gpu @@ -78,15 +72,13 @@ image: ginkgohub/rocm:45-mvapich2-gnu8-llvm8 tags: - private_ci - - amdci - - gpu + - amd-gpu .use_gko-rocm502-nompi-gnu11-llvm11: image: ginkgohub/rocm:502-openmpi-gnu11-llvm11 tags: - private_ci - - amdci - - gpu + - amd-gpu .use_gko-oneapi-cpu: image: ginkgohub/oneapi:2022.1 diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 537f2e5e83e..504aa7dad40 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -22,8 +22,7 @@ script: - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} - if [ -n "${CUDA_ARCH}" ]; then - CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; - CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER}); + export CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; fi - if [[ "${MPI_AS_ROOT}" == "ON" ]];then export OMPI_ALLOW_RUN_AS_ROOT=1; @@ -32,17 +31,19 @@ - if [[ "${BUILD_MPI}" == "ON" ]]; then 
MPI_STR=-DGINKGO_MPI_EXEC_SUFFIX=${MPI_SUFFIX}; fi + - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER} - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX} -GNinja - -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} + -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} - -DGINKGO_BUILD_HIP=${BUILD_HIP} + -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL} -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR} -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} + -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} -DGINKGO_TEST_NONDEFAULT_STREAM=${NONDEFAULT_STREAM} @@ -52,6 +53,7 @@ -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE} -DGINKGO_EXPORT_BUILD_DIR=${EXPORT_BUILD_DIR} - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install + - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr - if [ "${EXPORT_BUILD_DIR}" == "ON" ]; then ninja test_exportbuild; fi - LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ninja test_pkgconfig dependencies: [] @@ -63,11 +65,11 @@ script: - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME} - if [ -n "${CUDA_ARCH}" ]; then - CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; - CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER}); + export CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH}; fi - if [ -n "${SYCL_DEVICE_TYPE}" ]; then export SYCL_DEVICE_TYPE; fi - if [ -n "${SYCL_DEVICE_FILTER}" ]; then export SYCL_DEVICE_FILTER; fi + - if [ -n "${ONEAPI_DEVICE_SELECTOR}" ]; then export 
ONEAPI_DEVICE_SELECTOR; fi - if [[ "${MPI_AS_ROOT}" == "ON" ]];then export OMPI_ALLOW_RUN_AS_ROOT=1; export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1; @@ -75,17 +77,18 @@ - if [[ "${BUILD_MPI}" == "ON" ]]; then MPI_STR=-DGINKGO_MPI_EXEC_SUFFIX=${MPI_SUFFIX}; fi + - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER} - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX} - -GNinja - -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -GNinja -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR} + ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} + -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE} -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA} - -DGINKGO_BUILD_HIP=${BUILD_HIP} + -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL} -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR} -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} + -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION} @@ -94,9 +97,10 @@ -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES} -DGINKGO_EXPORT_BUILD_DIR=${EXPORT_BUILD_DIR} - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install + - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr - | (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1 - - ctest -V --timeout 6000 + - ctest --output-on-failure --timeout 6000 ${CTEST_EXTRA_ARGS} - ninja test_install - pushd test/test_install - ninja install @@ -148,7 +152,7 @@ - cd ${CI_JOB_NAME/test/build} - | (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1 - - ctest -V --timeout 6000 + - ctest --output-on-failure --timeout 6000 ${CTEST_EXTRA_ARGS} - ninja 
test_install - pushd test/test_install - ninja install diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml index 6ae62b8c899..6c75d60d069 100644 --- a/.gitlab/variables.yml +++ b/.gitlab/variables.yml @@ -11,8 +11,11 @@ BUILD_OMP: "OFF" BUILD_CUDA: "OFF" BUILD_HIP: "OFF" + BUILD_SYCL: "OFF" BUILD_HWLOC: "ON" + BUILD_PAPI_SDE: "OFF" BUILD_MPI: "OFF" + GKO_COMPILER_FLAGS: "" MPI_AS_ROOT: "OFF" FAST_TESTS: "OFF" NONDEFAULT_STREAM: "OFF" diff --git a/ABOUT-LICENSING.md b/ABOUT-LICENSING.md index df081e2211b..d6e68911d1a 100644 --- a/ABOUT-LICENSING.md +++ b/ABOUT-LICENSING.md @@ -76,7 +76,7 @@ the following license: When compiling Ginkgo with `-DGINKGO_BUILD_BENCHMARKS=ON` the build system will download, build, and link [gflags](https://github.com/gflags/gflags) and -[RapidJSON](https://github.com/Tencent/rapidjson) with the +[nlohmann-json](https://github.com/nlohmann/json) with the benchmark suites. gtest is available under the following license: > Copyright (c) 2006, Google Inc. @@ -108,110 +108,22 @@ benchmark suites. gtest is available under the following license: > (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -RapidJSON is available under the following license (note that Ginkgo's build -system automatically removes the `bin/jsonchecker/` directory which is licensed -under the problematic JSON license): +nlohmann-json is available under the following license: -> Tencent is pleased to support the open source community by making RapidJSON -> available. -> -> Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All -> rights reserved. -> -> If you have downloaded a copy of the RapidJSON binary from Tencent, please -> note that the RapidJSON binary is licensed under the MIT License. 
If you have -> downloaded a copy of the RapidJSON source code from Tencent, please note that -> RapidJSON source code is licensed under the MIT License, except for the -> third-party components listed below which are subject to different license -> terms. Your integration of RapidJSON into your own projects may require -> compliance with the MIT License, as well as the other licenses applicable to -> the third-party components included within RapidJSON. To avoid the problematic -> JSON license in your own projects, it's sufficient to exclude the -> bin/jsonchecker/ directory, as it's the only code under the JSON license. A -> copy of the MIT License is included in this file. -> -> Other dependencies and licenses: -> -> Open Source Software Licensed Under the BSD License: -> -------------------------------------------------------------------- -> -> The msinttypes r29 -> -> Copyright (c) 2006-2013 Alexander Chemeris -> All rights reserved. -> -> Redistribution and use in source and binary forms, with or without -> modification, are permitted provided that the following conditions are met: -> -> * Redistributions of source code must retain the above copyright notice, this -> list of conditions and the following disclaimer. -> * Redistributions in binary form must reproduce the above copyright notice, -> this list of conditions and the following disclaimer in the documentation -> and/or other materials provided with the distribution. -> * Neither the name of copyright holder nor the names of its contributors may -> be used to endorse or promote products derived from this software without -> specific prior written permission. -> -> THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY -> EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -> WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -> DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY -> DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -> (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -> LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -> ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -> (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -> SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -> -> Open Source Software Licensed Under the JSON License: -> -------------------------------------------------------------------- -> -> json.org -> Copyright (c) 2002 -> JSON.org All Rights Reserved. -> -> JSON_checker -> Copyright (c) 2002 JSON.org -> All Rights Reserved. -> -> -> Terms of the JSON License: -> --------------------------------------------------- -> -> Permission is hereby granted, free of charge, to any person obtaining a copy -> of this software and associated documentation files (the "Software"), to deal -> in the Software without restriction, including without limitation the rights -> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -> copies of the Software, and to permit persons to whom the Software is -> furnished to do so, subject to the following conditions: -> -> The above copyright notice and this permission notice shall be included in all -> copies or substantial portions of the Software. -> -> The Software shall be used for Good, not Evil. -> -> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -> SOFTWARE. -> -> -> Terms of the MIT License: -> -------------------------------------------------------------------- -> +> MIT License +> +> Copyright (c) 2013-2022 Niels Lohmann +> > Permission is hereby granted, free of charge, to any person obtaining a copy > of this software and associated documentation files (the "Software"), to deal > in the Software without restriction, including without limitation the rights > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > copies of the Software, and to permit persons to whom the Software is > furnished to do so, subject to the following conditions: -> + > The above copyright notice and this permission notice shall be included in all > copies or substantial portions of the Software. -> +> > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -220,7 +132,6 @@ under the problematic JSON license): > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > SOFTWARE. - For generating the documentation of Ginkgo, some scripts from the deal.II library are used. You can refer to the `doc/` folder to see which files are a modified version of deal.II's documentation generation scripts. Additionally, diff --git a/CHANGELOG.md b/CHANGELOG.md index 34d53363898..90834b209dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,121 @@ git log --first-parent Please visit our wiki [Changelog](https://github.com/ginkgo-project/ginkgo/wiki/Changelog) for unreleased changes. 
+## Version 1.7.0 + +The Ginkgo team is proud to announce the new Ginkgo minor release 1.7.0. This release brings new features such as: +- Complete GPU-resident sparse direct solvers feature set and interfaces, +- Improved Cholesky factorization performance, +- A new MC64 reordering, +- Batched iterative solver support with the BiCGSTAB solver with batched Dense and ELL matrix types, +- MPI support for the SYCL backend, +- Improved ParILU(T)/ParIC(T) preconditioner convergence, +and more! + +If you face an issue, please first check our [known issues page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues) and the [open issues list](https://github.com/ginkgo-project/ginkgo/issues) and if you do not find a solution, feel free to [open a new issue](https://github.com/ginkgo-project/ginkgo/issues/new/choose) or ask a question using the [github discussions](https://github.com/ginkgo-project/ginkgo/discussions). + +Supported systems and requirements: ++ For all platforms, CMake 3.16+ ++ C++14 compliant compiler ++ Linux and macOS + + GCC: 5.5+ + + clang: 3.9+ + + Intel compiler: 2019+ + + Apple Clang: 14.0 is tested. Earlier versions might also work. + + NVHPC: 22.7+ + + Cray Compiler: 14.0.1+ + + CUDA module: CMake 3.18+, and CUDA 10.1+ or NVHPC 22.7+ + + HIP module: ROCm 4.5+ + + DPC++ module: Intel oneAPI 2022.1+ with oneMKL and oneDPL. Set the CXX compiler to `dpcpp` or `icpx`. + + MPI: standard version 3.1+, ideally GPU Aware, for best performance ++ Windows + + MinGW: GCC 5.5+ + + Microsoft Visual Studio: VS 2019+ + + CUDA module: CUDA 10.1+, Microsoft Visual Studio + + OpenMP module: MinGW. 
+ +### Version support changes + ++ CUDA 9.2 is no longer supported and 10.0 is untested [#1382](https://github.com/ginkgo-project/ginkgo/pull/1382) ++ Ginkgo now requires CMake version 3.16 (and 3.18 for CUDA) [#1368](https://github.com/ginkgo-project/ginkgo/pull/1368) + +### Interface changes + ++ `const` Factory parameters can no longer be modified through `with_*` functions, as this breaks const-correctness [#1336](https://github.com/ginkgo-project/ginkgo/pull/1336) [#1439](https://github.com/ginkgo-project/ginkgo/pull/1439) + +### New Deprecations + ++ The `device_reset` parameter of CUDA and HIP executors no longer has an effect, and their `allocation_mode` parameters have been deprecated in favor of the `Allocator` interface. [#1315](https://github.com/ginkgo-project/ginkgo/pull/1315) ++ The CMake parameter `GINKGO_BUILD_DPCPP` has been deprecated in favor of `GINKGO_BUILD_SYCL`. [#1350](https://github.com/ginkgo-project/ginkgo/pull/1350) ++ The `gko::reorder::Rcm` interface has been deprecated in favor of `gko::experimental::reorder::Rcm` based on `Permutation`. [#1418](https://github.com/ginkgo-project/ginkgo/pull/1418) ++ The Permutation class' `permute_mask` functionality has been deprecated. [#1415](https://github.com/ginkgo-project/ginkgo/pull/1415) ++ Multiple functions with typos in their names (`set_complex_subpsace()`, range functions such as `conj_operaton`, etc.) have been deprecated. [#1348](https://github.com/ginkgo-project/ginkgo/pull/1348) + +### Summary of previous deprecations ++ `gko::lend()` is not necessary anymore. ++ The classes `RelativeResidualNorm` and `AbsoluteResidualNorm` are deprecated in favor of `ResidualNorm`. ++ The class `AmgxPgm` is deprecated in favor of `Pgm`. ++ Default constructors for the CSR `load_balance` and `automatical` strategies ++ The PolymorphicObject's move-semantic `copy_from` variant ++ The templated `SolverBase` class. ++ The class `MachineTopology` is deprecated in favor of `machine_topology`. 
++ Logger constructors and create functions with the `executor` parameter. ++ The virtual, protected, Dense functions `compute_norm1_impl`, `add_scaled_impl`, etc. ++ Logger events for solvers and criteria without the additional `implicit_tau_sq` parameter. ++ The global `gko::solver::default_krylov_dim`, use `gko::solver::gmres_default_krylov_dim` instead. + +### Added features + ++ Adds a batch::BatchLinOp class that forms a base class for batched linear operators such as batched matrix formats, solvers and preconditioners [#1379](https://github.com/ginkgo-project/ginkgo/pull/1379) ++ Adds a batch::MultiVector class that enables operations such as dot, norm, scale on batched vectors [#1371](https://github.com/ginkgo-project/ginkgo/pull/1371) ++ Adds a batch::Dense matrix format that stores batched dense matrices and provides gemv operations for these dense matrices. [#1413](https://github.com/ginkgo-project/ginkgo/pull/1413) ++ Adds a batch::Ell matrix format that stores batched Ell matrices and provides spmv operations for these batched Ell matrices. [#1416](https://github.com/ginkgo-project/ginkgo/pull/1416) [#1437](https://github.com/ginkgo-project/ginkgo/pull/1437) ++ Add a batch::Bicgstab solver (class, core, and reference kernels) that enables iterative solution of batched linear systems [#1438](https://github.com/ginkgo-project/ginkgo/pull/1438). ++ Add device kernels (CUDA, HIP, and DPCPP) for the batch::Bicgstab solver. [#1443](https://github.com/ginkgo-project/ginkgo/pull/1443) 
++ New MC64 reordering algorithm which optimizes the diagonal product or sum of a matrix by permuting the rows, and computes additional scaling factors for equilibration [#1120](https://github.com/ginkgo-project/ginkgo/pull/1120) ++ New interface for (non-symmetric) permutation and scaled permutation of Dense and Csr matrices [#1415](https://github.com/ginkgo-project/ginkgo/pull/1415) ++ LU and Cholesky Factorizations can now be separated into their factors [#1432](https://github.com/ginkgo-project/ginkgo/pull/1432) ++ New symbolic LU factorization algorithm that is optimized for matrices with an almost-symmetric sparsity pattern [#1445](https://github.com/ginkgo-project/ginkgo/pull/1445) ++ Sorting kernels for SparsityCsr on all backends [#1343](https://github.com/ginkgo-project/ginkgo/pull/1343) ++ Allow passing pre-generated local solver as factory parameter for the distributed Schwarz preconditioner [#1426](https://github.com/ginkgo-project/ginkgo/pull/1426) ++ Add DPCPP kernels for Partition [#1034](https://github.com/ginkgo-project/ginkgo/pull/1034), and CSR's `check_diagonal_entries` and `add_scaled_identity` functionality [#1436](https://github.com/ginkgo-project/ginkgo/pull/1436) ++ Adds a helper function to create a partition based on either local sizes, or local ranges [#1227](https://github.com/ginkgo-project/ginkgo/pull/1227) ++ Add function to compute arithmetic mean of dense and distributed vectors [#1275](https://github.com/ginkgo-project/ginkgo/pull/1275) ++ Adds `icpx` compiler support [#1350](https://github.com/ginkgo-project/ginkgo/pull/1350) ++ All backends can be built simultaneously [#1333](https://github.com/ginkgo-project/ginkgo/pull/1333) ++ Emits a CMake warning in downstream projects that use different compilers than the installed Ginkgo [#1372](https://github.com/ginkgo-project/ginkgo/pull/1372) ++ Reordering algorithms in sparse_blas benchmark [#1354](https://github.com/ginkgo-project/ginkgo/pull/1354) ++ Benchmarks gained an
`-allocator` parameter to specify device allocators [#1385](https://github.com/ginkgo-project/ginkgo/pull/1385) ++ Benchmarks gained an `-input_matrix` parameter that initializes the input JSON based on the filename [#1387](https://github.com/ginkgo-project/ginkgo/pull/1387) ++ Benchmark inputs can now be reordered as a preprocessing step [#1408](https://github.com/ginkgo-project/ginkgo/pull/1408) + + +### Improvements + ++ Significantly improve Cholesky factorization performance [#1366](https://github.com/ginkgo-project/ginkgo/pull/1366) ++ Improve parallel build performance [#1378](https://github.com/ginkgo-project/ginkgo/pull/1378) ++ Allow constrained parallel test execution using CTest resources [#1373](https://github.com/ginkgo-project/ginkgo/pull/1373) ++ Use arithmetic type more inside mixed precision ELL [#1414](https://github.com/ginkgo-project/ginkgo/pull/1414) ++ Most factory parameters of factory type no longer need to be constructed explicitly via `.on(exec)` [#1336](https://github.com/ginkgo-project/ginkgo/pull/1336) [#1439](https://github.com/ginkgo-project/ginkgo/pull/1439) ++ Improve ParILU(T)/ParIC(T) convergence by using more appropriate atomic operations [#1434](https://github.com/ginkgo-project/ginkgo/pull/1434) + +### Fixes + ++ Fix an over-allocation for OpenMP reductions [#1369](https://github.com/ginkgo-project/ginkgo/pull/1369) ++ Fix DPCPP's common-kernel reduction for empty input sizes [#1362](https://github.com/ginkgo-project/ginkgo/pull/1362) ++ Fix several typos in the API and documentation [#1348](https://github.com/ginkgo-project/ginkgo/pull/1348) ++ Fix inconsistent `Threads` between generations [#1388](https://github.com/ginkgo-project/ginkgo/pull/1388) ++ Fix benchmark median condition [#1398](https://github.com/ginkgo-project/ginkgo/pull/1398) ++ Fix HIP 5.6.0 compilation [#1411](https://github.com/ginkgo-project/ginkgo/pull/1411) ++ Fix missing destruction of rand_generator from cuda/hip 
[#1417](https://github.com/ginkgo-project/ginkgo/pull/1417) ++ Fix PAPI logger destruction order [#1419](https://github.com/ginkgo-project/ginkgo/pull/1419) ++ Fix TAU logger compilation [#1422](https://github.com/ginkgo-project/ginkgo/pull/1422) ++ Fix relative criterion to not iterate if the residual is already zero [#1079](https://github.com/ginkgo-project/ginkgo/pull/1079) ++ Fix memory_order invocations with C++20 changes [#1402](https://github.com/ginkgo-project/ginkgo/pull/1402) ++ Fix `check_diagonal_entries_exist` to report correctly when the only missing diagonal values are in the last rows. [#1440](https://github.com/ginkgo-project/ginkgo/pull/1440) ++ Fix checking OpenMPI version in cross-compilation settings [#1446](https://github.com/ginkgo-project/ginkgo/pull/1446) ++ Fix false-positive deprecation warnings in Ginkgo, especially for the old Rcm (it doesn't emit deprecation warnings anymore as a result but is still considered deprecated) [#1444](https://github.com/ginkgo-project/ginkgo/pull/1444) + ## Version 1.6.0 The Ginkgo team is proud to announce the new Ginkgo minor release 1.6.0. This release brings new features such as: @@ -215,7 +330,7 @@ Supported systems and requirements: + Add reduce_add for arrays ([#831](https://github.com/ginkgo-project/ginkgo/pull/831)) + Add utility to simplify Dense View creation from an existing Dense vector ([#1136](https://github.com/ginkgo-project/ginkgo/pull/1136)).
+ Add a custom transpose implementation for Fbcsr and Csr transpose for unsupported vendor types ([#1123](https://github.com/ginkgo-project/ginkgo/pull/1123)) -+ Make IDR random initilization deterministic ([#1116](https://github.com/ginkgo-project/ginkgo/pull/1116)) ++ Make IDR random initialization deterministic ([#1116](https://github.com/ginkgo-project/ginkgo/pull/1116)) + Move the algorithm choice for triangular solvers from Csr::strategy_type to a factory parameter ([#1088](https://github.com/ginkgo-project/ginkgo/pull/1088)) + Update CUDA archCoresPerSM ([#1175](https://github.com/ginkgo-project/ginkgo/pull/1116)) + Add kernels for Csr sparsity pattern lookup ([#994](https://github.com/ginkgo-project/ginkgo/pull/994)) @@ -620,7 +735,7 @@ page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues). ### Additions -+ Upper and lower triangular solvers ([#327](https://github.com/ginkgo-project/ginkgo/issues/327), [#336](https://github.com/ginkgo-project/ginkgo/issues/336), [#341](https://github.com/ginkgo-project/ginkgo/issues/341), [#342](https://github.com/ginkgo-project/ginkgo/issues/342)) ++ Upper and lower triangular solvers ([#327](https://github.com/ginkgo-project/ginkgo/issues/327), [#336](https://github.com/ginkgo-project/ginkgo/issues/336), [#341](https://github.com/ginkgo-project/ginkgo/issues/341), [#342](https://github.com/ginkgo-project/ginkgo/issues/342)) + New factorization support in Ginkgo, and addition of the ParILU algorithm ([#305](https://github.com/ginkgo-project/ginkgo/issues/305), [#315](https://github.com/ginkgo-project/ginkgo/issues/315), [#319](https://github.com/ginkgo-project/ginkgo/issues/319), [#324](https://github.com/ginkgo-project/ginkgo/issues/324)) + New ILU preconditioner ([#348](https://github.com/ginkgo-project/ginkgo/issues/348), [#353](https://github.com/ginkgo-project/ginkgo/issues/353)) @@ -632,7 +747,7 @@ page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues). 
+ Allow benchmarking CuSPARSE spmv formats through Ginkgo's benchmarks ([#303](https://github.com/ginkgo-project/ginkgo/issues/303)) + New benchmark for sparse matrix format conversions ([#312](https://github.com/ginkgo-project/ginkgo/issues/312)[#317](https://github.com/ginkgo-project/ginkgo/issues/317)) + Add conversions between CSR and Hybrid formats ([#302](https://github.com/ginkgo-project/ginkgo/issues/302), [#310](https://github.com/ginkgo-project/ginkgo/issues/310)) -+ Support for sorting rows in the CSR format by column idices ([#322](https://github.com/ginkgo-project/ginkgo/issues/322)) ++ Support for sorting rows in the CSR format by column indices ([#322](https://github.com/ginkgo-project/ginkgo/issues/322)) + Addition of a CUDA COO SpMM kernel for improved performance ([#345](https://github.com/ginkgo-project/ginkgo/issues/345)) + Addition of a LinOp to handle perturbations of the form (identity + scalar * basis * projector) ([#334](https://github.com/ginkgo-project/ginkgo/issues/334)) @@ -847,7 +962,7 @@ Ginkgo 1.0.0 is brought to you by: **Karlsruhe Institute of Technology**, Germany **Universitat Jaume I**, Spain -**University of Tennessee, Knoxville**, US +**University of Tennessee, Knoxville**, US These universities, along with various project grants, supported the development team and provided resources needed for the development of Ginkgo. 
@@ -859,7 +974,7 @@ Ginkgo 1.0.0 contains contributions from: **Goran Flegar**, Universitat Jaume I **Fritz Göbel**, Karlsruhe Institute of Technology **Thomas Grützmacher**, Karlsruhe Institute of Technology -**Pratik Nayak**, Karlsruhe Institue of Technologgy +**Pratik Nayak**, Karlsruhe Institute of Technology **Tobias Ribizel**, Karlsruhe Institute of Technology **Yuhsiang Tsai**, National Taiwan University @@ -869,11 +984,11 @@ Supporting materials are provided by the following individuals: **Frithjof Fleischhammer** - the Ginkgo website The development team is grateful to the following individuals for discussions and comments: - + **Erik Boman** **Jelena Držaić** **Mike Heroux** **Mark Hoemmen** -**Timo Heister** +**Timo Heister** **Jens Saak** diff --git a/CMakeLists.txt b/CMakeLists.txt index d7940e7f40b..e4ffbc4efd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,20 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.16) -# Use *_ROOT environment variables for find_package calls -cmake_policy(SET CMP0074 NEW) - -# Let CAS handle the CUDA architecture flags (for now) -# Windows still gives CMP0104 warning if putting it in cuda. -if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - cmake_policy(SET CMP0104 OLD) -endif() - -project(Ginkgo LANGUAGES C CXX VERSION 1.6.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") +project(Ginkgo LANGUAGES C CXX VERSION 1.7.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "master") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) +# Cuda and Hip also look for Threads. Set it before any find_package to ensure the Threads setting is not changed. 
+set(THREADS_PREFER_PTHREAD_FLAG ON) # Determine which modules can be compiled include(cmake/hip_path.cmake) include(cmake/autodetect_executors.cmake) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/") +include(cmake/autodetect_system_libs.cmake) + +# rename helper +include(cmake/rename.cmake) # Ginkgo configuration options option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF) @@ -27,8 +24,9 @@ option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON) option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON) option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP}) option(GINKGO_BUILD_MPI "Compile the MPI module" ${GINKGO_HAS_MPI}) -option(GINKGO_BUILD_DPCPP - "Compile DPC++ kernels for Intel GPUs or other DPC++ enabled hardware" ${GINKGO_HAS_DPCPP}) +gko_rename_cache(GINKGO_BUILD_DPCPP GINKGO_BUILD_SYCL BOOL "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware") +option(GINKGO_BUILD_SYCL + "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware" ${GINKGO_HAS_SYCL}) option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP}) option(GINKGO_BUILD_DOC "Generate documentation" OFF) @@ -56,7 +54,7 @@ set(GINKGO_VERBOSE_LEVEL "1" CACHE STRING if(MSVC) set(GINKGO_COMPILER_FLAGS "" CACHE STRING "Set the required CXX compiler flags, mainly used for warnings. Current default is ``") -elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp") +elseif(GINKGO_BUILD_SYCL OR CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") # For now always use `-ffp-model=precise` with DPC++. This can be removed when # the floating point issues are fixed. set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise" CACHE STRING @@ -68,8 +66,7 @@ endif() set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING "Set the required NVCC compiler flags, mainly used for warnings. 
Current default is an empty string") set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING - "A list of target NVIDIA GPU achitectures. See README.md for more detail.") -option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) + "A list of target NVIDIA GPU architectures. See README.md for more detail.") # the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING @@ -80,19 +77,24 @@ set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING "Set the required HIP CLANG compiler flags. Current default is an empty string.") set(GINKGO_HIP_AMDGPU "" CACHE STRING "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).") +option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) +mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON) if(MSVC OR WIN32 OR CYGWIN OR APPLE) option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" OFF) else() - option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) + option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Enabled if a system installation is found." 
${HWLOC_FOUND}) endif() +option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND}) option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF) +set(GINKGO_CI_TEST_OMP_PARALLELISM "4" CACHE STRING + "The number of OpenMP threads to use for a test binary during CTest resource file-constrained test.") # load executor-specific configuration if(GINKGO_BUILD_CUDA) @@ -101,10 +103,13 @@ endif() if(GINKGO_BUILD_HIP) include(cmake/hip.cmake) endif() +if(GINKGO_BUILD_SYCL) + include(cmake/sycl.cmake) +endif() if(GINKGO_BUILD_OMP) find_package(OpenMP 3.0 REQUIRED) endif() -set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) include(cmake/build_type_helpers.cmake) @@ -197,13 +202,6 @@ endif() include(CheckIncludeFileCXX) check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H) -# Automatically find PAPI and search for the required 'sde' component -set(GINKGO_HAVE_PAPI_SDE 0) -find_package(PAPI OPTIONAL_COMPONENTS sde) -if(PAPI_sde_FOUND) - set(GINKGO_HAVE_PAPI_SDE 1) -endif() - # Automatically find TAU set(GINKGO_HAVE_TAU 0) find_package(PerfStubs QUIET) @@ -233,12 +231,6 @@ if(GINKGO_BUILD_HWLOC AND (MSVC OR WIN32 OR CYGWIN OR APPLE)) set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "Build Ginkgo with HWLOC. Default is OFF. 
Ginkgo does not support HWLOC on Windows/MacOS" FORCE) message(WARNING "Ginkgo does not support HWLOC on Windows/MacOS, switch GINKGO_BUILD_HWLOC to OFF") endif() -if(GINKGO_BUILD_HWLOC) - set(GINKGO_HAVE_HWLOC 1) -else() - set(GINKGO_HAVE_HWLOC 0) - message(STATUS "HWLOC is being forcibly switched off") -endif() set(GINKGO_HAVE_GPU_AWARE_MPI OFF) set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF) @@ -250,14 +242,21 @@ if(GINKGO_BUILD_MPI) set(GINKGO_HAVE_GPU_AWARE_MPI OFF) endif() - try_run(uses_openmpi gko_result_unused + # use try_compile instead of try_run to prevent cross-compiling issues + try_compile(uses_openmpi ${Ginkgo_BINARY_DIR} ${Ginkgo_SOURCE_DIR}/cmake/openmpi_test.cpp + COMPILE_DEFINITIONS -DCHECK_HAS_OPEN_MPI=1 LINK_LIBRARIES MPI::MPI_CXX - RUN_OUTPUT_VARIABLE openmpi_version ) if(uses_openmpi) - if(openmpi_version VERSION_LESS "4.1") + try_compile(valid_openmpi_version + ${Ginkgo_BINARY_DIR} + ${Ginkgo_SOURCE_DIR}/cmake/openmpi_test.cpp + COMPILE_DEFINITIONS -DCHECK_OPEN_MPI_VERSION=1 + LINK_LIBRARIES MPI::MPI_CXX + ) + if(NOT valid_openmpi_version) message(WARNING "OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed " "matrix class. To enable faster, non-blocking communication, consider updating your OpenMPI version or " @@ -268,17 +267,38 @@ if(GINKGO_BUILD_MPI) endif() # Try to find the third party packages before using our subdirectories -include(cmake/package_helpers.cmake) if(GINKGO_BUILD_TESTS) find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest endif() if(GINKGO_BUILD_BENCHMARKS) find_package(gflags 2.2.2 QUIET) - find_package(RapidJSON 1.1.0 QUIET) + find_package(nlohmann_json 3.9.1 QUIET) endif() + +# System provided, third party libraries (not bundled!) +set(GINKGO_HAVE_HWLOC 0) if(GINKGO_BUILD_HWLOC) - find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC + find_package(HWLOC 2.1) + if (HWLOC_FOUND) + set(GINKGO_HAVE_HWLOC 1) + else() + message(WARNING "HWLOC could not be found. 
HWLOC support will be disabled.") + set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "HWLOC support was disabled because a system package could not be found." FORCE) + endif() endif() + +set(GINKGO_HAVE_PAPI_SDE 0) +if(GINKGO_BUILD_PAPI_SDE) + find_package(PAPI 7.0.1.0 COMPONENTS sde) + if (PAPI_SDE_FOUND) + set(GINKGO_HAVE_PAPI_SDE 1) + else() + message(WARNING "PAPI (SDE) could not be found. PAPI_SDE support will be disabled.") + set(GINKGO_BUILD_PAPI_SDE OFF CACHE BOOL "PAPI_SDE support was disabled because a system package could not be found." FORCE) + endif() +endif() + +# Bundled third party libraries add_subdirectory(third_party) # Third-party tools and libraries if(MSVC) @@ -289,7 +309,7 @@ if(MSVC) endif() endif() -if(GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_SYCL) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION) ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION) else() @@ -312,7 +332,7 @@ endif() if(GINKGO_BUILD_HIP) add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs endif() -if(GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_SYCL) add_subdirectory(dpcpp) # High-performance DPC++ kernels endif() if(GINKGO_BUILD_OMP) @@ -505,3 +525,8 @@ else() FILE(READ ${PROJECT_BINARY_DIR}/minimal.log GINKGO_LOG_SUMMARY) endif() MESSAGE(STATUS "${GINKGO_LOG_SUMMARY}") + +# make sure no build files get committed accidentally +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/.gitignore) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/.gitignore "*") +endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1dd6f412876..8e2f3990aca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -312,7 +312,7 @@ Thus, contributors should be aware of the following rules for blank lines: However, simply calling function `f` from function `g` does not imply that `f` and `g` are "related". 2. Statements within structures / classes are separated with 1 blank line. 
- There are no blank lines betweeen the first / last statement in the + There are no blank lines between the first / last statement in the structure / class. 1. _exception_: there is no blank line between an access modifier (`private`, `protected`, `public`) and the following statement. _example_: diff --git a/INSTALL.md b/INSTALL.md index 5f788ed0e28..4da58010ba8 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -31,7 +31,7 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests to speed them up, default is `OFF`. * `-DGINKGO_BUILD_BENCHMARKS={ON, OFF}` builds Ginkgo's benchmarks - (will download gflags and rapidjson), default is `ON`. + (will download gflags and nlohmann-json), default is `ON`. * `-DGINKGO_BUILD_EXAMPLES={ON, OFF}` builds Ginkgo's examples, default is `ON` * `-DGINKGO_BUILD_EXTLIB_EXAMPLE={ON, OFF}` builds the interfacing example with deal.II, default is `OFF`. @@ -42,9 +42,10 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_BUILD_CUDA={ON, OFF}` builds optimized cuda versions of the kernels (requires CUDA), default is `ON` if a CUDA compiler could be detected, `OFF` otherwise. -* `-DGINKGO_BUILD_DPCPP={ON, OFF}` builds optimized DPC++ versions of the - kernels (requires `CMAKE_CXX_COMPILER` to be set to the `dpcpp` compiler). - The default is `ON` if `CMAKE_CXX_COMPILER` is a DPC++ compiler, `OFF` +* `-DGINKGO_BUILD_DPCPP={ON, OFF}` is deprecated. Please use `GINKGO_BUILD_SYCL` instead. +* `-DGINKGO_BUILD_SYCL={ON, OFF}` builds optimized SYCL versions of the + kernels (requires `CMAKE_CXX_COMPILER` to be set to the `dpcpp` or `icpx` compiler). + The default is `ON` if `CMAKE_CXX_COMPILER` is a SYCL compiler, `OFF` otherwise. 
* `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels (requires HIP), default is `ON` if an installation of HIP could be detected, @@ -205,7 +206,7 @@ packages can be turned off by disabling the relevant options. Test](https://github.com/google/googletest); + GINKGO_BUILD_BENCHMARKS=ON: For argument management we use [gflags](https://github.com/gflags/gflags) and for JSON parsing we use - [RapidJSON](https://github.com/Tencent/rapidjson); + [nlohmann-json](https://github.com/nlohmann/json); + GINKGO_DEVEL_TOOLS=ON: [git-cmake-format](https://github.com/gflegar/git-cmake-format) is our CMake helper for code formatting. @@ -224,7 +225,7 @@ packages can be turned off by disabling the relevant options. Ginkgo attempts to use pre-installed versions of these package if they match version requirements using `find_package`. Otherwise, the configuration step will download the files for each of the packages `GTest`, `gflags`, -`RapidJSON` and `hwloc` and build them internally. +`nlohmann-json` and `hwloc` and build them internally. Note that, if the external packages were not installed to the default location, the CMake option `-DCMAKE_PREFIX_PATH=` needs to be set to the diff --git a/README.md b/README.md index be865e933f2..d873026a34f 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Ginkgo is a high-performance linear algebra library for manycore systems, with a focus on the solution of sparse linear systems. It is implemented using modern C++ (you will need an at least C++14 compliant compiler to build it), with GPU kernels -implemented in CUDA, HIP, and DPC++. +implemented in CUDA, HIP, and DPC++(SYCL). 
Performance @@ -36,7 +36,7 @@ Prerequisites For Ginkgo core library: -* _cmake 3.13+_ +* _cmake 3.16+_ * C++14 compliant compiler, one of: * _gcc 5.5+_ * _clang 3.9+_ @@ -47,7 +47,8 @@ For Ginkgo core library: The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 9.2+_ or _NVHPC Package 22.7+_ +* _cmake 3.18+_ (If CUDA was installed through the NVIDIA HPC Toolkit, we require _cmake 3.22+_) +* _CUDA 10.1+_ or _NVHPC Package 22.7+_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) @@ -58,13 +59,13 @@ The Ginkgo HIP module has the following __additional__ requirements: * _ROCm 4.5+_ * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: * _AMD_ backend (using the `clang` compiler) - * _9.2 <= CUDA < 11_ backend + * _10.1 <= CUDA < 11_ backend * if the hipFFT package is available, it is used to implement the FFT LinOps. -The Ginkgo DPC++ module has the following __additional__ requirements: +The Ginkgo DPC++(SYCL) module has the following __additional__ requirements: -* _OneAPI 2021.3+_ -* Set `dpcpp` as the `CMAKE_CXX_COMPILER` +* _oneAPI 2022.1+_ +* Set `dpcpp` or `icpx` as the `CMAKE_CXX_COMPILER` * `c++17` is used to compile Ginkgo * The following oneAPI packages should be available: * oneMKL @@ -90,7 +91,7 @@ following: The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 9.2+_ +* _CUDA 10.1+_ * _Microsoft Visual Studio_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the @@ -122,7 +123,7 @@ cmake -G "Unix Makefiles" .. && make By default, `GINKGO_BUILD_REFERENCE` is enabled. You should be able to run examples with this executor. 
By default, Ginkgo tries to enable the relevant modules depending on your machine environment (present of CUDA, ...). You can -also explicitly compile with the OpenMP, CUDA, HIP or DPC++ modules enabled to +also explicitly compile with the OpenMP, CUDA, HIP or DPC++(SYCL) modules enabled to run the examples with these executors. Please refer to the [Installation page](./INSTALL.md) for more details. diff --git a/accessor/accessor_helper.hpp b/accessor/accessor_helper.hpp index 5ee536d28db..5b80f4e13d8 100644 --- a/accessor/accessor_helper.hpp +++ b/accessor/accessor_helper.hpp @@ -78,7 +78,7 @@ struct row_major_helper_s { const std::array 1 ? total_dim - 1 : 0)>& stride, IndexType first, Indices&&... idxs) { - // The ASSERT size check must NOT be indexed with `dim_idx` directy, + // The ASSERT size check must NOT be indexed with `dim_idx` directly, // otherwise, it leads to a linker error. The reason is likely that // `std::array::operator[](const size_type &)` uses a // reference. Since `dim_idx` is constexpr (and not defined in a diff --git a/accessor/row_major.hpp b/accessor/row_major.hpp index 757110f4912..9026cef2116 100644 --- a/accessor/row_major.hpp +++ b/accessor/row_major.hpp @@ -55,7 +55,7 @@ namespace acc { * constructor parameters for this class to the range (it will forward it to * this class). * - * @warning For backward compatability reasons, a specialization is provided + * @warning For backward compatibility reasons, a specialization is provided * for dimensionality == 2. * * @tparam ValueType type of values this accessor returns diff --git a/accessor/utils.hpp b/accessor/utils.hpp index e692138ee4d..dfe30188f83 100644 --- a/accessor/utils.hpp +++ b/accessor/utils.hpp @@ -243,7 +243,7 @@ to_arithmetic_type(const Ref& ref) * @internal * Struct used for testing if an implicit cast is present. The constructor only * takes an OutType, so any argument of a type that is not implicitly - * convertable to OutType is incompatible. 
+ * convertible to OutType is incompatible. */ template struct test_for_implicit_cast { diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 040356f1666..347ecec7699 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -20,9 +20,7 @@ function(ginkgo_benchmark_cusparse_linops type def) endif() # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) - target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE}) - target_include_directories(cusparse_linops_${type} SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(cusparse_linops_${type} PRIVATE ALLOWMP=1) + target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) endfunction() function(ginkgo_benchmark_hipsparse_linops type def) @@ -48,7 +46,7 @@ endfunction() # Generates an executable for one precision. Each executable will be linked to -# `ginkgo`, `gflags` and `rapidjson`. +# `ginkgo`, `gflags` and `nlohmann-json`. 
# Note: This should only be used by `ginkgo_add_typed_benchmark_executables` # # \param name name for the executable to create (including type suffix) @@ -59,7 +57,7 @@ endfunction() # All remaining arguments will be treated as source files function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def type) add_executable("${name}" ${ARGN}) - target_link_libraries("${name}" ginkgo gflags rapidjson) + target_link_libraries("${name}" ginkgo gflags nlohmann_json::nlohmann_json) # always include the device timer if (GINKGO_BUILD_CUDA) target_compile_definitions("${name}" PRIVATE HAS_CUDA_TIMER=1) @@ -69,7 +67,7 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_definitions("${name}" PRIVATE HAS_HIP_TIMER=1) target_link_libraries("${name}" hip_timer) endif() - if (GINKGO_BUILD_DPCPP) + if (GINKGO_BUILD_SYCL) target_compile_definitions("${name}" PRIVATE HAS_DPCPP_TIMER=1) target_link_libraries("${name}" dpcpp_timer) endif() @@ -89,7 +87,7 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_definitions("${name}" PRIVATE HAS_HIP=1) target_link_libraries("${name}" hipsparse_linops_${type}) endif() - if (GINKGO_BUILD_DPCPP) + if (GINKGO_BUILD_SYCL) target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) target_link_libraries("${name}" onemkl_linops_${type}) endif() @@ -98,7 +96,7 @@ endfunction(ginkgo_add_single_benchmark_executable) # Generates an executable for each supported precision. Each executable will be -# linked to `ginkgo`, `gflags` and `rapidjson`. +# linked to `ginkgo`, `gflags` and `nlohmann-json`. 
# # \param name base-name for the executable to create # \param use_lib_linops Boolean indicating if linking against hipsparse/cusparse @@ -122,8 +120,7 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(cuda_timer utils/cuda_timer.cpp) - target_link_libraries(cuda_timer ginkgo ${CUDA_RUNTIME_LIBS}) - target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) + target_link_libraries(cuda_timer ginkgo CUDA::cudart) endif() if (GINKGO_BUILD_HIP) ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) @@ -137,12 +134,14 @@ if (GINKGO_BUILD_HIP) target_link_libraries(hip_timer ginkgo) endif() -if (GINKGO_BUILD_DPCPP) +if (GINKGO_BUILD_SYCL) ginkgo_benchmark_onemkl_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) ginkgo_benchmark_onemkl_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_onemkl_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_onemkl_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp) + target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS}) + gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp) target_link_libraries(dpcpp_timer ginkgo) endif() @@ -152,7 +151,7 @@ if (GINKGO_BUILD_MPI) endif() add_subdirectory(blas) -add_subdirectory(conversions) +add_subdirectory(conversion) add_subdirectory(matrix_generator) add_subdirectory(matrix_statistics) add_subdirectory(preconditioner) @@ -160,22 +159,14 @@ add_subdirectory(solver) add_subdirectory(sparse_blas) add_subdirectory(spmv) add_subdirectory(tools) +if (GINKGO_BUILD_TESTS) + add_subdirectory(test) +endif() -add_custom_target(make_run_all_benchmarks ALL) -file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/run_all_benchmarks.sh - DESTINATION ${CMAKE_CURRENT_BINARY_DIR} - FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ 
GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE) - -add_custom_command( - TARGET make_run_all_benchmarks POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/run_all_benchmarks.sh - ${CMAKE_CURRENT_BINARY_DIR}/run_all_benchmarks.sh) +configure_file(run_all_benchmarks.sh run_all_benchmarks.sh COPYONLY) add_custom_target(benchmark) add_custom_command( TARGET benchmark POST_BUILD COMMAND bash run_all_benchmarks.sh >/dev/null - DEPENDS make_run_all_benchmarks WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index ee2dc06d01b..f7ad8120a80 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -130,25 +130,17 @@ Parameters for a benchmark case are: stride_B: stride for B matrix in gemm (optional, default m) stride_C: stride for C matrix in gemm (optional, default m) )"; - std::string format = example_config; + std::string format = Generator::get_example_config(); initialize_argument_parsing(&argc, &argv, header, format); std::string extra_information = "The operations are " + FLAGS_operations; print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - std::cerr - << "Input has to be a JSON array of benchmark configurations:\n" - << format; - std::exit(1); - } + auto test_cases = json::parse(get_input_stream()); - run_blas_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), operation_map, - test_cases, true); + run_test_cases(BlasBenchmark{operation_map}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp index f36b7649ffc..1267dc57c15 100644 --- a/benchmark/blas/blas_common.hpp +++ 
b/benchmark/blas/blas_common.hpp @@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/general.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -70,14 +72,6 @@ DEFINE_string( "C has dimensions n x m and x and y have dimensions n x r"); -std::string example_config = R"( - [ - { "n": 100 }, - { "n": 200, "m": 200, "k": 200 } - ] -)"; - - class BenchmarkOperation { public: virtual ~BenchmarkOperation() = default; @@ -404,172 +398,129 @@ struct dimensions { }; -dimensions parse_dims(rapidjson::Value& test_case) -{ - auto get_optional = [](rapidjson::Value& obj, const char* name, - gko::size_type default_value) -> gko::size_type { - if (obj.HasMember(name)) { - return obj[name].GetUint64(); - } else { - return default_value; - } - }; - - dimensions result; - result.n = test_case["n"].GetInt64(); - result.k = get_optional(test_case, "k", result.n); - result.m = get_optional(test_case, "m", result.n); - result.r = get_optional(test_case, "r", 1); - if (test_case.HasMember("stride")) { - result.stride_x = test_case["stride"].GetInt64(); - result.stride_y = result.stride_x; - } else { - result.stride_x = get_optional(test_case, "stride_x", result.r); - result.stride_y = get_optional(test_case, "stride_y", result.r); +struct BlasBenchmark : Benchmark { + using map_type = + std::map( + std::shared_ptr, dimensions)>>; + map_type operation_map; + std::vector operations; + std::string name; + bool do_print; + + BlasBenchmark(map_type operation_map, bool do_print = true) + : operation_map{std::move(operation_map)}, + name{"blas"}, + operations{split(FLAGS_operations)}, + do_print{do_print} + {} + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + 
{ + return operations; } - result.stride_A = get_optional(test_case, "stride_A", result.k); - result.stride_B = get_optional(test_case, "stride_B", result.m); - result.stride_C = get_optional(test_case, "stride_C", result.m); - return result; -} + bool should_print() const override { return do_print; } -std::string describe(rapidjson::Value& test_case) -{ - std::stringstream ss; - auto optional_output = [&](const char* name) { - if (test_case.HasMember(name) && test_case[name].IsInt64()) { - ss << name << " = " << test_case[name].GetInt64() << " "; - } - }; - optional_output("n"); - optional_output("k"); - optional_output("m"); - optional_output("r"); - optional_output("stride"); - optional_output("stride_x"); - optional_output("stride_y"); - optional_output("stride_A"); - optional_output("stride_B"); - optional_output("stride_C"); - return ss.str(); -} + std::string get_example_config() const override + { + return json::parse(R"([{"n": 100}, {"n": 200, "m": 200, "k": 200}])") + .dump(4); + } + bool validate_config(const json& value) const override + { + return value.contains("n") && value["n"].is_number_integer(); + } + + std::string describe_config(const json& test_case) const override + { + std::stringstream ss; + auto optional_output = [&](const char* name) { + if (test_case.contains(name) && + test_case[name].is_number_integer()) { + ss << name << " = " << test_case[name].get() << " "; + } + }; + optional_output("n"); + optional_output("k"); + optional_output("m"); + optional_output("r"); + optional_output("stride"); + optional_output("stride_x"); + optional_output("stride_y"); + optional_output("stride_A"); + optional_output("stride_B"); + optional_output("stride_C"); + return ss.str(); + } + + dimensions setup(std::shared_ptr exec, + json& test_case) const override + { + auto get_optional = [](json& obj, const char* name, + gko::size_type default_value) -> gko::size_type { + if (obj.contains(name)) { + return obj[name].get(); + } else { + return 
default_value; + } + }; + + dimensions result; + result.n = test_case["n"].get(); + result.k = get_optional(test_case, "k", result.n); + result.m = get_optional(test_case, "m", result.n); + result.r = get_optional(test_case, "r", 1); + if (test_case.contains("stride")) { + result.stride_x = test_case["stride"].get(); + result.stride_y = result.stride_x; + } else { + result.stride_x = get_optional(test_case, "stride_x", result.r); + result.stride_y = get_optional(test_case, "stride_y", result.r); + } + result.stride_A = get_optional(test_case, "stride_A", result.k); + result.stride_B = get_optional(test_case, "stride_B", result.m); + result.stride_C = get_optional(test_case, "stride_C", result.m); + return result; + } -template -void apply_blas(const char* operation_name, std::shared_ptr exec, - std::shared_ptr timer, const OpMap& operation_map, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& blas_case = test_case["blas"]; - add_or_set_member(blas_case, operation_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - auto op = operation_map.at(operation_name)(exec, parse_dims(test_case)); + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, dimensions& dims, + const std::string& operation_name, + json& operation_case) const override + { + auto op = operation_map.at(operation_name)(exec, dims); IterationControl ic(timer); // warm run - for (auto _ : ic.warmup_run()) { - op->prepare(); - exec->synchronize(); - op->run(); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + op->prepare(); + exec->synchronize(); + op->run(); + exec->synchronize(); + } } // timed run op->prepare(); for (auto _ : ic.run()) { + auto range = annotate("repetition"); op->run(); } const auto runtime = ic.compute_time(FLAGS_timer_method); const auto flops = static_cast(op->get_flops()); const auto mem = static_cast(op->get_memory()); const 
auto repetitions = ic.get_num_repetitions(); - add_or_set_member(blas_case[operation_name], "time", runtime, - allocator); - add_or_set_member(blas_case[operation_name], "flops", flops / runtime, - allocator); - add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime, - allocator); - add_or_set_member(blas_case[operation_name], "repetitions", repetitions, - allocator); - - // compute and write benchmark data - add_or_set_member(blas_case[operation_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["blas"][operation_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["blas"][operation_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; + operation_case["time"] = runtime; + operation_case["flops"] = flops / runtime; + operation_case["bandwidth"] = mem / runtime; + operation_case["repetitions"] = repetitions; } -} - - -template -void run_blas_benchmarks(std::shared_ptr exec, - std::shared_ptr timer, - const OpMap& operation_map, - rapidjson::Document& test_cases, bool do_print) -{ - auto operations = split(FLAGS_operations, ','); - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - if (!test_case.HasMember("blas")) { - test_case.AddMember("blas", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& blas_case = test_case["blas"]; - if (!FLAGS_overwrite && - all_of(begin(operations), end(operations), - [&blas_case](const std::string& s) { - return blas_case.HasMember(s.c_str()); - })) { - continue; - } - if (do_print) { - 
std::clog << "Running test case: " << test_case << std::endl; - } - // annotate the test case - auto test_case_range = annotate(describe(test_case)); - for (const auto& operation_name : operations) { - { - auto operation_range = annotate(operation_name.c_str()); - apply_blas(operation_name.c_str(), exec, timer, - operation_map, test_case, allocator); - } - - if (do_print) { - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - - backup_results(test_cases); - } - } - } catch (const std::exception& e) { - std::cerr << "Error setting up benchmark, what(): " << e.what() - << std::endl; - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } -} +}; diff --git a/benchmark/blas/distributed/CMakeLists.txt b/benchmark/blas/distributed/CMakeLists.txt index 1371294efb8..a756b9c0071 100644 --- a/benchmark/blas/distributed/CMakeLists.txt +++ b/benchmark/blas/distributed/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(multi-vector-distributed "NO" multi_vector.cpp) +ginkgo_add_typed_benchmark_executables(multi_vector_distributed "NO" multi_vector.cpp) diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp index 4d3b821ed2e..fe5eea5a38c 100644 --- a/benchmark/blas/distributed/multi_vector.cpp +++ b/benchmark/blas/distributed/multi_vector.cpp @@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define GKO_BENCHMARK_DISTRIBUTED + + #include "benchmark/blas/blas_common.hpp" #include "benchmark/utils/general.hpp" #include "benchmark/utils/generator.hpp" @@ -50,6 +53,10 @@ int main(int argc, char* argv[]) { gko::experimental::mpi::environment mpi_env{argc, argv}; + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + const auto do_print = rank == 0; + std::string header = R"(" A benchmark for measuring performance of Ginkgo's BLAS-like " operations. 
@@ -60,26 +67,19 @@ Parameters for a benchmark case are: stride_x: stride for input vector x (optional, default r) stride_y: stride for in/out vector y (optional, default r) )"; - std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + std::string format = Generator::get_example_config(); + initialize_argument_parsing(&argc, &argv, header, format, do_print); - std::string extra_information = "The operations are " + FLAGS_operations; - print_general_information(extra_information); - - const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - const auto rank = comm.rank(); + if (do_print) { + std::string extra_information = + "The operations are " + FLAGS_operations; + print_general_information(extra_information); + } auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); std::string json_input = broadcast_json_input(get_input_stream(), comm); - rapidjson::Document test_cases; - test_cases.Parse(json_input.c_str()); - if (!test_cases.IsArray()) { - std::cerr - << "Input has to be a JSON array of benchmark configurations:\n" - << format; - std::exit(1); - } + auto test_cases = json::parse(json_input); std::map( @@ -127,10 +127,10 @@ Parameters for a benchmark case are: exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y); }}}; - run_blas_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer), - operation_map, test_cases, rank == 0); + run_test_cases(BlasBenchmark{operation_map, do_print}, exec, + get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases); - if (rank == 0) { - std::cout << test_cases << std::endl; + if (do_print) { + std::cout << std::setw(4) << test_cases << std::endl; } } diff --git a/benchmark/conversion/CMakeLists.txt b/benchmark/conversion/CMakeLists.txt new file mode 100644 index 00000000000..7ecf578c055 --- /dev/null +++ b/benchmark/conversion/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_add_typed_benchmark_executables(conversion "NO" conversion.cpp) diff --git 
a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp new file mode 100644 index 00000000000..e45046329d7 --- /dev/null +++ b/benchmark/conversion/conversion.cpp @@ -0,0 +1,207 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "benchmark/utils/formats.hpp" +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" +#include "benchmark/utils/generator.hpp" +#include "benchmark/utils/iteration_control.hpp" +#include "benchmark/utils/runner.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" + + +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + +using Generator = DefaultSystemGenerator<>; + + +struct ConversionBenchmark : Benchmark> { + std::string name; + std::vector operations; + + ConversionBenchmark() : name{"conversion"} + { + auto ref_exec = gko::ReferenceExecutor::create(); + auto formats = split(FLAGS_formats); + for (const auto& from_format : formats) { + operations.push_back(from_format + "-read"); + auto from_mtx = + formats::matrix_type_factory.at(from_format)(ref_exec); + // all pairs of conversions that are supported by Ginkgo + for (const auto& to_format : formats) { + if (from_format == to_format) { + continue; + } + auto to_mtx = + formats::matrix_type_factory.at(to_format)(ref_exec); + try { + to_mtx->copy_from(from_mtx); + operations.push_back(from_format + "-" + to_format); + } catch (const std::exception& e) { + } + } + } + } + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return operations; + } + + bool should_print() const override { return true; } + + std::string get_example_config() const override + { + return Generator::get_example_config(); + } + + bool validate_config(const json& test_case) const override + { + return Generator::validate_config(test_case); + } + + std::string describe_config(const json& test_case) const override + { + return 
Generator::describe_config(test_case); + } + + gko::device_matrix_data setup( + std::shared_ptr exec, json& test_case) const override + { + gko::matrix_data data; + data = Generator::generate_matrix_data(test_case); + // no reordering here, as it doesn't impact conversions beyond + // dense-sparse conversions + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + return gko::device_matrix_data::create_from_host(exec, + data); + } + + + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, + gko::device_matrix_data& data, + const std::string& operation_name, + json& operation_case) const override + { + auto split_it = + std::find(operation_name.begin(), operation_name.end(), '-'); + std::string from_name{operation_name.begin(), split_it}; + std::string to_name{split_it + 1, operation_name.end()}; + auto mtx_from = formats::matrix_type_factory.at(from_name)(exec); + auto readable = + gko::as>(mtx_from.get()); + IterationControl ic{timer}; + if (to_name == "read") { + // warm run + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + exec->synchronize(); + readable->read(data); + exec->synchronize(); + } + } + // timed run + for (auto _ : ic.run()) { + auto range = annotate("repetition"); + readable->read(data); + } + } else { + readable->read(data); + auto mtx_to = formats::matrix_type_factory.at(to_name)(exec); + + // warm run + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + exec->synchronize(); + mtx_to->copy_from(mtx_from); + exec->synchronize(); + } + } + // timed run + for (auto _ : ic.run()) { + auto range = annotate("repetition"); + mtx_to->copy_from(mtx_from); + } + } + operation_case["time"] = ic.compute_time(FLAGS_timer_method); + 
operation_case["repetitions"] = ic.get_num_repetitions(); + } +}; + + +int main(int argc, char* argv[]) +{ + std::string header = + "A benchmark for measuring performance of Ginkgo's conversions.\n"; + std::string format_str = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format_str); + + std::string extra_information = + std::string() + "The formats are " + FLAGS_formats; + print_general_information(extra_information); + + auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); + auto formats = split(FLAGS_formats, ','); + + auto test_cases = json::parse(get_input_stream()); + + run_test_cases(ConversionBenchmark{}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); + + std::cout << std::setw(4) << test_cases << std::endl; +} diff --git a/benchmark/conversions/CMakeLists.txt b/benchmark/conversions/CMakeLists.txt deleted file mode 100644 index 0e0893c3aec..00000000000 --- a/benchmark/conversions/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -ginkgo_add_typed_benchmark_executables(conversions "NO" conversions.cpp) diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp deleted file mode 100644 index ec7febf262f..00000000000 --- a/benchmark/conversions/conversions.cpp +++ /dev/null @@ -1,222 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. 
Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include - - -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "benchmark/utils/formats.hpp" -#include "benchmark/utils/general.hpp" -#include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" -#include "benchmark/utils/timer.hpp" -#include "benchmark/utils/types.hpp" - - -#ifdef GINKGO_BENCHMARK_ENABLE_TUNING -#include "benchmark/utils/tuning_variables.hpp" -#endif // GINKGO_BENCHMARK_ENABLE_TUNING - - -// This function supposes that management of `FLAGS_overwrite` is done before -// calling it -void convert_matrix(const gko::LinOp* matrix_from, const char* format_to, - const char* conversion_name, - std::shared_ptr exec, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& conversion_case = test_case["conversions"]; - add_or_set_member(conversion_case, conversion_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - - 
gko::matrix_data data{gko::dim<2>{1, 1}, 1}; - auto matrix_to = share(formats::matrix_factory(format_to, exec, data)); - - auto timer = get_timer(exec, FLAGS_gpu_timer); - IterationControl ic{timer}; - - // warm run - for (auto _ : ic.warmup_run()) { - exec->synchronize(); - matrix_to->copy_from(matrix_from); - exec->synchronize(); - matrix_to->clear(); - } - // timed run - for (auto _ : ic.run()) { - matrix_to->copy_from(matrix_from); - } - add_or_set_member(conversion_case[conversion_name], "time", - ic.compute_time(FLAGS_timer_method), allocator); - add_or_set_member(conversion_case[conversion_name], "repetitions", - ic.get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(conversion_case[conversion_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["conversions"][conversion_name], - "completed", false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["conversions"][conversion_name], - "error", msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - -int main(int argc, char* argv[]) -{ - std::string header = - "A benchmark for measuring performance of Ginkgo's conversions.\n"; - std::string format_str = example_config; - initialize_argument_parsing(&argc, &argv, header, format_str); - - std::string extra_information = - std::string() + "The formats are " + FLAGS_formats + "\n"; - print_general_information(extra_information); - - auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - auto formats = split(FLAGS_formats, ','); - - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } - - auto& allocator = test_cases.GetAllocator(); - auto 
profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - DefaultSystemGenerator<> generator{}; - - for (auto& test_case : test_cases.GetArray()) { - std::clog << "Benchmarking conversions. " << std::endl; - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("conversions")) { - test_case.AddMember("conversions", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& conversion_case = test_case["conversions"]; - - std::clog << "Running test case: " << test_case << std::endl; - gko::matrix_data data; - try { - data = generator.generate_matrix_data(test_case); - } catch (std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); - } - continue; - } - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << ")" << std::endl; - add_or_set_member(test_case, "size", data.size[0], allocator); - // annotate the test case - auto test_case_range = annotate(generator.describe_config(test_case)); - for (const auto& format_from : formats) { - try { - auto matrix_from = - share(formats::matrix_factory(format_from, exec, data)); - for (const auto& format_to : formats) { - if (format_from == format_to) { - continue; - } - auto conversion_name = - std::string(format_from) + "-" + format_to; - - if (!FLAGS_overwrite && - conversion_case.HasMember(conversion_name.c_str())) { - continue; - } - { - auto conversion_range = - annotate(conversion_name.c_str()); - convert_matrix(matrix_from.get(), format_to.c_str(), - conversion_name.c_str(), exec, test_case, - allocator); - } - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - } - backup_results(test_cases); - } catch (const 
gko::AllocationError& e) { - for (const auto& format : formats::matrix_type_factory) { - const auto format_to = std::get<0>(format); - auto conversion_name = - std::string(format_from) + "-" + format_to; - add_or_set_member( - test_case["conversions"][conversion_name.c_str()], - "completed", false, allocator); - } - std::cerr << "Error when allocating data for type " - << format_from << ". what(): " << e.what() - << std::endl; - backup_results(test_cases); - } catch (const std::exception& e) { - std::cerr << "Error when running benchmark, what(): " - << e.what() << std::endl; - } - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } - - std::cout << test_cases << std::endl; -} diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp index 138b5a9c2ce..193d95f897f 100644 --- a/benchmark/matrix_generator/matrix_generator.cpp +++ b/benchmark/matrix_generator/matrix_generator.cpp @@ -85,31 +85,33 @@ std::string input_format = // clang-format on -void validate_option_object(const rapidjson::Value& value) +void validate_option_object(const json& value) { - if (!value.IsObject() || !value.HasMember("filename") || - !value["filename"].IsString() || !value.HasMember("problem") || - !value["problem"].IsObject() || !value["problem"].HasMember("type") || - !value["problem"]["type"].IsString()) { + if (!value.is_object() || !value.contains("filename") || + !value["filename"].is_string() || !value.contains("problem") || + !value["problem"].is_object() || !value["problem"].contains("type") || + !value["problem"]["type"].is_string()) { print_config_error_and_exit(2); } } using generator_function = std::function( - rapidjson::Value&, std::default_random_engine&)>; + json&, std::default_random_engine&)>; // matrix generators gko::matrix_data generate_block_diagonal( - rapidjson::Value& config, std::default_random_engine& engine) + json& config, std::default_random_engine& engine) { - if 
(!config.HasMember("num_blocks") || !config["num_blocks"].IsUint() || - !config.HasMember("block_size") || !config["block_size"].IsUint()) { + if (!config.contains("num_blocks") || + !config["num_blocks"].is_number_unsigned() || + !config.contains("block_size") || + !config["block_size"].is_number_unsigned()) { print_config_error_and_exit(2); } - auto num_blocks = config["num_blocks"].GetUint(); - auto block_size = config["block_size"].GetUint(); + auto num_blocks = config["num_blocks"].get(); + auto block_size = config["block_size"].get(); auto block = gko::matrix_data( gko::dim<2>(block_size), std::uniform_real_distribution(-1.0, 1.0), engine); @@ -132,20 +134,18 @@ int main(int argc, char* argv[]) std::clog << gko::version_info::get() << std::endl; auto engine = get_engine(); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document configurations; - configurations.ParseStream(jcin); + auto configurations = json::parse(get_input_stream()); - if (!configurations.IsArray()) { + if (!configurations.is_array()) { print_config_error_and_exit(1); } - for (auto& config : configurations.GetArray()) { + for (auto& config : configurations) { try { validate_option_object(config); std::clog << "Generating matrix: " << config << std::endl; - auto filename = config["filename"].GetString(); - auto type = config["problem"]["type"].GetString(); + auto filename = config["filename"].get(); + auto type = config["problem"]["type"].get(); auto mdata = generator[type](config["problem"], engine); std::ofstream ofs(filename); gko::write_raw(ofs, mdata, gko::layout_type::coordinate); diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp index 45f21ca1e35..576d6fa7d52 100644 --- a/benchmark/matrix_statistics/matrix_statistics.cpp +++ b/benchmark/matrix_statistics/matrix_statistics.cpp @@ -38,9 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "benchmark/utils/general.hpp" +#include + + +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/types.hpp" @@ -51,9 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // See en.wikipedia.org/wiki/Five-number_summary // Quartile computation uses Method 3 from en.wikipedia.org/wiki/Quartile -void compute_summary(const std::vector& dist, - rapidjson::Value& out, - rapidjson::MemoryPoolAllocator<>& allocator) +void compute_summary(const std::vector& dist, json& out) { const auto q = dist.size() / 4; const auto r = dist.size() % 4; @@ -72,23 +73,14 @@ void compute_summary(const std::vector& dist, }; // clang-format on - add_or_set_member(out, "min", dist[0], allocator); - add_or_set_member( - out, "q1", - coefs[r][0] * static_cast(dist[positions[r][0]]) + - coefs[r][1] * static_cast(dist[positions[r][1]]), - allocator); - add_or_set_member( - out, "median", - coefs[r][2] * static_cast(dist[positions[r][2]]) + - coefs[r][3] * static_cast(dist[positions[r][3]]), - allocator); - add_or_set_member( - out, "q3", - coefs[r][4] * static_cast(dist[positions[r][4]]) + - coefs[r][5] * static_cast(dist[positions[r][5]]), - allocator); - add_or_set_member(out, "max", dist[dist.size() - 1], allocator); + out["min"] = dist.front(); + out["q1"] = coefs[r][0] * static_cast(dist[positions[r][0]]) + + coefs[r][1] * static_cast(dist[positions[r][1]]); + out["median"] = coefs[r][2] * static_cast(dist[positions[r][2]]) + + coefs[r][3] * static_cast(dist[positions[r][3]]); + out["q3"] = coefs[r][4] * static_cast(dist[positions[r][4]]) + + coefs[r][5] * static_cast(dist[positions[r][5]]); + out["max"] = dist.back(); } @@ -108,39 +100,30 @@ double compute_moment(int degree, const std::vector& dist, // See en.wikipedia.org/wiki/Moment_(mathematics) -void compute_moments(const std::vector& dist, - 
rapidjson::Value& out, - rapidjson::MemoryPoolAllocator<>& allocator) +void compute_moments(const std::vector& dist, json& out) { const auto mean = compute_moment(1, dist); - add_or_set_member(out, "mean", mean, allocator); + out["mean"] = mean; const auto variance = compute_moment(2, dist, mean); - add_or_set_member(out, "variance", variance, allocator); + out["variance"] = variance; const auto dev = std::sqrt(variance); - add_or_set_member(out, "skewness", compute_moment(3, dist, mean, dev), - allocator); - add_or_set_member(out, "kurtosis", compute_moment(4, dist, mean, dev), - allocator); - add_or_set_member(out, "hyperskewness", compute_moment(5, dist, mean, dev), - allocator); - add_or_set_member(out, "hyperflatness", compute_moment(6, dist, mean, dev), - allocator); + out["skewness"] = compute_moment(3, dist, mean, dev); + out["kurtosis"] = compute_moment(4, dist, mean, dev); + out["hyperskewness"] = compute_moment(5, dist, mean, dev); + out["hyperflatness"] = compute_moment(6, dist, mean, dev); } -template void compute_distribution_properties(const std::vector& dist, - rapidjson::Value& out, - Allocator& allocator) + json& out) { - compute_summary(dist, out, allocator); - compute_moments(dist, out, allocator); + compute_summary(dist, out); + compute_moments(dist, out); } -template void extract_matrix_statistics(gko::matrix_data& data, - rapidjson::Value& problem, Allocator& allocator) + json& problem) { std::vector row_dist(data.size[0]); std::vector col_dist(data.size[1]); @@ -149,72 +132,95 @@ void extract_matrix_statistics(gko::matrix_data& data, ++col_dist[v.column]; } - add_or_set_member(problem, "rows", data.size[0], allocator); - add_or_set_member(problem, "columns", data.size[1], allocator); - add_or_set_member(problem, "nonzeros", data.nonzeros.size(), allocator); + problem["rows"] = data.size[0]; + problem["columns"] = data.size[1]; + problem["nonzeros"] = data.nonzeros.size(); std::sort(begin(row_dist), end(row_dist)); - 
add_or_set_member(problem, "row_distribution", - rapidjson::Value(rapidjson::kObjectType), allocator); - compute_distribution_properties(row_dist, problem["row_distribution"], - allocator); + problem["row_distribution"] = json::object(); + compute_distribution_properties(row_dist, problem["row_distribution"]); std::sort(begin(col_dist), end(col_dist)); - add_or_set_member(problem, "col_distribution", - rapidjson::Value(rapidjson::kObjectType), allocator); - compute_distribution_properties(col_dist, problem["col_distribution"], - allocator); + problem["col_distribution"] = json::object(); + compute_distribution_properties(col_dist, problem["col_distribution"]); } -int main(int argc, char* argv[]) -{ - std::string header = - "A utility that collects additional statistical properties of the " - "matrix.\n"; - std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); +using Generator = DefaultSystemGenerator; - std::clog << gko::version_info::get() << std::endl; - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } +struct empty_state {}; + - auto& allocator = test_cases.GetAllocator(); +struct MatrixStatistics : Benchmark { + std::string name; + std::vector empty; - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("problem")) { - test_case.AddMember("problem", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& problem = test_case["problem"]; + MatrixStatistics() : name{"problem"} {} - std::clog << "Running test case: " << test_case << std::endl; + const std::string& get_name() const override { return name; } - auto matrix = - DefaultSystemGenerator::generate_matrix_data( - test_case); + const std::vector& get_operations() const override + { + return empty; + } - std::clog << "Matrix is 
of size (" << matrix.size[0] << ", " - << matrix.size[1] << ")" << std::endl; - add_or_set_member(test_case, "size", matrix.size[0], allocator); + bool should_print() const override { return true; } - extract_matrix_statistics(matrix, test_case["problem"], allocator); + std::string get_example_config() const override + { + return Generator::get_example_config(); + } - backup_results(test_cases); - } catch (const std::exception& e) { - std::cerr << "Error extracting statistics, what(): " << e.what() - << std::endl; - } + bool validate_config(const json& test_case) const override + { + return Generator::validate_config(test_case); } - std::cout << test_cases << std::endl; + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + empty_state setup(std::shared_ptr exec, + json& test_case) const override + { + auto data = Generator::generate_matrix_data(test_case); + // no reordering here, as it doesn't change statistics + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + + extract_matrix_statistics(data, test_case["problem"]); + return {}; + } + + + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, empty_state& data, + const std::string& operation_name, + json& operation_case) const override + {} +}; + + +int main(int argc, char* argv[]) +{ + std::string header = + "A utility that collects additional statistical properties of the " + "matrix.\n"; + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format); + + std::clog << gko::version_info::get() << std::endl; + + auto test_cases = json::parse(get_input_stream()); + auto exec = gko::ReferenceExecutor::create(); + + run_test_cases(MatrixStatistics{}, exec, 
get_timer(exec, false), + test_cases); + + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 281e64ddd76..d81dfaa4d5d 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -42,10 +42,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/preconditioners.hpp" -#include "benchmark/utils/spmv_validation.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -128,34 +130,85 @@ std::string encode_parameters(const char* precond_name) } -void run_preconditioner(const char* precond_name, - std::shared_ptr exec, - std::shared_ptr system_matrix, - const vec* b, const vec* x, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& precond_object = test_case["preconditioner"]; - auto encoded_name = encode_parameters(precond_name); +struct preconditioner_benchmark_state { + std::unique_ptr x; + std::unique_ptr b; + std::shared_ptr system_matrix; +}; + + +using Generator = DefaultSystemGenerator<>; + + +struct PreconditionerBenchmark : Benchmark { + std::string name; + std::vector preconditioners; + std::map precond_decoder; - if (!FLAGS_overwrite && - precond_object.HasMember(encoded_name.c_str())) { - return; + PreconditionerBenchmark() + : name{"preconditioner"}, preconditioners{split(FLAGS_preconditioners)} + { + for (auto precond : split(FLAGS_preconditioners)) { + preconditioners.push_back(encode_parameters(precond.c_str())); + precond_decoder[preconditioners.back()] = precond; } + } + + const 
std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return preconditioners; + } - add_or_set_member(precond_object, encoded_name.c_str(), - rapidjson::Value(rapidjson::kObjectType), allocator); - auto& this_precond_data = precond_object[encoded_name.c_str()]; + bool should_print() const override { return true; } + + bool validate_config(const json& value) const override + { + return Generator::validate_config(value); + } + + std::string get_example_config() const override + { + return Generator::get_example_config(); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + preconditioner_benchmark_state setup(std::shared_ptr exec, + json& test_case) const override + { + preconditioner_benchmark_state state; + auto data = Generator::generate_matrix_data(test_case); + reorder(data, test_case); + + state.system_matrix = + formats::matrix_factory(FLAGS_formats, exec, data); + state.b = Generator::create_multi_vector_random(exec, data.size[0]); + state.x = Generator::create_multi_vector(exec, data.size[0], + gko::zero()); + + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + return state; + } - add_or_set_member(this_precond_data, "generate", - rapidjson::Value(rapidjson::kObjectType), allocator); - add_or_set_member(this_precond_data, "apply", - rapidjson::Value(rapidjson::kObjectType), allocator); + + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, preconditioner_benchmark_state& state, + const std::string& encoded_precond_name, + json& precond_case) const override + { + auto decoded_precond_name = precond_decoder.at(encoded_precond_name); for (auto stage : {"generate", "apply"}) { - 
add_or_set_member(this_precond_data[stage], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); + precond_case[stage] = json::object(); + precond_case[stage]["components"] = json::object(); } IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)}; @@ -163,54 +216,57 @@ void run_preconditioner(const char* precond_name, { // fast run, gets total time - auto x_clone = clone(x); - - auto precond = precond_factory.at(precond_name)(exec); + auto x_clone = clone(state.x); + auto precond = precond_factory.at(decoded_precond_name)(exec); - for (auto _ : ic_apply.warmup_run()) { - precond->generate(system_matrix)->apply(b, x_clone); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic_apply.warmup_run()) { + precond->generate(state.system_matrix) + ->apply(state.b, x_clone); + } } std::unique_ptr precond_op; for (auto _ : ic_gen.run()) { - precond_op = precond->generate(system_matrix); + auto range = annotate("repetition generate"); + precond_op = precond->generate(state.system_matrix); } - add_or_set_member(this_precond_data["generate"], "time", - ic_gen.compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(this_precond_data["generate"], "repetitions", - ic_gen.get_num_repetitions(), allocator); + precond_case["generate"]["time"] = + ic_gen.compute_time(FLAGS_timer_method); + precond_case["generate"]["repetitions"] = + ic_gen.get_num_repetitions(); for (auto _ : ic_apply.run()) { - precond_op->apply(b, x_clone); + auto range = annotate("repetition apply"); + precond_op->apply(state.b, x_clone); } - add_or_set_member(this_precond_data["apply"], "time", - ic_apply.compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(this_precond_data["apply"], "repetitions", - ic_apply.get_num_repetitions(), allocator); + precond_case["apply"]["time"] = + ic_apply.compute_time(FLAGS_timer_method); + precond_case["apply"]["repetitions"] = + ic_apply.get_num_repetitions(); } if (FLAGS_detailed) { // slow run, times each 
component separately - auto x_clone = clone(x); - auto precond = precond_factory.at(precond_name)(exec); + auto x_clone = clone(state.x); + auto precond = precond_factory.at(decoded_precond_name)(exec); std::unique_ptr precond_op; { auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - this_precond_data["generate"]["components"], allocator, + precond_case["generate"]["components"], ic_gen.get_num_repetitions()); exec->add_logger(gen_logger); if (exec->get_master() != exec) { exec->get_master()->add_logger(gen_logger); } for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) { - precond_op = precond->generate(system_matrix); + precond_op = precond->generate(state.system_matrix); } if (exec->get_master() != exec) { exec->get_master()->remove_logger(gen_logger); @@ -220,38 +276,22 @@ void run_preconditioner(const char* precond_name, auto apply_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - this_precond_data["apply"]["components"], allocator, + precond_case["apply"]["components"], ic_apply.get_num_repetitions()); exec->add_logger(apply_logger); if (exec->get_master() != exec) { exec->get_master()->add_logger(apply_logger); } for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) { - precond_op->apply(b, x_clone); + precond_op->apply(state.b, x_clone); } if (exec->get_master() != exec) { exec->get_master()->remove_logger(apply_logger); } exec->remove_logger(apply_logger); } - - add_or_set_member(this_precond_data, "completed", true, allocator); - } catch (const std::exception& e) { - auto encoded_name = encode_parameters(precond_name); - add_or_set_member(test_case["preconditioner"], encoded_name.c_str(), - rapidjson::Value(rapidjson::kObjectType), allocator); - add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], - "completed", false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - 
add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], - "error", msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; } -} +}; int main(int argc, char* argv[]) @@ -260,11 +300,11 @@ int main(int argc, char* argv[]) FLAGS_formats = "csr"; std::string header = "A benchmark for measuring preconditioner performance.\n"; - std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format); std::string extra_information = - "Running with preconditioners: " + FLAGS_preconditioners + "\n"; + "Running with preconditioners: " + FLAGS_preconditioners; print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); @@ -278,76 +318,10 @@ int main(int argc, char* argv[]) std::exit(1); } - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(get_input_stream()); - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - DefaultSystemGenerator<> generator{}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("preconditioner")) { - test_case.AddMember("preconditioner", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& precond_object = test_case["preconditioner"]; - if (!FLAGS_overwrite && - all_of(begin(preconditioners), end(preconditioners), - [&precond_object](const std::string& s) { - return precond_object.HasMember(s.c_str()); - })) { - continue; - } - std::clog << "Running 
test case: " << test_case << std::endl; - - // annotate the test case - auto test_case_range = - annotate(generator.describe_config(test_case)); - - auto data = generator.generate_matrix_data(test_case); - - auto system_matrix = - share(formats::matrix_factory(FLAGS_formats, exec, data)); - auto b = generator.create_multi_vector_random( - exec, system_matrix->get_size()[0]); - auto x = generator.create_multi_vector( - exec, system_matrix->get_size()[0], gko::zero()); - - std::clog << "Matrix is of size (" << system_matrix->get_size()[0] - << ", " << system_matrix->get_size()[1] << ")" - << std::endl; - add_or_set_member(test_case, "size", data.size[0], allocator); - for (const auto& precond_name : preconditioners) { - { - auto precond_range = annotate(precond_name.c_str()); - run_preconditioner(precond_name.c_str(), exec, - system_matrix, b.get(), x.get(), - test_case, allocator); - } - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - backup_results(test_cases); - } - } catch (const std::exception& e) { - std::cerr << "Error setting up preconditioner, what(): " << e.what() - << std::endl; - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } + run_test_cases(PreconditionerBenchmark{}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh old mode 100644 new mode 100755 diff --git a/benchmark/solver/distributed/CMakeLists.txt b/benchmark/solver/distributed/CMakeLists.txt index ca6586f1acf..5f6acd5a06c 100644 --- a/benchmark/solver/distributed/CMakeLists.txt +++ b/benchmark/solver/distributed/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(solver-distributed "YES" solver.cpp) +ginkgo_add_typed_benchmark_executables(solver_distributed "YES" solver.cpp) diff --git a/benchmark/solver/distributed/solver.cpp 
b/benchmark/solver/distributed/solver.cpp index 2db71c16ca3..6577c12e52e 100644 --- a/benchmark/solver/distributed/solver.cpp +++ b/benchmark/solver/distributed/solver.cpp @@ -39,8 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define GKO_BENCHMARK_DISTRIBUTED + + #include "benchmark/solver/solver_common.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" @@ -52,7 +55,7 @@ struct Generator : public DistributedDefaultSystemGenerator { std::unique_ptr generate_rhs(std::shared_ptr exec, const gko::LinOp* system_matrix, - rapidjson::Value& config) const + json& config) const { return Vec::create( exec, comm, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs}, @@ -82,9 +85,13 @@ int main(int argc, char* argv[]) FLAGS_repetitions = "1"; FLAGS_min_repetitions = 1; + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + const auto do_print = rank == 0; + std::string header = "A benchmark for measuring Ginkgo's distributed solvers\n"; - std::string format = example_config + R"( + std::string format = solver_example_config + R"( The matrix will either be read from an input file if the filename parameter is given, or generated as a stencil matrix. If the filename parameter is given, all processes will read the file and @@ -98,10 +105,9 @@ int main(int argc, char* argv[]) "-", where both "local_format" and "non_local_format" can be any of the recognized spmv formats. 
)"; - initialize_argument_parsing(&argc, &argv, header, format); - - const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - const auto rank = comm.rank(); + std::string additional_json = R"(,"optimal":{"spmv":"csr-csr"})"; + initialize_argument_parsing_matrix(&argc, &argv, header, format, + additional_json, do_print); auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get()); @@ -112,8 +118,8 @@ int main(int argc, char* argv[]) "Running " + FLAGS_solvers + " with " + std::to_string(FLAGS_max_iters) + " iterations and residual goal of " + ss_rel_res_goal.str() + "\nThe number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; - if (rank == 0) { + std::to_string(FLAGS_nrhs); + if (do_print) { print_general_information(extra_information); } @@ -134,17 +140,12 @@ int main(int argc, char* argv[]) "optimal": {"spmv": "csr-csr"}] )" : broadcast_json_input(get_input_stream(), comm); - rapidjson::Document test_cases; - test_cases.Parse(json_input.c_str()); - - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(json_input); - run_solver_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer), - test_cases, Generator(comm), rank == 0); + run_test_cases(SolverBenchmark{Generator{comm}}, exec, + get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases); - if (rank == 0) { - std::cout << test_cases << std::endl; + if (do_print) { + std::cout << std::setw(4) << test_cases << std::endl; } } diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index 9190c99dad0..b656102e5df 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/solver/solver_common.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" @@ -58,10 +58,12 @@ int main(int argc, char* argv[]) FLAGS_min_repetitions = 1; std::string header = "A benchmark for measuring performance of Ginkgo's solvers.\n"; - std::string format = example_config + R"( + std::string format = solver_example_config + R"( "optimal":"spmv" can be one of the recognized spmv formats )"; - initialize_argument_parsing(&argc, &argv, header, format); + std::string additional_json = R"(,"optimal":{"spmv":"csr"})"; + initialize_argument_parsing_matrix(&argc, &argv, header, format, + additional_json); std::stringstream ss_rel_res_goal; ss_rel_res_goal << std::scientific << FLAGS_rel_res_goal; @@ -70,29 +72,24 @@ int main(int argc, char* argv[]) "Running " + FLAGS_solvers + " with " + std::to_string(FLAGS_max_iters) + " iterations and residual goal of " + ss_rel_res_goal.str() + "\nThe number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; + std::to_string(FLAGS_nrhs); print_general_information(extra_information); auto exec = get_executor(FLAGS_gpu_timer); - rapidjson::Document test_cases; + json test_cases; if (!FLAGS_overhead) { - rapidjson::IStreamWrapper jcin(get_input_stream()); - test_cases.ParseStream(jcin); + test_cases = json::parse(get_input_stream()); } else { // Fake test case to run once auto overhead_json = std::string() + " [{\"filename\": \"overhead.mtx\", \"optimal\": " "{ \"spmv\": \"csr\"}}]"; - test_cases.Parse(overhead_json.c_str()); + test_cases = json::parse(overhead_json); } - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } - - run_solver_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), test_cases, - SolverGenerator{}, true); + run_test_cases(SolverBenchmark{SolverGenerator{}}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << 
test_cases << std::endl; } diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 64190f8d968..a46cc188c50 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -36,9 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/preconditioners.hpp" +#include "benchmark/utils/runner.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING @@ -59,13 +62,13 @@ DEFINE_bool( rel_residual, false, "Use relative residual instead of residual reduction stopping criterion"); -DEFINE_string( - solvers, "cg", - "A comma-separated list of solvers to run. " - "Supported values are: bicgstab, bicg, cb_gmres_keep, " - "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, " - "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, " - "lower_trs, upper_trs, spd_direct, symm_direct, direct, overhead"); +DEFINE_string(solvers, "cg", + "A comma-separated list of solvers to run. 
" + "Supported values are: bicgstab, bicg, cb_gmres_keep, " + "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, " + "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, " + "lower_trs, upper_trs, spd_direct, symm_direct, " + "near_symm_direct, direct, overhead"); DEFINE_uint32( nrhs, 1, @@ -107,7 +110,7 @@ DEFINE_bool(overhead, false, "If set, uses dummy data to benchmark Ginkgo overhead"); -std::string example_config = R"( +std::string solver_example_config = R"( [ {"filename": "my_file.mtx", "optimal": {"spmv": "ell-csr"}, "rhs": "my_file_rhs.mtx"}, @@ -119,28 +122,6 @@ std::string example_config = R"( )"; -// input validation -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of solver configurations:\n" - << example_config << std::endl; - std::exit(1); -} - - -void validate_option_object(const rapidjson::Value& value) -{ - if (!value.IsObject() || - !((value.HasMember("size") && value.HasMember("stencil") && - value["size"].IsInt64() && value["stencil"].IsString()) || - (value.HasMember("filename") && value["filename"].IsString())) || - (!value.HasMember("optimal") && !value["optimal"].HasMember("spmv") && - !value["optimal"]["spmv"].IsString())) { - print_config_error_and_exit(); - } -} - - std::shared_ptr create_criterion( std::shared_ptr exec, std::uint32_t max_iters) { @@ -259,21 +240,26 @@ std::unique_ptr generate_solver( return gko::experimental::solver::Direct::build() .with_factorization( gko::experimental::factorization::Cholesky::build() - .on(exec)) + itype>::build()) .on(exec); } else if (description == "symm_direct") { return gko::experimental::solver::Direct::build() .with_factorization( gko::experimental::factorization::Lu::build() - .with_symmetric_sparsity(true) - .on(exec)) + .with_symbolic_algorithm(gko::experimental::factorization:: + symbolic_type::symmetric)) + .on(exec); + } else if (description == "near_symm_direct") { + return gko::experimental::solver::Direct::build() + 
.with_factorization( + gko::experimental::factorization::Lu::build() + .with_symbolic_algorithm(gko::experimental::factorization:: + symbolic_type::near_symmetric)) .on(exec); } else if (description == "direct") { return gko::experimental::solver::Direct::build() .with_factorization( - gko::experimental::factorization::Lu::build().on( - exec)) + gko::experimental::factorization::Lu::build()) .on(exec); } else if (description == "overhead") { return add_criteria_precond_finalize>( @@ -284,21 +270,17 @@ std::unique_ptr generate_solver( } -void write_precond_info(const gko::LinOp* precond, - rapidjson::Value& precond_info, - rapidjson::MemoryPoolAllocator<>& allocator) +void write_precond_info(const gko::LinOp* precond, json& precond_info) { if (const auto jacobi = dynamic_cast*>(precond)) { // extract block sizes const auto bdata = jacobi->get_parameters().block_pointers.get_const_data(); - add_or_set_member(precond_info, "block_sizes", - rapidjson::Value(rapidjson::kArrayType), allocator); + precond_info["block_sizes"] = json::array(); const auto nblocks = jacobi->get_num_blocks(); for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i], - allocator); + precond_info["block_sizes"].push_back(bdata[i + 1] - bdata[i]); } // extract block precisions @@ -306,24 +288,19 @@ void write_precond_info(const gko::LinOp* precond, jacobi->get_parameters() .storage_optimization.block_wise.get_const_data(); if (pdata) { - add_or_set_member(precond_info, "block_precisions", - rapidjson::Value(rapidjson::kArrayType), - allocator); + precond_info["block_precisions"] = json::array(); for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_precisions"].PushBack( - static_cast(pdata[i]), allocator); + precond_info["block_precisions"].push_back( + static_cast(pdata[i])); } } // extract condition numbers const auto cdata = jacobi->get_conditioning(); if (cdata) { - add_or_set_member(precond_info, 
"block_conditioning", - rapidjson::Value(rapidjson::kArrayType), - allocator); + precond_info["block_conditioning"] = json::array(); for (auto i = decltype(nblocks){0}; i < nblocks; ++i) { - precond_info["block_conditioning"].PushBack(cdata[i], - allocator); + precond_info["block_conditioning"].push_back(cdata[i]); } } } @@ -335,10 +312,10 @@ struct SolverGenerator : DefaultSystemGenerator<> { std::unique_ptr generate_rhs(std::shared_ptr exec, const gko::LinOp* system_matrix, - rapidjson::Value& config) const + json& config) const { - if (config.HasMember("rhs")) { - std::ifstream rhs_fd{config["rhs"].GetString()}; + if (config.contains("rhs")) { + std::ifstream rhs_fd{config["rhs"].get()}; return gko::read(rhs_fd, std::move(exec)); } else { gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs}; @@ -399,70 +376,143 @@ struct SolverGenerator : DefaultSystemGenerator<> { }; -template -void solve_system(const std::string& solver_name, - const std::string& precond_name, - const char* precond_solver_name, - std::shared_ptr exec, - std::shared_ptr timer, - std::shared_ptr system_matrix, - const VectorType* b, const VectorType* x, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& solver_case = test_case["solver"]; - if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) { - return; +template +struct solver_benchmark_state { + using Vec = typename Generator::Vec; + std::shared_ptr system_matrix; + std::unique_ptr b; + std::unique_ptr x; +}; + + +template +struct SolverBenchmark : Benchmark> { + std::string name; + std::vector precond_solvers; + std::map> decoder; + Generator generator; + + SolverBenchmark(Generator generator) : name{"solver"}, generator{generator} + { + auto solvers = split(FLAGS_solvers, ','); + auto preconds = split(FLAGS_preconditioners, ','); + for (const auto& s : solvers) { + for (const auto& p : preconds) { + precond_solvers.push_back(s + (p == "none" ? 
"" : "-" + p)); + decoder[precond_solvers.back()] = {s, p}; + } + } + } + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return precond_solvers; + } + + bool should_print() const override { return true; } + + std::string get_example_config() const override + { + return solver_example_config; + } + + bool validate_config(const json& value) const override + { + return generator.validate_config(value) && + (value.contains("optimal") && + value["optimal"].contains("spmv") && + value["optimal"]["spmv"].is_string()); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + solver_benchmark_state setup(std::shared_ptr exec, + json& test_case) const override + { + solver_benchmark_state state; + + if (FLAGS_overhead) { + state.system_matrix = generator.initialize({1.0}, exec); + state.b = generator.initialize( + {std::numeric_limits::quiet_NaN()}, exec); + state.x = generator.initialize({0.0}, exec); + } else { + auto data = generator.generate_matrix_data(test_case); + auto permutation = reorder(data, test_case); + + state.system_matrix = generator.generate_matrix_with_format( + exec, test_case["optimal"]["spmv"].get(), data); + state.b = generator.generate_rhs(exec, state.system_matrix.get(), + test_case); + if (permutation) { + permute(state.b, permutation.get()); + } + state.x = generator.generate_initial_guess( + exec, state.system_matrix.get(), state.b.get()); } - add_or_set_member(solver_case, precond_solver_name, - rapidjson::Value(rapidjson::kObjectType), allocator); - auto& solver_json = solver_case[precond_solver_name]; - add_or_set_member(solver_json, "recurrent_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "true_residuals", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "implicit_residuals", - 
rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(solver_json, "iteration_timestamps", - rapidjson::Value(rapidjson::kArrayType), allocator); - if (b->get_size()[1] == 1 && !FLAGS_overhead) { - auto rhs_norm = compute_norm2(b); - add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator); + std::clog << "Matrix is of size (" << state.system_matrix->get_size()[0] + << ", " << state.system_matrix->get_size()[1] << ")" + << std::endl; + test_case["rows"] = state.system_matrix->get_size()[0]; + test_case["cols"] = state.system_matrix->get_size()[1]; + return state; + } + + + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, + solver_benchmark_state& state, + const std::string& encoded_solver_name, + json& solver_case) const override + { + const auto decoded_pair = decoder.at(encoded_solver_name); + auto& solver_name = decoded_pair.first; + auto& precond_name = decoded_pair.second; + solver_case["recurrent_residuals"] = json::array(); + solver_case["true_residuals"] = json::array(); + solver_case["implicit_residuals"] = json::array(); + solver_case["iteration_timestamps"] = json::array(); + if (state.b->get_size()[1] == 1 && !FLAGS_overhead) { + auto rhs_norm = compute_norm2(state.b.get()); + solver_case["rhs_norm"] = rhs_norm; } for (auto stage : {"generate", "apply"}) { - add_or_set_member(solver_json, stage, - rapidjson::Value(rapidjson::kObjectType), - allocator); - add_or_set_member(solver_json[stage], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); + solver_case[stage] = json::object(); + solver_case[stage]["components"] = json::object(); } IterationControl ic{timer}; // warm run std::shared_ptr solver; - for (auto _ : ic.warmup_run()) { - auto x_clone = clone(x); - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_warmup_max_iters) - ->generate(system_matrix); - solver->apply(b, x_clone); - 
exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + auto x_clone = clone(state.x); + auto precond = precond_factory.at(precond_name)(exec); + solver = generate_solver(exec, give(precond), solver_name, + FLAGS_warmup_max_iters) + ->generate(state.system_matrix); + solver->apply(state.b, x_clone); + exec->synchronize(); + } } // detail run if (FLAGS_detailed && !FLAGS_overhead) { // slow run, get the time of each functions - auto x_clone = clone(x); + auto x_clone = clone(state.x); { auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - solver_json["generate"]["components"], allocator, 1); + solver_case["generate"]["components"], 1); exec->add_logger(gen_logger); if (exec != exec->get_master()) { exec->get_master()->add_logger(gen_logger); @@ -471,7 +521,7 @@ void solve_system(const std::string& solver_name, auto precond = precond_factory.at(precond_name)(exec); solver = generate_solver(exec, give(precond), solver_name, FLAGS_max_iters) - ->generate(system_matrix); + ->generate(state.system_matrix); exec->remove_logger(gen_logger); if (exec != exec->get_master()) { @@ -481,25 +531,22 @@ void solve_system(const std::string& solver_name, if (auto prec = dynamic_cast(solver.get())) { - add_or_set_member(solver_json, "preconditioner", - rapidjson::Value(rapidjson::kObjectType), - allocator); + solver_case["preconditioner"] = json::object(); write_precond_info( clone(exec->get_master(), prec->get_preconditioner()).get(), - solver_json["preconditioner"], allocator); + solver_case["preconditioner"]); } { auto apply_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - solver_json["apply"]["components"], allocator, 1); + solver_case["apply"]["components"], 1); exec->add_logger(apply_logger); if (exec != exec->get_master()) { exec->get_master()->add_logger(apply_logger); } - - solver->apply(b, x_clone); + solver->apply(state.b, x_clone); 
exec->remove_logger(apply_logger); if (exec != exec->get_master()) { @@ -508,17 +555,18 @@ void solve_system(const std::string& solver_name, } // slow run, gets the recurrent and true residuals of each iteration - if (b->get_size()[1] == 1) { - x_clone = clone(x); + if (state.b->get_size()[1] == 1) { + x_clone = clone(state.x); auto res_logger = std::make_shared>( - system_matrix, b, solver_json["recurrent_residuals"], - solver_json["true_residuals"], - solver_json["implicit_residuals"], - solver_json["iteration_timestamps"], allocator); + state.system_matrix, state.b, + solver_case["recurrent_residuals"], + solver_case["true_residuals"], + solver_case["implicit_residuals"], + solver_case["iteration_timestamps"]); solver->add_logger(res_logger); - solver->apply(b, x_clone); + solver->apply(state.b, x_clone); if (!res_logger->has_implicit_res_norms()) { - solver_json.RemoveMember("implicit_residuals"); + solver_case.erase("implicit_residuals"); } } exec->synchronize(); @@ -528,16 +576,17 @@ void solve_system(const std::string& solver_name, auto it_logger = std::make_shared(); auto generate_timer = get_timer(exec, FLAGS_gpu_timer); auto apply_timer = ic.get_timer(); - auto x_clone = clone(x); + auto x_clone = clone(state.x); for (auto status : ic.run(false)) { - x_clone = clone(x); + auto range = annotate("repetition"); + x_clone = clone(state.x); exec->synchronize(); generate_timer->tic(); auto precond = precond_factory.at(precond_name)(exec); solver = generate_solver(exec, give(precond), solver_name, FLAGS_max_iters) - ->generate(system_matrix); + ->generate(state.system_matrix); generate_timer->toc(); exec->synchronize(); @@ -545,164 +594,33 @@ void solve_system(const std::string& solver_name, solver->add_logger(it_logger); } apply_timer->tic(); - solver->apply(b, x_clone); + solver->apply(state.b, x_clone); apply_timer->toc(); if (ic.get_num_repetitions() == 0) { solver->remove_logger(it_logger); } } - it_logger->write_data(solver_json["apply"], allocator); + 
it_logger->write_data(solver_case["apply"]); - if (b->get_size()[1] == 1 && !FLAGS_overhead) { + if (state.b->get_size()[1] == 1 && !FLAGS_overhead) { // a solver is considered direct if it didn't log any iterations - if (solver_json["apply"].HasMember("iterations") && - solver_json["apply"]["iterations"].GetInt() == 0) { - auto error = - compute_direct_error(solver.get(), b, x_clone.get()); - add_or_set_member(solver_json, "forward_error", error, - allocator); - } - auto residual = - compute_residual_norm(system_matrix.get(), b, x_clone.get()); - add_or_set_member(solver_json, "residual_norm", residual, - allocator); - } - add_or_set_member(solver_json["generate"], "time", - generate_timer->compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(solver_json["apply"], "time", - apply_timer->compute_time(FLAGS_timer_method), - allocator); - add_or_set_member(solver_json, "repetitions", - apply_timer->get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(solver_json, "completed", true, allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["solver"][precond_solver_name], "completed", - false, allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["solver"][precond_solver_name], "error", - msg_value, allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - -template -void run_solver_benchmarks(std::shared_ptr exec, - std::shared_ptr timer, - rapidjson::Document& test_cases, - const SystemGenerator& system_generator, - bool do_print) -{ - auto solvers = split(FLAGS_solvers, ','); - auto preconds = split(FLAGS_preconditioners, ','); - std::vector precond_solvers; - for (const auto& s : solvers) { - for (const auto& p : preconds) { - precond_solvers.push_back(s + (p == "none" ? 
"" : "-" + p)); - } - } - - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember("solver")) { - test_case.AddMember("solver", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& solver_case = test_case["solver"]; - if (!FLAGS_overwrite && - all_of(begin(precond_solvers), end(precond_solvers), - [&solver_case](const std::string& s) { - return solver_case.HasMember(s.c_str()); - })) { - continue; - } - // annotate the test case - auto test_case_range = - annotate(system_generator.describe_config(test_case)); - - if (do_print) { - std::clog << "Running test case: " << test_case << std::endl; - } - - using Vec = typename SystemGenerator::Vec; - std::shared_ptr system_matrix; - std::unique_ptr b; - std::unique_ptr x; - if (FLAGS_overhead) { - system_matrix = system_generator.initialize({1.0}, exec); - b = system_generator.initialize( - {std::numeric_limits::quiet_NaN()}, exec); - x = system_generator.initialize({0.0}, exec); - } else { - system_matrix = - system_generator.generate_matrix_with_optimal_format( - exec, test_case); - b = system_generator.generate_rhs(exec, system_matrix.get(), - test_case); - x = system_generator.generate_initial_guess( - exec, system_matrix.get(), b.get()); - } - - if (do_print) { - std::clog << "Matrix is of size (" - << system_matrix->get_size()[0] << ", " - << system_matrix->get_size()[1] << ")" << std::endl; - } - add_or_set_member(test_case, "size", system_matrix->get_size()[0], - allocator); - auto precond_solver_name = begin(precond_solvers); - for (const auto& solver_name : solvers) { - auto solver_range = annotate(solver_name.c_str()); - for (const auto& precond_name : preconds) { - if (do_print) { - std::clog - << 
"\tRunning solver: " << *precond_solver_name - << std::endl; - } - { - auto precond_range = annotate(precond_name.c_str()); - solve_system(solver_name, precond_name, - precond_solver_name->c_str(), exec, timer, - system_matrix, b.get(), x.get(), test_case, - allocator); - } - if (do_print) { - backup_results(test_cases); - } - ++precond_solver_name; - } - } - } catch (const std::exception& e) { - std::cerr << "Error setting up solver, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); + if (solver_case["apply"].contains("iterations") && + solver_case["apply"]["iterations"].get() == 0) { + auto error = compute_direct_error(solver.get(), state.b.get(), + x_clone.get()); + solver_case["forward_error"] = error; } + auto residual = compute_residual_norm(state.system_matrix.get(), + state.b.get(), x_clone.get()); + solver_case["residual_norm"] = residual; } + solver_case["generate"]["time"] = + generate_timer->compute_time(FLAGS_timer_method); + solver_case["apply"]["time"] = + apply_timer->compute_time(FLAGS_timer_method); + solver_case["repetitions"] = apply_timer->get_num_repetitions(); } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } -} +}; #endif // GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 6a817a67c0d..f8d93f6a2c0 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/sparse_blas/operations.hpp" -#include "benchmark/utils/json.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" @@ -632,11 +631,40 @@ class SymbolicLuOperation : public BenchmarkOperation { void run() override { gko::factorization::symbolic_lu(mtx_, result_); } - void write_stats(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& allocator) override + void write_stats(json& object) override { - add_or_set_member(object, "factor_nonzeros", - result_->get_num_stored_elements(), allocator); + object["factor_nonzeros"] = result_->get_num_stored_elements(); + } + +private: + const Mtx* mtx_; + std::unique_ptr result_; +}; + + +class SymbolicLuNearSymmOperation : public BenchmarkOperation { +public: + explicit SymbolicLuNearSymmOperation(const Mtx* mtx) : mtx_{mtx}, result_{} + {} + + std::pair validate() const override + { + return std::make_pair( + validate_symbolic_factorization(mtx_, result_.get()), 0.0); + } + + gko::size_type get_flops() const override { return 0; } + + gko::size_type get_memory() const override { return 0; } + + void run() override + { + gko::factorization::symbolic_lu_near_symm(mtx_, result_); + } + + void write_stats(json& object) override + { + object["factor_nonzeros"] = result_->get_num_stored_elements(); } private: @@ -680,11 +708,9 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { forest_); } - void write_stats(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& allocator) override + void write_stats(json& object) override { - add_or_set_member(object, "factor_nonzeros", - result_->get_num_stored_elements(), allocator); + object["factor_nonzeros"] = result_->get_num_stored_elements(); } private: @@ -695,6 +721,106 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { }; +class ReorderRcmOperation : public BenchmarkOperation { + using reorder_type = gko::experimental::reorder::Rcm; + using 
permute_type = gko::matrix::Permutation; + +public: + explicit ReorderRcmOperation(const Mtx* mtx) + : mtx_{mtx->clone()}, + factory_{reorder_type::build().on(mtx->get_executor())} + {} + + std::pair validate() const override + { + // validating RCM correctness is hard, let's leave it out for now + return {true, 0.0}; + } + + gko::size_type get_flops() const override { return 0; } + + gko::size_type get_memory() const override { return 0; } + + void prepare() override {} + + void run() override { reorder_ = factory_->generate(mtx_); } + +private: + std::shared_ptr mtx_; + std::unique_ptr factory_; + std::unique_ptr reorder_; +}; + + +#if GKO_HAVE_METIS + + +class ReorderNestedDissectionOperation : public BenchmarkOperation { + using factory_type = + gko::experimental::reorder::NestedDissection; + using reorder_type = gko::matrix::Permutation; + +public: + explicit ReorderNestedDissectionOperation(const Mtx* mtx) + : mtx_{mtx->clone()}, + factory_{factory_type::build().on(mtx->get_executor())} + {} + + std::pair validate() const override + { + // validating ND correctness is hard, let's leave it out for now + return {true, 0.0}; + } + + gko::size_type get_flops() const override { return 0; } + + gko::size_type get_memory() const override { return 0; } + + void prepare() override {} + + void run() override { reorder_ = factory_->generate(mtx_); } + +private: + std::shared_ptr mtx_; + std::unique_ptr factory_; + std::unique_ptr reorder_; +}; + + +#endif + + +class ReorderApproxMinDegOperation : public BenchmarkOperation { + using factory_type = gko::experimental::reorder::Amd; + using reorder_type = gko::matrix::Permutation; + +public: + explicit ReorderApproxMinDegOperation(const Mtx* mtx) + : mtx_{mtx->clone()}, + factory_{factory_type::build().on(mtx->get_executor())} + {} + + std::pair validate() const override + { + // validating AMD correctness is hard, let's leave it out for now + return {true, 0.0}; + } + + gko::size_type get_flops() const override { return 0; 
} + + gko::size_type get_memory() const override { return 0; } + + void prepare() override {} + + void run() override { reorder_ = factory_->generate(mtx_); } + +private: + std::shared_ptr mtx_; + std::unique_ptr factory_; + std::unique_ptr reorder_; +}; + + const std::map(const Mtx*)>> operation_map{ @@ -722,12 +848,33 @@ const std::map(mtx); }}, + {"symbolic_lu_near_symm", + [](const Mtx* mtx) { + return std::make_unique(mtx); + }}, {"symbolic_cholesky", [](const Mtx* mtx) { return std::make_unique(mtx, false); }}, - {"symbolic_cholesky_symmetric", [](const Mtx* mtx) { + {"symbolic_cholesky_symmetric", + [](const Mtx* mtx) { return std::make_unique(mtx, true); + }}, + {"reorder_rcm", + [](const Mtx* mtx) { + return std::make_unique(mtx); + }}, + {"reorder_amd", + [](const Mtx* mtx) { + return std::make_unique(mtx); + }}, + {"reorder_nd", + [](const Mtx* mtx) -> std::unique_ptr { +#if GKO_HAVE_METIS + return std::make_unique(mtx); +#else + GKO_NOT_COMPILED(METIS); +#endif }}}; diff --git a/benchmark/sparse_blas/operations.hpp b/benchmark/sparse_blas/operations.hpp index 99cf72b8e59..48034eb8a1f 100644 --- a/benchmark/sparse_blas/operations.hpp +++ b/benchmark/sparse_blas/operations.hpp @@ -36,9 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - +#include "benchmark/utils/json.hpp" #include "benchmark/utils/types.hpp" @@ -79,9 +77,7 @@ class BenchmarkOperation { /** * Allows the operation to write arbitrary information to the JSON output. */ - virtual void write_stats(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& allocator) - {} + virtual void write_stats(json& object) {} }; diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index 4fb06d2a4a0..d1dc67f8d2d 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -45,9 +45,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/sparse_blas/operations.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" +#include "benchmark/utils/iteration_control.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/types.hpp" #include "core/test/utils/matrix_generator.hpp" @@ -57,92 +58,129 @@ const auto benchmark_name = "sparse_blas"; using mat_data = gko::matrix_data; -DEFINE_string( - operations, "spgemm,spgeam,transpose", +const char* operations_string = "Comma-separated list of operations to be benchmarked. Can be " "spgemm, spgeam, transpose, sort, is_sorted, generate_lookup, " - "lookup, symbolic_lu, symbolic_cholesky, symbolic_cholesky_symmetric"); + "lookup, symbolic_lu, symbolic_lu_near_symm, symbolic_cholesky, " + "symbolic_cholesky_symmetric, reorder_rcm, " +#if GKO_HAVE_METIS + "reorder_nd, " +#endif + "reorder_amd"; + +DEFINE_string(operations, "spgemm,spgeam,transpose", operations_string); DEFINE_bool(validate, false, "Check for correct sparsity pattern and compute the L2 norm " "against the ReferenceExecutor solution."); -void apply_sparse_blas(const char* operation_name, - std::shared_ptr exec, const Mtx* mtx, - rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - add_or_set_member(test_case, operation_name, - rapidjson::Value(rapidjson::kObjectType), allocator); +using Generator = DefaultSystemGenerator<>; + + +struct SparseBlasBenchmark : Benchmark> { + std::string name; + std::vector operations; + + SparseBlasBenchmark() + : name{"sparse_blas"}, operations{split(FLAGS_operations)} + {} + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return operations; + } + + bool should_print() const override { return true; } + + bool validate_config(const json& value) const override + { + return 
Generator::validate_config(value); + } + + std::string get_example_config() const override + { + return Generator::get_example_config(); + } + + std::string describe_config(const json& test_case) const override + { + return Generator::describe_config(test_case); + } + + std::unique_ptr setup(std::shared_ptr exec, + json& test_case) const override + { + auto data = Generator::generate_matrix_data(test_case); + reorder(data, test_case); + std::clog << "Matrix is of size (" << data.size[0] << ", " + << data.size[1] << "), " << data.nonzeros.size() << std::endl; + test_case["rows"] = data.size[0]; + test_case["cols"] = data.size[1]; + test_case["nonzeros"] = data.nonzeros.size(); + + auto mtx = Mtx::create(exec, data.size, data.nonzeros.size()); + mtx->read(data); + return mtx; + } + - auto op = get_operation(operation_name, mtx); + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, std::unique_ptr& mtx, + const std::string& operation_name, + json& operation_case) const override + { + auto op = get_operation(operation_name, mtx.get()); - auto timer = get_timer(exec, FLAGS_gpu_timer); IterationControl ic(timer); // warm run - for (auto _ : ic.warmup_run()) { - op->prepare(); - exec->synchronize(); - op->run(); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + op->prepare(); + exec->synchronize(); + op->run(); + exec->synchronize(); + } } // timed run op->prepare(); for (auto _ : ic.run()) { + auto range = annotate("repetition"); op->run(); } const auto runtime = ic.compute_time(FLAGS_timer_method); const auto flops = static_cast(op->get_flops()); const auto mem = static_cast(op->get_memory()); const auto repetitions = ic.get_num_repetitions(); - add_or_set_member(test_case[operation_name], "time", runtime, - allocator); - add_or_set_member(test_case[operation_name], "flops", flops / runtime, - allocator); - add_or_set_member(test_case[operation_name], "bandwidth", mem / 
runtime, - allocator); - add_or_set_member(test_case[operation_name], "repetitions", repetitions, - allocator); + operation_case["time"] = runtime; + operation_case["flops"] = flops / runtime; + operation_case["bandwidth"] = mem / runtime; + operation_case["repetitions"] = repetitions; if (FLAGS_validate) { auto validation_result = op->validate(); - add_or_set_member(test_case[operation_name], "correct", - validation_result.first, allocator); - add_or_set_member(test_case[operation_name], "error", - validation_result.second, allocator); + operation_case["correct"] = validation_result.first; + operation_case["error"] = validation_result.second; } if (FLAGS_detailed) { - add_or_set_member(test_case[operation_name], "components", - rapidjson::Value(rapidjson::kObjectType), - allocator); + operation_case["components"] = json::object(); auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - test_case[operation_name]["components"], allocator, 1); + operation_case["components"], repetitions); exec->add_logger(gen_logger); - op->run(); + for (unsigned i = 0; i < repetitions; i++) { + op->run(); + } exec->remove_logger(gen_logger); } - op->write_stats(test_case[operation_name], allocator); - - add_or_set_member(test_case[operation_name], "completed", true, - allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case[operation_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case[operation_name], "error", msg_value, - allocator); - } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; + op->write_stats(operation_case); } -} +}; int main(int argc, char* argv[]) @@ -150,86 +188,18 @@ int main(int argc, char* argv[]) std::string header = "A benchmark for measuring performance of Ginkgo's sparse BLAS " "operations.\n"; - std::string format = 
example_config; - initialize_argument_parsing(&argc, &argv, header, format); + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(get_input_stream()); std::string extra_information = "The operations are " + FLAGS_operations; print_general_information(extra_information); - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); - } - auto annotate = annotate_functor{profiler_hook}; - - auto operations = split(FLAGS_operations, ','); - - DefaultSystemGenerator<> generator{}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - validate_option_object(test_case); - if (!test_case.HasMember(benchmark_name)) { - test_case.AddMember(rapidjson::Value(benchmark_name, allocator), - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& sp_blas_case = test_case[benchmark_name]; - std::clog << "Running test case: " << test_case << std::endl; - auto data = generator.generate_matrix_data(test_case); - data.ensure_row_major_order(); - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << "), " << data.nonzeros.size() - << std::endl; - add_or_set_member(test_case, "rows", data.size[0], allocator); - add_or_set_member(test_case, "cols", data.size[1], allocator); - add_or_set_member(test_case, "nonzeros", data.nonzeros.size(), - allocator); - - auto mtx = Mtx::create(exec, data.size, data.nonzeros.size()); - mtx->read(data); - // annotate the test case - auto test_case_range = - annotate(generator.describe_config(test_case)); - for (const auto& operation_name : 
operations) { - if (FLAGS_overwrite || - !sp_blas_case.HasMember(operation_name.c_str())) { - { - auto operation_range = annotate(operation_name.c_str()); - apply_sparse_blas(operation_name.c_str(), exec, - mtx.get(), sp_blas_case, allocator); - } - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - backup_results(test_cases); - } - } - // write the output if we have no strategies - backup_results(test_cases); - } catch (const std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); - } - } - } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } + run_test_cases(SparseBlasBenchmark{}, exec, + get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/spmv/distributed/CMakeLists.txt b/benchmark/spmv/distributed/CMakeLists.txt index cadde3eea34..4322dd70e90 100644 --- a/benchmark/spmv/distributed/CMakeLists.txt +++ b/benchmark/spmv/distributed/CMakeLists.txt @@ -1 +1 @@ -ginkgo_add_typed_benchmark_executables(spmv-distributed "YES" spmv.cpp) +ginkgo_add_typed_benchmark_executables(spmv_distributed "YES" spmv.cpp) diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp index 3c2986846b3..d3925dabcf2 100644 --- a/benchmark/spmv/distributed/spmv.cpp +++ b/benchmark/spmv/distributed/spmv.cpp @@ -43,8 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#define GKO_BENCHMARK_DISTRIBUTED + + #include "benchmark/spmv/spmv_common.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -58,38 +61,7 @@ DEFINE_string(non_local_formats, "csr", "run. See the 'formats' option for a list of supported versions"); -std::string example_config = R"( - [ - {"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, - {"filename": "my_file.mtx"} - ] -)"; - - -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of matrix configurations:\n" - << example_config << std::endl; - std::exit(1); -} - - -struct Generator : DistributedDefaultSystemGenerator> { - Generator(gko::experimental::mpi::communicator comm) - : DistributedDefaultSystemGenerator>{ - std::move(comm), {}} - {} - - void validate_options(const rapidjson::Value& options) const - { - if (!options.IsObject() || - !((options.HasMember("size") && options.HasMember("stencil") && - options.HasMember("comm_pattern")) || - options.HasMember("filename"))) { - print_config_error_and_exit(); - } - } -}; +using Generator = DistributedDefaultSystemGenerator>; int main(int argc, char* argv[]) @@ -98,18 +70,19 @@ int main(int argc, char* argv[]) const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); const auto rank = comm.rank(); + const auto do_print = rank == 0; std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; - std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); - - if (rank == 0) { - std::string extra_information = "The formats are [" + - FLAGS_local_formats + "]x[" + - FLAGS_non_local_formats + "]\n" + - "The number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, 
header, format, "", + do_print); + + if (do_print) { + std::string extra_information = + "The formats are [" + FLAGS_local_formats + "]x[" + + FLAGS_non_local_formats + "]\n" + + "The number of right hand sides is " + std::to_string(FLAGS_nrhs); print_general_information(extra_information); } @@ -125,16 +98,13 @@ int main(int argc, char* argv[]) } std::string json_input = broadcast_json_input(get_input_stream(), comm); - rapidjson::Document test_cases; - test_cases.Parse(json_input.c_str()); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(json_input); - run_spmv_benchmark(exec, test_cases, formats, Generator{comm}, - get_mpi_timer(exec, comm, FLAGS_gpu_timer), rank == 0); + run_test_cases(SpmvBenchmark{Generator{comm}, formats, do_print}, + exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer), + test_cases); - if (rank == 0) { - std::cout << test_cases << std::endl; + if (do_print) { + std::cout << std::setw(4) << test_cases << std::endl; } } diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index df000cecd47..abd1b783019 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -39,50 +39,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/spmv/spmv_common.hpp" #include "benchmark/utils/formats.hpp" -#include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" #include "benchmark/utils/generator.hpp" -#include "benchmark/utils/spmv_validation.hpp" -struct Generator : DefaultSystemGenerator<> { - void validate_options(const rapidjson::Value& options) const - { - if (!options.IsObject() || - !((options.HasMember("size") && options.HasMember("stencil")) || - options.HasMember("filename"))) { - std::cerr - << "Input has to be a JSON array of matrix configurations:\n" - << example_config << std::endl; - std::exit(1); - } - } -}; +using Generator = DefaultSystemGenerator<>; int main(int argc, char* argv[]) { std::string header = "A benchmark for measuring performance of Ginkgo's spmv.\n"; - std::string format = example_config; - initialize_argument_parsing(&argc, &argv, header, format); + std::string format = Generator::get_example_config(); + initialize_argument_parsing_matrix(&argc, &argv, header, format); std::string extra_information = "The formats are " + FLAGS_formats + "\nThe number of right hand sides is " + - std::to_string(FLAGS_nrhs) + "\n"; + std::to_string(FLAGS_nrhs); print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer); - auto formats = split(FLAGS_formats, ','); - rapidjson::IStreamWrapper jcin(get_input_stream()); - rapidjson::Document test_cases; - test_cases.ParseStream(jcin); - if (!test_cases.IsArray()) { - print_config_error_and_exit(); - } + auto test_cases = json::parse(get_input_stream()); - run_spmv_benchmark(exec, test_cases, formats, Generator{}, - get_timer(exec, FLAGS_gpu_timer), true); + run_test_cases(SpmvBenchmark{Generator{}, split(FLAGS_formats)}, + exec, get_timer(exec, FLAGS_gpu_timer), test_cases); - std::cout << test_cases << std::endl; + std::cout << std::setw(4) << test_cases << std::endl; } diff --git a/benchmark/spmv/spmv_common.hpp 
b/benchmark/spmv/spmv_common.hpp index 4c40f1b9a7b..1d43e3ed327 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -36,7 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/formats.hpp" #include "benchmark/utils/general.hpp" +#include "benchmark/utils/general_matrix.hpp" +#include "benchmark/utils/iteration_control.hpp" #include "benchmark/utils/loggers.hpp" +#include "benchmark/utils/runner.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING @@ -48,57 +51,123 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. DEFINE_uint32(nrhs, 1, "The number of right hand sides"); -// This function supposes that management of `FLAGS_overwrite` is done before -// calling it -template -void apply_spmv(const char* format_name, std::shared_ptr exec, - const Generator& generator, std::shared_ptr timer, - const gko::matrix_data& data, - const VectorType* b, const VectorType* x, - const VectorType* answer, rapidjson::Value& test_case, - rapidjson::MemoryPoolAllocator<>& allocator) -{ - try { - auto& spmv_case = test_case["spmv"]; - add_or_set_member(spmv_case, format_name, - rapidjson::Value(rapidjson::kObjectType), allocator); +template +struct spmv_benchmark_state { + gko::matrix_data data; + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr answer; +}; + + +template +struct SpmvBenchmark : Benchmark> { + using Vec = typename Generator::Vec; + std::string name; + std::vector formats; + bool do_print; + Generator generator; + + SpmvBenchmark(Generator generator, std::vector formats, + bool do_print = true) + : name{"spmv"}, + formats{std::move(formats)}, + generator{generator}, + do_print{do_print} + {} + + const std::string& get_name() const override { return name; } + + const std::vector& get_operations() const override + { + return formats; + } + + bool should_print() const override { return do_print; } + 
+ std::string get_example_config() const override + { + return generator.get_example_config(); + } + + bool validate_config(const json& test_case) const override + { + return generator.validate_config(test_case); + } + + std::string describe_config(const json& test_case) const override + { + return generator.describe_config(test_case); + } + + spmv_benchmark_state setup(std::shared_ptr exec, + json& test_case) const override + { + spmv_benchmark_state state; + state.data = generator.generate_matrix_data(test_case); + reorder(state.data, test_case); + + auto nrhs = FLAGS_nrhs; + state.b = generator.create_multi_vector_random( + exec, gko::dim<2>{state.data.size[1], nrhs}); + state.x = generator.create_multi_vector_random( + exec, gko::dim<2>{state.data.size[0], nrhs}); + if (do_print) { + std::clog << "Matrix is of size (" << state.data.size[0] << ", " + << state.data.size[1] << "), " + << state.data.nonzeros.size() << std::endl; + } + test_case["rows"] = state.data.size[0]; + test_case["cols"] = state.data.size[1]; + test_case["nonzeros"] = state.data.nonzeros.size(); + if (FLAGS_detailed) { + state.answer = gko::clone(state.x); + auto system_matrix = + generator.generate_matrix_with_default_format(exec, state.data); + exec->synchronize(); + system_matrix->apply(state.b, state.answer); + exec->synchronize(); + } + return state; + } + void run(std::shared_ptr exec, std::shared_ptr timer, + annotate_functor annotate, spmv_benchmark_state& state, + const std::string& format_name, json& format_case) const override + { auto system_matrix = generator.generate_matrix_with_format( - exec, format_name, data, &spmv_case[format_name], &allocator); + exec, format_name, state.data, &format_case); // check the residual if (FLAGS_detailed) { - auto x_clone = clone(x); + auto x_clone = clone(state.x); exec->synchronize(); - system_matrix->apply(b, x_clone); + system_matrix->apply(state.b, x_clone); exec->synchronize(); auto max_relative_norm2 = - 
compute_max_relative_norm2(x_clone.get(), answer); - add_or_set_member(spmv_case[format_name], "max_relative_norm2", - max_relative_norm2, allocator); + compute_max_relative_norm2(x_clone.get(), state.answer.get()); + format_case["max_relative_norm2"] = max_relative_norm2; } IterationControl ic{timer}; // warm run - for (auto _ : ic.warmup_run()) { - auto x_clone = clone(x); - exec->synchronize(); - system_matrix->apply(b, x_clone); - exec->synchronize(); + { + auto range = annotate("warmup", FLAGS_warmup > 0); + for (auto _ : ic.warmup_run()) { + auto x_clone = clone(state.x); + exec->synchronize(); + system_matrix->apply(state.b, x_clone); + exec->synchronize(); + } } // tuning run #ifdef GINKGO_BENCHMARK_ENABLE_TUNING auto& format_case = spmv_case[format_name]; - if (!format_case.HasMember("tuning")) { - format_case.AddMember( - "tuning", rapidjson::Value(rapidjson::kObjectType), allocator); - } + format_case["tuning"] = json::object(); auto& tuning_case = format_case["tuning"]; - add_or_set_member(tuning_case, "time", - rapidjson::Value(rapidjson::kArrayType), allocator); - add_or_set_member(tuning_case, "values", - rapidjson::Value(rapidjson::kArrayType), allocator); + tuning_case["time"] = json::array(); + tuning_case["values"] = json::array(); // Enable tuning for this portion of code gko::_tuning_flag = true; @@ -112,13 +181,13 @@ void apply_spmv(const char* format_name, std::shared_ptr exec, gko::_tuned_value = val; auto tuning_timer = get_timer(exec, FLAGS_gpu_timer); IterationControl ic_tuning{tuning_timer}; - auto x_clone = clone(x); + auto x_clone = clone(state.x); for (auto _ : ic_tuning.run()) { - system_matrix->apply(b, x_clone); + system_matrix->apply(state.b, x_clone); } - tuning_case["time"].PushBack( - ic_tuning.compute_time(FLAGS_timer_method), allocator); - tuning_case["values"].PushBack(val, allocator); + tuning_case["time"].push_back( + ic_tuning.compute_time(FLAGS_timer_method)); + tuning_case["values"].push_back(val); } // We put back the 
flag to false to use the default (non-tuned) values // for the following @@ -126,141 +195,39 @@ void apply_spmv(const char* format_name, std::shared_ptr exec, #endif // GINKGO_BENCHMARK_ENABLE_TUNING // timed run - auto x_clone = clone(x); + auto x_clone = clone(state.x); for (auto _ : ic.run()) { - system_matrix->apply(b, x_clone); - } - add_or_set_member(spmv_case[format_name], "time", - ic.compute_time(FLAGS_timer_method), allocator); - add_or_set_member(spmv_case[format_name], "repetitions", - ic.get_num_repetitions(), allocator); - - // compute and write benchmark data - add_or_set_member(spmv_case[format_name], "completed", true, allocator); - } catch (const std::exception& e) { - add_or_set_member(test_case["spmv"][format_name], "completed", false, - allocator); - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case["spmv"][format_name], "error", - msg_value, allocator); + auto range = annotate("repetition"); + system_matrix->apply(state.b, x_clone); } - std::cerr << "Error when processing test case " << test_case << "\n" - << "what(): " << e.what() << std::endl; - } -} - - -template -void run_spmv_benchmark(std::shared_ptr exec, - rapidjson::Document& test_cases, - const std::vector formats, - const SystemGenerator& system_generator, - std::shared_ptr timer, bool do_print) -{ - auto& allocator = test_cases.GetAllocator(); - auto profiler_hook = create_profiler_hook(exec); - if (profiler_hook) { - exec->add_logger(profiler_hook); + format_case["time"] = ic.compute_time(FLAGS_timer_method); + format_case["repetitions"] = ic.get_num_repetitions(); } - auto annotate = annotate_functor{profiler_hook}; - - for (auto& test_case : test_cases.GetArray()) { - try { - // set up benchmark - system_generator.validate_options(test_case); - if (!test_case.HasMember("spmv")) { - test_case.AddMember("spmv", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - auto& spmv_case = 
test_case["spmv"]; - if (!FLAGS_overwrite && - all_of(begin(formats), end(formats), - [&spmv_case](const std::string& s) { - return spmv_case.HasMember(s.c_str()); - })) { - continue; - } - if (do_print) { - std::clog << "Running test case: " << test_case << std::endl; - } - // annotate the test case - auto test_case_range = - annotate(system_generator.describe_config(test_case)); - - auto data = system_generator.generate_matrix_data(test_case); - - auto nrhs = FLAGS_nrhs; - auto b = system_generator.create_multi_vector_random( - exec, gko::dim<2>{data.size[1], nrhs}); - auto x = system_generator.create_multi_vector_random( - exec, gko::dim<2>{data.size[0], nrhs}); - if (do_print) { - std::clog << "Matrix is of size (" << data.size[0] << ", " - << data.size[1] << ")" << std::endl; - } - add_or_set_member(test_case, "size", data.size[0], allocator); - add_or_set_member(test_case, "nnz", data.nonzeros.size(), - allocator); - auto best_performance = std::numeric_limits::max(); - if (!test_case.HasMember("optimal")) { - test_case.AddMember("optimal", - rapidjson::Value(rapidjson::kObjectType), - allocator); - } - // Compute the result from ginkgo::coo as the correct answer - auto answer = gko::clone(x); - if (FLAGS_detailed) { - auto system_matrix = - system_generator.generate_matrix_with_default_format(exec, - data); - exec->synchronize(); - system_matrix->apply(b, answer); - exec->synchronize(); - } - for (const auto& format_name : formats) { - { - auto format_range = annotate(format_name.c_str()); - apply_spmv(format_name.c_str(), exec, system_generator, - timer, data, b.get(), x.get(), answer.get(), - test_case, allocator); - } - if (do_print) { - std::clog << "Current state:" << std::endl - << test_cases << std::endl; - } - if (spmv_case[format_name.c_str()]["completed"].GetBool()) { - auto performance = - spmv_case[format_name.c_str()]["time"].GetDouble(); - if (performance < best_performance) { - best_performance = performance; - add_or_set_member( - 
test_case["optimal"], "spmv", - rapidjson::Value(format_name.c_str(), allocator) - .Move(), - allocator); - } - } - if (do_print) { - backup_results(test_cases); + void postprocess(json& test_case) const override + { + if (!test_case.contains("optimal")) { + test_case["optimal"] = json::object(); + } + auto best_time = std::numeric_limits::max(); + std::string best_format; + // find the fastest among all formats we tested + for (const auto& format : formats) { + auto& format_case = test_case[name][format]; + if (format_case.contains("completed") && + format_case["completed"].template get()) { + auto time = format_case["time"]; + if (time < best_time) { + best_time = time; + best_format = format; } } - } catch (const std::exception& e) { - std::cerr << "Error setting up matrix data, what(): " << e.what() - << std::endl; - if (FLAGS_keep_errors) { - rapidjson::Value msg_value; - msg_value.SetString(e.what(), allocator); - add_or_set_member(test_case, "error", msg_value, allocator); - } + } + if (!best_format.empty()) { + test_case["optimal"][name] = best_format; } } - if (profiler_hook) { - exec->remove_logger(profiler_hook); - } -} +}; + #endif // GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt new file mode 100644 index 00000000000..2f43b6eaf71 --- /dev/null +++ b/benchmark/test/CMakeLists.txt @@ -0,0 +1,28 @@ +find_package(Python3 COMPONENTS Interpreter REQUIRED) +function(add_benchmark_test test_name) + configure_file(${test_name}.py ${test_name}.py COPYONLY) + add_test(NAME benchmark_${test_name} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $ + WORKING_DIRECTORY "$") + set(regenerate_target benchmark_test_${test_name}_regenerate) + add_custom_target(${regenerate_target} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $ --generate + COMMENT "Regenerating reference output for ${test_name}" + WORKING_DIRECTORY "$") + 
add_dependencies(${regenerate_target} ${test_name}) + add_dependencies(benchmark_test_regenerate ${regenerate_target}) +endfunction() +add_custom_target(benchmark_test_regenerate) +configure_file(test_framework.py.in test_framework.py @ONLY) +add_benchmark_test(blas) +add_benchmark_test(conversion) +add_benchmark_test(matrix_statistics) +add_benchmark_test(preconditioner) +add_benchmark_test(solver) +add_benchmark_test(sparse_blas) +add_benchmark_test(spmv) +if (GINKGO_BUILD_MPI) + add_benchmark_test(multi_vector_distributed) + add_benchmark_test(spmv_distributed) + add_benchmark_test(solver_distributed) +endif() diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py new file mode 100755 index 00000000000..ff5bddc5d08 --- /dev/null +++ b/benchmark/test/blas.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + ["-input", '[{"n": 100}]'], + expected_stdout="blas.simple.stdout", + expected_stderr="blas.simple.stderr", +) + +# stdin +test_framework.compare_output( + [], + expected_stdout="blas.simple.stdout", + expected_stderr="blas.simple.stderr", + stdin='[{"n": 100}]', +) + +# file +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.blas.json")], + expected_stdout="blas.simple.stdout", + expected_stderr="blas.simple.stderr", +) + +# profiler annotations +test_framework.compare_output( + ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"], + expected_stdout="blas.profile.stdout", + expected_stderr="blas.profile.stderr", +) diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py new file mode 100755 index 00000000000..2eada100731 --- /dev/null +++ b/benchmark/test/conversion.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"], + 
expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", +) + +# stdin +test_framework.compare_output( + ["-formats", "coo,csr"], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) + +# input file +test_framework.compare_output( + [ + "-input", + str(test_framework.sourcepath / "input.mtx.json"), + "-formats", + "coo,csr", + ], + expected_stdout="conversion.simple.stdout", + expected_stderr="conversion.simple.stderr", +) + +# input matrixfile +test_framework.compare_output( + [ + "-input_matrix", + str(test_framework.matrixpath), + "-formats", + "coo,csr", + ], + expected_stdout="conversion.matrix.stdout", + expected_stderr="conversion.matrix.stderr", +) + +# check that all conversions work +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-formats", + "coo,csr,ell,sellp,hybrid", + ], + expected_stdout="conversion.all.stdout", + expected_stderr="conversion.all.stderr", +) + +# profiler annotations +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-formats", + "coo,csr", + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="conversion.profile.stdout", + expected_stderr="conversion.profile.stderr", +) diff --git a/benchmark/test/input.blas.json b/benchmark/test/input.blas.json new file mode 100644 index 00000000000..fe366aa6fa0 --- /dev/null +++ b/benchmark/test/input.blas.json @@ -0,0 +1,5 @@ +[ + { + "n": 100 + } +] \ No newline at end of file diff --git a/benchmark/test/input.distributed_mtx.json b/benchmark/test/input.distributed_mtx.json new file mode 100644 index 00000000000..aca115179e6 --- /dev/null +++ b/benchmark/test/input.distributed_mtx.json @@ -0,0 +1,7 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil" + } +] \ No newline at end of file diff --git a/benchmark/test/input.distributed_solver.json 
b/benchmark/test/input.distributed_solver.json new file mode 100644 index 00000000000..16efbf03fba --- /dev/null +++ b/benchmark/test/input.distributed_solver.json @@ -0,0 +1,10 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + } + } +] \ No newline at end of file diff --git a/benchmark/test/input.mtx.json b/benchmark/test/input.mtx.json new file mode 100644 index 00000000000..fdeb10c8eee --- /dev/null +++ b/benchmark/test/input.mtx.json @@ -0,0 +1,6 @@ +[ + { + "size": 100, + "stencil": "7pt" + } +] \ No newline at end of file diff --git a/benchmark/test/input.solver.json b/benchmark/test/input.solver.json new file mode 100644 index 00000000000..0183700dfe8 --- /dev/null +++ b/benchmark/test/input.solver.json @@ -0,0 +1,9 @@ +[ + { + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + } + } +] \ No newline at end of file diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py new file mode 100755 index 00000000000..6e4d8b1d2f5 --- /dev/null +++ b/benchmark/test/matrix_statistics.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", +) + +# stdin +test_framework.compare_output( + [], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) + +# input file +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="matrix_statistics.simple.stdout", + expected_stderr="matrix_statistics.simple.stderr", +) + +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="matrix_statistics.matrix.stdout", + 
expected_stderr="matrix_statistics.matrix.stderr", +) diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py new file mode 100644 index 00000000000..c62cb8ebd17 --- /dev/null +++ b/benchmark/test/multi_vector_distributed.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output_distributed( + ["-input", '[{"n": 100}]'], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + num_procs=3, +) + +# stdin +test_framework.compare_output_distributed( + [], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + stdin='[{"n": 100}]', + num_procs=3, +) + +# file +test_framework.compare_output_distributed( + ["-input", str(test_framework.sourcepath / "input.blas.json")], + expected_stdout="multi_vector_distributed.simple.stdout", + expected_stderr="multi_vector_distributed.simple.stderr", + num_procs=3, +) + +# profiler annotations +test_framework.compare_output_distributed( + ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"], + expected_stdout="multi_vector_distributed.profile.stdout", + expected_stderr="multi_vector_distributed.profile.stderr", + num_procs=3, +) diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py new file mode 100755 index 00000000000..7226964dd05 --- /dev/null +++ b/benchmark/test/preconditioner.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", +) + +# stdin +test_framework.compare_output( + [], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", + 
stdin='[{"size": 100, "stencil": "7pt"}]', +) + +# input file +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="preconditioner.simple.stdout", + expected_stderr="preconditioner.simple.stderr", +) + +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="preconditioner.matrix.stdout", + expected_stderr="preconditioner.matrix.stderr", +) + +# profiler annotations +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="preconditioner.profile.stdout", + expected_stderr="preconditioner.profile.stderr", +) + +# stdin +test_framework.compare_output( + ["-reorder", "amd"], + expected_stdout="preconditioner.reordered.stdout", + expected_stderr="preconditioner.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr new file mode 100644 index 00000000000..e156d489be3 --- /dev/null +++ b/benchmark/test/reference/blas.profile.stderr @@ -0,0 +1,42 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scal +Running test case n = 100 +DEBUG: begin n = 100 + Running blas: copy +DEBUG: begin copy +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin repetition +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end repetition +DEBUG: end copy + Running blas: axpy +DEBUG: begin axpy +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin repetition +DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: end repetition +DEBUG: end 
axpy + Running blas: scal +DEBUG: begin scal +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin repetition +DEBUG: begin dense::scale +DEBUG: end dense::scale +DEBUG: end repetition +DEBUG: end scal +DEBUG: end n = 100 diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout new file mode 100644 index 00000000000..209e115b557 --- /dev/null +++ b/benchmark/test/reference/blas.profile.stdout @@ -0,0 +1,28 @@ +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr new file mode 100644 index 00000000000..7c5e66b9188 --- /dev/null +++ b/benchmark/test/reference/blas.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scal +Running test case n = 100 + Running blas: copy + Running blas: axpy + Running blas: scal diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout new file mode 100644 index 00000000000..54745d81104 --- /dev/null +++ b/benchmark/test/reference/blas.simple.stdout @@ -0,0 +1,28 @@ +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + 
"repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr new file mode 100644 index 00000000000..37c88fd8b86 --- /dev/null +++ b/benchmark/test/reference/conversion.all.stderr @@ -0,0 +1,21 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo,csr,ell,sellp,hybrid +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo + Running conversion: csr-ell + Running conversion: csr-sellp + Running conversion: csr-hybrid + Running conversion: ell-read + Running conversion: ell-csr + Running conversion: sellp-read + Running conversion: sellp-csr + Running conversion: hybrid-read + Running conversion: hybrid-csr diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout new file mode 100644 index 00000000000..e7a5b8f0f51 --- /dev/null +++ b/benchmark/test/reference/conversion.all.stdout @@ -0,0 +1,76 @@ +[ + { + "size": 100, + "stencil": "7pt", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-ell": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-sellp": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-hybrid": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell-csr": { + "time": 1.0, + "repetitions": 10, + 
"completed": true + }, + "sellp-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr new file mode 100644 index 00000000000..c828fe11267 --- /dev/null +++ b/benchmark/test/reference/conversion.matrix.stderr @@ -0,0 +1,12 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo,csr +Running test case +Matrix is of size (36, 36), 208 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout new file mode 100644 index 00000000000..8489e4b30b4 --- /dev/null +++ b/benchmark/test/reference/conversion.matrix.stdout @@ -0,0 +1,30 @@ +[ + { + "filename": "", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208 + } +] diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr new file mode 100644 index 00000000000..417c7bd71e7 --- /dev/null +++ b/benchmark/test/reference/conversion.profile.stderr @@ -0,0 +1,82 @@ +This is Ginkgo 1.7.0 (master) 
+ running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are coo,csr +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin stencil(100, 7pt) + Running conversion: coo-read +DEBUG: begin coo-read +DEBUG: begin repetition +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end repetition +DEBUG: end coo-read + Running conversion: coo-csr +DEBUG: begin coo-csr +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin repetition +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: end copy() +DEBUG: end repetition +DEBUG: end coo-csr + Running conversion: csr-read +DEBUG: begin csr-read +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin repetition +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: end repetition +DEBUG: end csr-read + Running conversion: csr-coo +DEBUG: begin csr-coo +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin repetition +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_ptrs_to_idxs +DEBUG: end 
components::convert_ptrs_to_idxs +DEBUG: end copy() +DEBUG: end repetition +DEBUG: end csr-coo +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout new file mode 100644 index 00000000000..907eac5b951 --- /dev/null +++ b/benchmark/test/reference/conversion.profile.stdout @@ -0,0 +1,31 @@ +[ + { + "size": 100, + "stencil": "7pt", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 1, + "completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr new file mode 100644 index 00000000000..317330a2334 --- /dev/null +++ b/benchmark/test/reference/conversion.simple.stderr @@ -0,0 +1,12 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo,csr +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running conversion: coo-read + Running conversion: coo-csr + Running conversion: csr-read + Running conversion: csr-coo diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout new file mode 100644 index 00000000000..91b69b8a248 --- /dev/null +++ b/benchmark/test/reference/conversion.simple.stdout @@ -0,0 +1,31 @@ +[ + { + "size": 100, + "stencil": "7pt", + "conversion": { + "coo-read": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "coo-csr": { + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr-read": { + "time": 1.0, + "repetitions": 10, + 
"completed": true + }, + "csr-coo": { + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr new file mode 100644 index 00000000000..fe739a2b773 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.matrix.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case +Matrix is of size (36, 36) + Running solver: cg diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout new file mode 100644 index 00000000000..67ac333bec5 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -0,0 +1,57 @@ +[ + { + "filename": "", + "optimal": { + "spmv": "csr-csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "dense::row_gather": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_squared_norm2": 1.0, + "dense::compute_sqrt": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 27, + "time": 1.0 + }, + 
"preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 36, + "cols": 36 + } +] diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr new file mode 100644 index 00000000000..ade54da3089 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -0,0 +1,448 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case stencil(100, 7pt, stencil) +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy 
+DEBUG: end copy +DEBUG: end copy() +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +Matrix is of size (125, 125) +DEBUG: begin stencil(100, 7pt, stencil) + Running solver: cg +DEBUG: begin cg +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin repetition +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin generate() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end generate() +DEBUG: begin apply() +DEBUG: begin iteration +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: 
end advanced_apply() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: 
end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: 
end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end 
dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end 
dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: end apply() +DEBUG: end repetition +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin advanced_apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end advanced_apply() +DEBUG: begin dense::compute_squared_norm2 +DEBUG: end dense::compute_squared_norm2 +DEBUG: begin dense::compute_sqrt +DEBUG: end dense::compute_sqrt +DEBUG: begin copy +DEBUG: end copy +DEBUG: end cg +DEBUG: end stencil(100, 7pt, stencil) diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout new file mode 100644 index 00000000000..0a844879c4f --- /dev/null +++ b/benchmark/test/reference/distributed_solver.profile.stdout @@ -0,0 +1,33 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": {}, + "time": 1.0 + }, + "apply": 
{ + "components": {}, + "iterations": 7, + "time": 1.0 + }, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125 + } +] diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr new file mode 100644 index 00000000000..02c580674b3 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case stencil(100, 7pt, stencil) +Matrix is of size (125, 125) + Running solver: cg diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout new file mode 100644 index 00000000000..458115e6ab2 --- /dev/null +++ b/benchmark/test/reference/distributed_solver.simple.stdout @@ -0,0 +1,59 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "optimal": { + "spmv": "csr-csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "dense::row_gather": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_squared_norm2": 1.0, + "dense::compute_sqrt": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + 
"free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125 + } +] diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr new file mode 100644 index 00000000000..b25e792459a --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.matrix.stderr @@ -0,0 +1,4 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running test case +Matrix is of size (36, 36), 208 diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout new file mode 100644 index 00000000000..f5eba9461f7 --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.matrix.stdout @@ -0,0 +1,39 @@ +[ + { + "filename": "", + "problem": { + "rows": 36, + "columns": 36, + "nonzeros": 208, + "row_distribution": { + "min": 4, + "q1": 4.5, + "median": 6.0, + "q3": 7.0, + "max": 9, + "mean": 5.777777777777778, + "variance": 2.061728395061728, + "skewness": 0.3366362745126052, + "kurtosis": 2.0507009932231366, + "hyperskewness": 1.9165991338199193, + "hyperflatness": 6.0545648993883665 + }, + "col_distribution": { + "min": 4, + "q1": 4.5, + "median": 6.0, + "q3": 7.0, + "max": 9, + "mean": 5.777777777777778, + "variance": 2.061728395061728, + "skewness": 0.3366362745126052, + "kurtosis": 2.0507009932231366, + "hyperskewness": 1.9165991338199193, + "hyperflatness": 6.0545648993883665 + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208 + } +] diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr new file mode 100644 index 00000000000..06e12e1159e --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.simple.stderr @@ -0,0 +1,4 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running test case 
stencil(100, 7pt) +Matrix is of size (125, 125), 725 diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout new file mode 100644 index 00000000000..23124781a7d --- /dev/null +++ b/benchmark/test/reference/matrix_statistics.simple.stdout @@ -0,0 +1,40 @@ +[ + { + "size": 100, + "stencil": "7pt", + "problem": { + "rows": 125, + "columns": 125, + "nonzeros": 725, + "row_distribution": { + "min": 4, + "q1": 5.0, + "median": 6.0, + "q3": 6.0, + "max": 7, + "mean": 5.8, + "variance": 0.7199999999999992, + "skewness": -0.23570226039551892, + "kurtosis": 2.388888888888889, + "hyperskewness": -1.741577812922432, + "hyperflatness": 7.762345679012379 + }, + "col_distribution": { + "min": 4, + "q1": 5.0, + "median": 6.0, + "q3": 6.0, + "max": 7, + "mean": 5.8, + "variance": 0.7199999999999992, + "skewness": -0.23570226039551892, + "kurtosis": 2.388888888888889, + "hyperskewness": -1.741577812922432, + "hyperflatness": 7.762345679012379 + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr new file mode 100644 index 00000000000..29dc6b8d286 --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr @@ -0,0 +1,132 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scal +Running test case n = 100 +DEBUG: begin n = 100 + Running blas: copy +DEBUG: begin copy +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end 
components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin repetition +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end repetition +DEBUG: end copy + Running blas: axpy +DEBUG: begin axpy +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin 
components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin repetition +DEBUG: begin dense::add_scaled +DEBUG: end dense::add_scaled +DEBUG: end repetition +DEBUG: end axpy + Running blas: scal +DEBUG: begin scal +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin repetition +DEBUG: begin dense::scale +DEBUG: end dense::scale +DEBUG: end repetition +DEBUG: end scal +DEBUG: end n = 100 diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout new file mode 100644 index 00000000000..209e115b557 --- /dev/null +++ 
b/benchmark/test/reference/multi_vector_distributed.profile.stdout @@ -0,0 +1,28 @@ +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr new file mode 100644 index 00000000000..7c5e66b9188 --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are copy,axpy,scal +Running test case n = 100 + Running blas: copy + Running blas: axpy + Running blas: scal diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout new file mode 100644 index 00000000000..54745d81104 --- /dev/null +++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout @@ -0,0 +1,28 @@ +[ + { + "n": 100, + "blas": { + "copy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "axpy": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + }, + "scal": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "completed": true + } + } + } +] diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr new file mode 100644 index 00000000000..82212a3d2c4 --- /dev/null +++ b/benchmark/test/reference/preconditioner.matrix.stderr @@ -0,0 +1,9 @@ 
+This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case +Matrix is of size (36, 36), 208 + Running preconditioner: none diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout new file mode 100644 index 00000000000..742ec55c41d --- /dev/null +++ b/benchmark/test/reference/preconditioner.matrix.stdout @@ -0,0 +1,31 @@ +[ + { + "filename": "", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate()": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply()": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208 + } +] diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr new file mode 100644 index 00000000000..b90c5e44912 --- /dev/null +++ b/benchmark/test/reference/preconditioner.profile.stderr @@ -0,0 +1,47 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case stencil(100, 7pt) +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin 
components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +Matrix is of size (125, 125), 725 +DEBUG: begin stencil(100, 7pt) + Running preconditioner: none +DEBUG: begin none +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin repetition generate +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end repetition generate +DEBUG: begin repetition apply +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: end repetition apply +DEBUG: end none +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout new file mode 100644 index 00000000000..526349b55ad --- /dev/null +++ b/benchmark/test/reference/preconditioner.profile.stdout @@ -0,0 +1,24 @@ +[ + { + "size": 100, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": {}, + "time": 1.0, + "repetitions": 1 + }, + "apply": { + "components": {}, + "time": 1.0, + "repetitions": 1 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/preconditioner.reordered.stderr b/benchmark/test/reference/preconditioner.reordered.stderr new file mode 100644 index 00000000000..a0bec924a46 --- /dev/null +++ b/benchmark/test/reference/preconditioner.reordered.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running preconditioner: none diff --git 
a/benchmark/test/reference/preconditioner.reordered.stdout b/benchmark/test/reference/preconditioner.reordered.stdout new file mode 100644 index 00000000000..51adfb3b58b --- /dev/null +++ b/benchmark/test/reference/preconditioner.reordered.stdout @@ -0,0 +1,33 @@ +[ + { + "size": 100, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate()": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply()": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr new file mode 100644 index 00000000000..a0bec924a46 --- /dev/null +++ b/benchmark/test/reference/preconditioner.simple.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +Running with preconditioners: none +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running preconditioner: none diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout new file mode 100644 index 00000000000..ed567dcbb13 --- /dev/null +++ b/benchmark/test/reference/preconditioner.simple.stdout @@ -0,0 +1,32 @@ +[ + { + "size": 100, + "stencil": "7pt", + "preconditioner": { + "none": { + "generate": { + "components": { + "generate()": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "apply": { + "components": { + "apply()": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "overhead": 1.0 + }, + "time": 1.0, + "repetitions": 10 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + 
"nonzeros": 725 + } +] diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr new file mode 100644 index 00000000000..fe739a2b773 --- /dev/null +++ b/benchmark/test/reference/solver.matrix.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case +Matrix is of size (36, 36) + Running solver: cg diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout new file mode 100644 index 00000000000..594a3887921 --- /dev/null +++ b/benchmark/test/reference/solver.matrix.stdout @@ -0,0 +1,55 @@ +[ + { + "filename": "", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 27, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 36, + "cols": 36 + } +] diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr new file mode 100644 index 
00000000000..0f972f0aec8 --- /dev/null +++ b/benchmark/test/reference/solver.profile.stderr @@ -0,0 +1,300 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +Matrix is of size (125, 125) +DEBUG: begin stencil(100, 7pt) + Running solver: cg +DEBUG: begin cg +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin repetition +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin generate() +DEBUG: begin generate() +DEBUG: end generate() +DEBUG: end generate() +DEBUG: begin apply() +DEBUG: begin iteration +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin cg::initialize +DEBUG: end cg::initialize +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end 
dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm 
+DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch 
+DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: begin check() +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: begin iteration +DEBUG: begin cg::step_1 +DEBUG: end cg::step_1 +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end dense::compute_conj_dot_dispatch +DEBUG: begin cg::step_2 +DEBUG: end cg::step_2 +DEBUG: begin apply() +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: end apply() +DEBUG: begin dense::compute_conj_dot_dispatch +DEBUG: end 
dense::compute_conj_dot_dispatch +DEBUG: begin check() +DEBUG: begin check() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin residual_norm::residual_norm +DEBUG: end residual_norm::residual_norm +DEBUG: end check() +DEBUG: end check() +DEBUG: end iteration +DEBUG: end apply() +DEBUG: end repetition +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: begin dense::compute_norm2_dispatch +DEBUG: end dense::compute_norm2_dispatch +DEBUG: begin copy +DEBUG: end copy +DEBUG: end cg +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout new file mode 100644 index 00000000000..c132ed1a572 --- /dev/null +++ b/benchmark/test/reference/solver.profile.stdout @@ -0,0 +1,32 @@ +[ + { + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": {}, + "time": 1.0 + }, + "apply": { + "components": {}, + "iterations": 7, + "time": 1.0 + }, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125 + } +] diff --git a/benchmark/test/reference/solver.reordered.stderr b/benchmark/test/reference/solver.reordered.stderr new file mode 100644 index 00000000000..b133e6bfc57 --- /dev/null +++ b/benchmark/test/reference/solver.reordered.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 
+Running test case stencil(100, 7pt) +Matrix is of size (125, 125) + Running solver: cg diff --git a/benchmark/test/reference/solver.reordered.stdout b/benchmark/test/reference/solver.reordered.stdout new file mode 100644 index 00000000000..c1b826ae3fc --- /dev/null +++ b/benchmark/test/reference/solver.reordered.stdout @@ -0,0 +1,57 @@ +[ + { + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125 + } +] diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr new file mode 100644 index 00000000000..b133e6bfc57 --- /dev/null +++ b/benchmark/test/reference/solver.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +Running cg with 1000 iterations and residual goal of 1.000000e-06 +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +Matrix is of size (125, 125) + Running solver: cg diff --git 
a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout new file mode 100644 index 00000000000..0ee0e4b9a4b --- /dev/null +++ b/benchmark/test/reference/solver.simple.stdout @@ -0,0 +1,56 @@ +[ + { + "size": 100, + "stencil": "7pt", + "optimal": { + "spmv": "csr" + }, + "solver": { + "cg": { + "recurrent_residuals": [], + "true_residuals": [], + "implicit_residuals": [], + "iteration_timestamps": [], + "rhs_norm": 1.0, + "generate": { + "components": { + "generate()": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "time": 1.0 + }, + "apply": { + "components": { + "apply()": 1.0, + "iteration": 1.0, + "allocate": 1.0, + "dense::fill": 1.0, + "cg::initialize": 1.0, + "advanced_apply()": 1.0, + "csr::advanced_spmv": 1.0, + "dense::compute_norm2_dispatch": 1.0, + "copy()": 1.0, + "dense::copy": 1.0, + "dense::compute_conj_dot_dispatch": 1.0, + "check()": 1.0, + "residual_norm::residual_norm": 1.0, + "cg::step_1": 1.0, + "csr::spmv": 1.0, + "cg::step_2": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "iterations": 7, + "time": 1.0 + }, + "preconditioner": {}, + "residual_norm": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125 + } +] diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr new file mode 100644 index 00000000000..cbd08e1d21e --- /dev/null +++ b/benchmark/test/reference/sparse_blas.matrix.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are transpose +Running test case +Matrix is of size (36, 36), 208 + Running sparse_blas: transpose diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout new file mode 100644 index 00000000000..a50fa1159d9 --- /dev/null +++ 
b/benchmark/test/reference/sparse_blas.matrix.stdout @@ -0,0 +1,24 @@ +[ + { + "filename": "", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208 + } +] diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr new file mode 100644 index 00000000000..e8376ca2713 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.profile.stderr @@ -0,0 +1,25 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The operations are transpose +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin stencil(100, 7pt) + Running sparse_blas: transpose +DEBUG: begin transpose +DEBUG: begin repetition +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin csr::transpose +DEBUG: end csr::transpose +DEBUG: end repetition +DEBUG: end transpose +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout new file mode 100644 index 00000000000..45cb7e2638a --- /dev/null +++ b/benchmark/test/reference/sparse_blas.profile.stdout @@ -0,0 +1,18 @@ +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125, 
+ "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/sparse_blas.reordered.stderr b/benchmark/test/reference/sparse_blas.reordered.stderr new file mode 100644 index 00000000000..2a7bd2a6665 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.reordered.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are symbolic_cholesky +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running sparse_blas: symbolic_cholesky diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout new file mode 100644 index 00000000000..b5fc8998be0 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.reordered.stdout @@ -0,0 +1,32 @@ +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "symbolic_cholesky": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "compute_elim_forest": 1.0, + "allocate": 1.0, + "free": 1.0, + "components::fill_array": 1.0, + "cholesky::symbolic_count": 1.0, + "components::prefix_sum_nonnegative": 1.0, + "copy": 1.0, + "cholesky::symbolic_factorize": 1.0, + "csr::sort_by_column_index": 1.0, + "overhead": 1.0 + }, + "factor_nonzeros": 1324, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr new file mode 100644 index 00000000000..21c2241c6a5 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.simple.stderr @@ -0,0 +1,9 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The operations are transpose 
+Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running sparse_blas: transpose diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout new file mode 100644 index 00000000000..a44f4f189b2 --- /dev/null +++ b/benchmark/test/reference/sparse_blas.simple.stdout @@ -0,0 +1,25 @@ +[ + { + "size": 100, + "stencil": "7pt", + "sparse_blas": { + "transpose": { + "time": 1.0, + "flops": 1.0, + "bandwidth": 1.0, + "repetitions": 10, + "components": { + "allocate": 1.0, + "components::fill_array": 1.0, + "csr::transpose": 1.0, + "free": 1.0, + "overhead": 1.0 + }, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725 + } +] diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr new file mode 100644 index 00000000000..a184b39b9fd --- /dev/null +++ b/benchmark/test/reference/spmv.matrix.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case +Matrix is of size (36, 36), 208 + Running spmv: coo diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout new file mode 100644 index 00000000000..ea5927ba148 --- /dev/null +++ b/benchmark/test/reference/spmv.matrix.stdout @@ -0,0 +1,20 @@ +[ + { + "filename": "", + "spmv": { + "coo": { + "storage": 3328, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr new file mode 100644 index 00000000000..dff3b58a9dd --- /dev/null +++ 
b/benchmark/test/reference/spmv.profile.stderr @@ -0,0 +1,38 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +Matrix is of size (125, 125), 725 +DEBUG: begin stencil(100, 7pt) + Running spmv: coo +DEBUG: begin coo +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin repetition +DEBUG: begin apply() +DEBUG: begin coo::spmv +DEBUG: end coo::spmv +DEBUG: end apply() +DEBUG: end repetition +DEBUG: end coo +DEBUG: end stencil(100, 7pt) diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout new file mode 100644 index 00000000000..6e4701af719 --- /dev/null +++ b/benchmark/test/reference/spmv.profile.stdout @@ -0,0 +1,20 @@ +[ + { + "size": 100, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/reference/spmv.reordered.stderr b/benchmark/test/reference/spmv.reordered.stderr new file mode 100644 index 00000000000..07044cc70f8 --- /dev/null +++ b/benchmark/test/reference/spmv.reordered.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 
(master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running spmv: coo diff --git a/benchmark/test/reference/spmv.reordered.stdout b/benchmark/test/reference/spmv.reordered.stdout new file mode 100644 index 00000000000..5404235cdf7 --- /dev/null +++ b/benchmark/test/reference/spmv.reordered.stdout @@ -0,0 +1,22 @@ +[ + { + "size": 100, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "reordered": "amd", + "rows": 125, + "cols": 125, + "nonzeros": 725, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr new file mode 100644 index 00000000000..07044cc70f8 --- /dev/null +++ b/benchmark/test/reference/spmv.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are coo +The number of right hand sides is 1 +Running test case stencil(100, 7pt) +Matrix is of size (125, 125), 725 + Running spmv: coo diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout new file mode 100644 index 00000000000..38f2598c616 --- /dev/null +++ b/benchmark/test/reference/spmv.simple.stdout @@ -0,0 +1,21 @@ +[ + { + "size": 100, + "stencil": "7pt", + "spmv": { + "coo": { + "storage": 11600, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 125, + "cols": 125, + "nonzeros": 725, + "optimal": { + "spmv": "coo" + } + } +] diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr 
b/benchmark/test/reference/spmv_distributed.profile.stderr new file mode 100644 index 00000000000..4cd21d00758 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -0,0 +1,140 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 0 warm iterations and 1 running iterations +The random seed for right hand sides is 42 +The formats are [csr]x[csr] +The number of right hand sides is 1 +Running test case stencil(100, 7pt, stencil) +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: 
begin copy +DEBUG: end copy +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin dense::fill_in_matrix_data +DEBUG: end dense::fill_in_matrix_data +Matrix is of size (81, 81), 144 +DEBUG: begin stencil(100, 7pt, stencil) + Running spmv: csr-csr +DEBUG: begin csr-csr +DEBUG: begin partition::build_ranges_from_global_size +DEBUG: end partition::build_ranges_from_global_size +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin partition::build_from_contiguous +DEBUG: end partition::build_from_contiguous +DEBUG: begin partition::build_starting_indices +DEBUG: end partition::build_starting_indices +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy() +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: end copy() +DEBUG: begin dense::fill +DEBUG: end dense::fill +DEBUG: begin components::aos_to_soa +DEBUG: end components::aos_to_soa +DEBUG: begin distributed_matrix::build_local_nonlocal +DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end 
components::convert_idxs_to_ptrs +DEBUG: begin components::convert_idxs_to_ptrs +DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy() +DEBUG: begin dense::copy +DEBUG: end dense::copy +DEBUG: end copy() +DEBUG: begin repetition +DEBUG: begin apply() +DEBUG: begin dense::row_gather +DEBUG: end dense::row_gather +DEBUG: begin apply() +DEBUG: begin csr::spmv +DEBUG: end csr::spmv +DEBUG: end apply() +DEBUG: begin advanced_apply() +DEBUG: begin csr::advanced_spmv +DEBUG: end csr::advanced_spmv +DEBUG: end advanced_apply() +DEBUG: end apply() +DEBUG: end repetition +DEBUG: end csr-csr +DEBUG: end stencil(100, 7pt, stencil) diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout new file mode 100644 index 00000000000..bbef87d0b89 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -0,0 +1,21 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 6420, + "time": 1.0, + "repetitions": 1, + "completed": true + } + }, + "rows": 81, + "cols": 81, + "nonzeros": 144, + "optimal": { + "spmv": "csr-csr" + } + } +] diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr new file mode 100644 index 00000000000..7d59e4f4190 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.simple.stderr @@ -0,0 +1,10 @@ +This is Ginkgo 1.7.0 (master) + running with core module 1.7.0 (master) +Running on reference(0) +Running with 2 warm iterations and 10 running iterations +The random seed for right hand sides is 42 +The formats are [csr]x[csr] +The number of right hand sides is 1 +Running test case stencil(100, 7pt, stencil) +Matrix is of size (81, 81), 144 + Running spmv: csr-csr diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout new file 
mode 100644 index 00000000000..77bdef168d3 --- /dev/null +++ b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -0,0 +1,22 @@ +[ + { + "size": 100, + "stencil": "7pt", + "comm_pattern": "stencil", + "spmv": { + "csr-csr": { + "storage": 6420, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 81, + "cols": 81, + "nonzeros": 144, + "optimal": { + "spmv": "csr-csr" + } + } +] diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py new file mode 100755 index 00000000000..5dd1d840a4e --- /dev/null +++ b/benchmark/test/solver.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", +) + +# stdin +test_framework.compare_output( + [], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', +) + +# input file +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.solver.json")], + expected_stdout="solver.simple.stdout", + expected_stderr="solver.simple.stderr", +) + +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="solver.matrix.stdout", + expected_stderr="solver.matrix.stderr", +) + +# profiler annotations +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="solver.profile.stdout", + expected_stderr="solver.profile.stderr", +) + +# reordering +test_framework.compare_output( + ["-reorder", "amd"], + expected_stdout="solver.reordered.stdout", + expected_stderr="solver.reordered.stderr", + stdin='[{"size": 100, "stencil": 
"7pt", "optimal": {"spmv": "csr"}}]', +) diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py new file mode 100644 index 00000000000..54bbb030077 --- /dev/null +++ b/benchmark/test/solver_distributed.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', + ], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", +) + +# stdin +test_framework.compare_output( + [], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', +) + +# input file +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.distributed_solver.json")], + expected_stdout="distributed_solver.simple.stdout", + expected_stderr="distributed_solver.simple.stderr", +) + +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="distributed_solver.matrix.stdout", + expected_stderr="distributed_solver.matrix.stderr", +) + +# profiler annotations +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="distributed_solver.profile.stdout", + expected_stderr="distributed_solver.profile.stderr", +) diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py new file mode 100755 index 00000000000..8e6cda3c9bd --- /dev/null +++ b/benchmark/test/sparse_blas.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + 
["-operations", "transpose", "-input", + '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", +) + +# stdin +test_framework.compare_output( + ["-operations", "transpose"], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) + +# input file +test_framework.compare_output( + [ + "-operations", + "transpose", + "-input", + str(test_framework.sourcepath / "input.mtx.json"), + ], + expected_stdout="sparse_blas.simple.stdout", + expected_stderr="sparse_blas.simple.stderr", +) + +# input matrix file +test_framework.compare_output( + [ + "-operations", + "transpose", + "-input_matrix", + str(test_framework.matrixpath), + ], + expected_stdout="sparse_blas.matrix.stdout", + expected_stderr="sparse_blas.matrix.stderr", +) + +# profiler annotations (transpose has the smallest number of allocations) +test_framework.compare_output( + [ + "-operations", + "transpose", + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="sparse_blas.profile.stdout", + expected_stderr="sparse_blas.profile.stderr", +) + +# reordering +test_framework.compare_output( + ["-operations", "symbolic_cholesky", "-reorder", "amd"], + expected_stdout="sparse_blas.reordered.stdout", + expected_stderr="sparse_blas.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py new file mode 100755 index 00000000000..f6f4a4b5c39 --- /dev/null +++ b/benchmark/test/spmv.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output( + ["-input", '[{"size": 100, "stencil": "7pt"}]'], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", +) + +# stdin +test_framework.compare_output( + [], + 
expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) + +# input file +test_framework.compare_output( + ["-input", str(test_framework.sourcepath / "input.mtx.json")], + expected_stdout="spmv.simple.stdout", + expected_stderr="spmv.simple.stderr", +) + +# input matrix file +test_framework.compare_output( + ["-input_matrix", str(test_framework.matrixpath)], + expected_stdout="spmv.matrix.stdout", + expected_stderr="spmv.matrix.stderr", +) + +# profiler annotations +test_framework.compare_output( + [ + "-input", + '[{"size": 100, "stencil": "7pt"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="spmv.profile.stdout", + expected_stderr="spmv.profile.stderr", +) + +# stdin +test_framework.compare_output( + ["-reorder", "amd"], + expected_stdout="spmv.reordered.stdout", + expected_stderr="spmv.reordered.stderr", + stdin='[{"size": 100, "stencil": "7pt"}]', +) diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py new file mode 100644 index 00000000000..356db48459e --- /dev/null +++ b/benchmark/test/spmv_distributed.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +import test_framework + +# check that all input modes work: +# parameter +test_framework.compare_output_distributed( + ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, +) + +# stdin +test_framework.compare_output_distributed( + [], + expected_stdout="spmv_distributed.simple.stdout", + expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, + stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', +) + +# input file +test_framework.compare_output_distributed( + ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")], + expected_stdout="spmv_distributed.simple.stdout", + 
expected_stderr="spmv_distributed.simple.stderr", + num_procs=3, +) + +# profiler annotations +test_framework.compare_output_distributed( + [ + "-input", + '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]', + "-profile", + "-profiler_hook", + "debug", + ], + expected_stdout="spmv_distributed.profile.stdout", + expected_stderr="spmv_distributed.profile.stderr", + num_procs=3, +) diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in new file mode 100644 index 00000000000..62c4293e7c0 --- /dev/null +++ b/benchmark/test/test_framework.py.in @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +import subprocess +import difflib +import json +from typing import List, Tuple +import re +import pathlib +import sys + +sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@") +binpath = pathlib.Path("@PROJECT_BINARY_DIR@") +matrixpath = pathlib.Path("@PROJECT_BINARY_DIR@/matrices/test/ani1.mtx") +generate = False +if len(sys.argv) > 2 and sys.argv[2] == "--generate": + generate = True +denumberify_paths = [ + "time", + "bandwidth", + "flops", + "components", + "residual_norm", + "rhs_norm", + "max_relative_norm2", +] +detypenameify_key_starts = [ + "generate(", "apply(", "advanced_apply(", "copy(", "check("] +empty_string_paths = ["filename"] +empty_array_paths = [ + "recurrent_residuals", + "true_residuals", + "implicit_residuals", + "iteration_timestamps", +] + + +def sanitize_json_key(key: str): + """Applies sanitation to a single key. + + Strings that start with a name in detypenameify_key_starts will be truncated + """ + + for start in detypenameify_key_starts: + if key.startswith(start): + return start + ")" + return key + + +def sanitize_json_value(key: str, value, sanitize_all: bool): + """Applies sanitation to a single key-value pair. 
+ + Strings with a key in empty_string_paths will be emptied + Numbers with a key in denumberify_paths will be set to 1.0 + """ + + if key in empty_string_paths and isinstance(value, str): + return "" + if key in denumberify_paths and isinstance(value, float): + return 1.0 + if key in denumberify_paths and isinstance(value, dict): + return sanitize_json(value, True) + if key in empty_array_paths and isinstance(value, list): + return [] + return sanitize_json(value, sanitize_all) + + +def sanitize_json(parsed_input, sanitize_all: bool = False): + """Removes non-deterministic parts of a parsed JSON input. + + If sanitize_all is set to True, all nested float values will be set to 0. + Otherwise, only JSON object entries will be sanitized + using sanitize_json_key_value. + """ + + if isinstance(parsed_input, dict): + return { + sanitize_json_key(key): sanitize_json_value(key, value, sanitize_all) + for key, value in parsed_input.items() + } + elif isinstance(parsed_input, list): + return [sanitize_json(e, sanitize_all) for e in parsed_input] + elif sanitize_all and isinstance(parsed_input, float): + return 1.0 + else: + return parsed_input + + +def sanitize_json_text(input: str) -> List[str]: + """Sanitizes the given input JSON string. + + The JSON values will be parsed and sanitized through sanitize_json(...) + and pretty-printed to replace the original JSON input. + """ + + result = json.dumps(sanitize_json(json.loads(input)), indent=4) + # json.dumps doesn't add a trailing newline + return result.splitlines() + [""] + + +def sanitize_text( + input: str, + ignore_patterns: List[str], + replace_patterns: List[Tuple[str, str]], +) -> List[str]: + """Sanitizes the given input string. + + Every input line matching an entry from ignore_patterns will be removed. + Every line matching the first string in an entry from replace_patterns + will be replaced by the second string. + The output is guaranteed to end with an empty line. 
+ """ + + lines = input.splitlines() + output_lines = [] + patterns = [re.compile(pattern) for pattern in ignore_patterns] + for line in lines: + for pattern, replacement in replace_patterns: + line = re.sub(pattern, replacement, line) + keep = True + for compiled_pattern in patterns: + if re.match(compiled_pattern, line): + keep = False + break + if keep: + output_lines.append(line) + if len(output_lines) == 0 or output_lines[-1] != "": + output_lines.append("") + return output_lines + + +def compare_output_impl( + args: List[str], + expected_stdout: str, + expected_stderr: str, + stdin: str, + launcher_flags: List[str], +): + args = [sys.argv[1]] + args + expected_stdout = str(sourcepath / "reference" / expected_stdout) + expected_stderr = str(sourcepath / "reference" / expected_stderr) + result = subprocess.run( + args=launcher_flags + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + input=bytes(stdin, "utf-8"), + ) + print( + "TEST: {}".format( + " ".join(["'{}'".format(arg) for arg in launcher_flags + args]) + ) + ) + ignore_patterns = [ + " the .* module is", # version numbers + "DEBUG: (begin|end ) (allocate|free)", # allocations + ] + typename_patterns = [ + ("(apply|generate|check|copy|move)\([^())]*\)", "\\1()"), + ("what\\(\\): .*", "what(): "), + (re.escape(str(matrixpath)), ""), + ] + if generate: + open(expected_stdout, "w").write( + "\n".join(sanitize_json_text(result.stdout.decode())) + ) + open(expected_stderr, "w").write( + "\n".join( + sanitize_text( + result.stderr.decode(), + ignore_patterns=ignore_patterns, + replace_patterns=typename_patterns, + ) + ) + ) + print("GENERATED") + return + result_stdout_processed = sanitize_json_text(result.stdout.decode()) + result_stderr_processed = sanitize_text( + result.stderr.decode(), + ignore_patterns=ignore_patterns, + replace_patterns=typename_patterns, + ) + expected_stdout_processed = sanitize_json_text( + open(expected_stdout).read()) + expected_stderr_processed = sanitize_text( + 
open(expected_stderr).read(), + ignore_patterns=ignore_patterns, + replace_patterns=typename_patterns, + ) + failed = False + if result_stdout_processed != expected_stdout_processed: + print("FAIL: stdout differs") + print( + "\n".join( + difflib.unified_diff( + expected_stdout_processed, result_stdout_processed) + ) + ) + failed = True + if result_stderr_processed != expected_stderr_processed: + print("FAIL: stderr differs") + print( + "\n".join( + difflib.unified_diff( + expected_stderr_processed, result_stderr_processed) + ) + ) + failed = True + if failed: + exit(1) + print("PASS") + + +def compare_output( + args: List[str], expected_stdout: str, expected_stderr: str, stdin: str = "" +): + compare_output_impl( + args, + expected_stdout=expected_stdout, + expected_stderr=expected_stderr, + stdin=stdin, + launcher_flags=[], + ) + + +def compare_output_distributed( + args, expected_stdout, expected_stderr, num_procs, stdin="" +): + compare_output_impl( + args, + expected_stdout, + expected_stderr, + stdin, + ["@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)], + ) diff --git a/benchmark/tools/mtx_to_binary.cpp b/benchmark/tools/mtx_to_binary.cpp index 487687ff605..1d2f4f94e02 100644 --- a/benchmark/tools/mtx_to_binary.cpp +++ b/benchmark/tools/mtx_to_binary.cpp @@ -61,8 +61,8 @@ void process(const char* input, const char* output, bool validate) } } if (validate) { - std::ifstream ois(output, std::ios_base::in | std::ios_base::binary); - auto data2 = gko::read_binary_raw(ois); + std::ifstream is(output, std::ios_base::in | std::ios_base::binary); + auto data2 = gko::read_binary_raw(is); std::cerr << "Comparing against previously read data\n"; if (data.size != data2.size) { throw GKO_STREAM_ERROR("Mismatching sizes!"); diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index dd1dda5c774..e2221614d9c 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -438,9 +438,7 @@ class CusparseCsrEx 
trans_(CUSPARSE_OPERATION_NON_TRANSPOSE), buffer_(exec) { -#ifdef ALLOWMP algmode_ = CUSPARSE_ALG_MERGE_PATH; -#endif // ALLOWMP } private: diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index deecc4b530c..6b024b16d1c 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -78,8 +78,8 @@ std::string format_description = " Irregular Sparse Matrices.\n" "csr: Compressed Sparse Row storage. Ginkgo implementation with\n" " automatic strategy.\n" - "csrc: Ginkgo's CSR implementation with automatic stategy.\n" - "csri: Ginkgo's CSR implementation with inbalance strategy.\n" + "csrc: Ginkgo's CSR implementation with automatic strategy.\n" + "csri: Ginkgo's CSR implementation with imbalance strategy.\n" "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "csrs: Ginkgo's CSR implementation with sparselib strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse\n" diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 92c3e5c9b13..6012cb6c77b 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -41,10 +41,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -52,10 +54,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include -#include -#include + + +#include #include "benchmark/utils/json.hpp" @@ -69,6 +70,10 @@ DEFINE_string(executor, "reference", "The executor used to run the benchmarks, one of: reference, " "omp, cuda, hip"); +DEFINE_string(allocator, "default", + "The allocator used in the executor. 
Only relevant for CUDA and " + "HIP executors, one of: default, async, host, unified"); + DEFINE_uint32(device_id, 0, "ID of the device where to run the code"); DEFINE_bool(overwrite, false, @@ -92,10 +97,6 @@ DEFINE_string( DEFINE_bool(detailed, true, "If set, performs several runs to obtain more detailed results"); -DEFINE_bool(keep_errors, true, - "If set, writes exception messages during the execution into the " - "JSON output"); - DEFINE_bool(nested_names, false, "If set, separately logs nested operations"); DEFINE_bool(profile, false, @@ -137,6 +138,9 @@ DEFINE_double( "is lower than or equal to 1, the timing region is always 1 repetition."); +std::unique_ptr input_stream; + + /** * Parses arguments through gflags and initialize a documentation string. * @@ -146,27 +150,32 @@ DEFINE_double( * @param format the format of the benchmark input data */ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, - std::string& format) + std::string& format, bool do_print = true) { - std::ostringstream doc; - doc << header << "Usage: " << (*argv)[0] << " [options]\n" - << format - << " The results are written on standard output, in the same " - "format,\n" - << " but with test cases extended to include an additional member " - "\n" - << " object for each benchmark run.\n" - << " If run with a --backup flag, an intermediate result is " - "written \n" - << " to a file in the same format. 
The backup file can be used as " - "\n" - << " input to this test suite, and the benchmarking will \n" - << " continue from the point where the backup file was created."; - - gflags::SetUsageMessage(doc.str()); - std::ostringstream ver; - ver << gko::version_info::get(); - gflags::SetVersionString(ver.str()); + if (do_print) { + std::ostringstream doc; + doc << header << "Usage: " << (*argv)[0] << " [options]\n" + << format + << " The results are written on standard output, in the same " + "format,\n" + << " but with test cases extended to include an additional member " + "\n" + << " object for each benchmark run.\n" + << " If run with a --backup flag, an intermediate result is " + "written \n" + << " to a file in the same format. The backup file can be used as " + "\n" + << " input to this test suite, and the benchmarking will \n" + << " continue from the point where the backup file was created."; + + gflags::SetUsageMessage(doc.str()); + std::ostringstream ver; + ver << gko::version_info::get(); + gflags::SetVersionString(ver.str()); + } else { + gflags::SetUsageMessage(""); + gflags::SetVersionString(""); + } gflags::ParseCommandLineFlags(argc, argv, true); if (FLAGS_profile) { FLAGS_repetitions = "1"; @@ -176,10 +185,18 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header, FLAGS_profiler_hook = "auto"; } } + std::string input_str(FLAGS_input); + if (!input_str.empty()) { + if (input_str.back() == ']') { + input_stream = std::make_unique(input_str); + } else { + input_stream = std::make_unique(input_str); + } + } } /** - * Print general benchmark informations using the common available parameters + * Print general benchmark information using the common available parameters * * @param extra describes benchmark specific extra parameters to output */ @@ -187,20 +204,19 @@ void print_general_information(const std::string& extra) { std::clog << gko::version_info::get() << std::endl << "Running on " << FLAGS_executor << "(" << 
FLAGS_device_id - << ")" << std::endl + << ")\n" << "Running with " << FLAGS_warmup << " warm iterations and "; if (FLAGS_repetitions == "auto") { std::clog << "adaptively determined repetititions with " << FLAGS_min_repetitions << " <= rep <= " << FLAGS_max_repetitions - << " and a minimal runtime of " << FLAGS_min_runtime << "s" - << std::endl; + << " and a minimal runtime of " << FLAGS_min_runtime << "s\n"; } else { - std::clog << FLAGS_repetitions << " running iterations" << std::endl; + std::clog << FLAGS_repetitions << " running iterations\n"; } std::clog << "The random seed for right hand sides is " << FLAGS_seed - << std::endl - << extra; + << '\n' + << extra << std::endl; } @@ -229,32 +245,19 @@ std::shared_ptr create_profiler_hook( } -struct owning_profiling_scope_guard { - std::string name; - gko::log::profiling_scope_guard guard; - - owning_profiling_scope_guard() = default; - - owning_profiling_scope_guard(std::string name_, - gko::log::ProfilerHook* profiler_hook) - : name(std::move(name_)), guard{profiler_hook->user_range(name.c_str())} - {} -}; - - struct annotate_functor { - owning_profiling_scope_guard operator()(std::string name) const + gko::log::profiling_scope_guard operator()(const char* name) const { if (profiler_hook) { - return owning_profiling_scope_guard{std::move(name), - profiler_hook.get()}; + return profiler_hook->user_range(name); } return {}; } - gko::log::profiling_scope_guard operator()(const char* name) const + gko::log::profiling_scope_guard operator()(const char* name, + bool should_annotate) const { - if (profiler_hook) { + if (profiler_hook && should_annotate) { return profiler_hook->user_range(name); } return {}; @@ -292,25 +295,15 @@ std::vector split(const std::string& s, char delimiter = ',') // returns the stream to be used as input of the application std::istream& get_input_stream() { - static auto stream = []() -> std::unique_ptr { - std::string input_str(FLAGS_input); - if (input_str.empty()) { - return nullptr; - } 
- if (input_str.back() == ']') { - return std::make_unique(input_str); - } - return std::make_unique(input_str); - }(); - if (stream) { - return *stream; + if (input_stream) { + return *input_stream; } return std::cin; } // backup generation -void backup_results(rapidjson::Document& results) +void backup_results(json& results) { static int next = 0; static auto filenames = []() -> std::array { @@ -329,6 +322,40 @@ void backup_results(rapidjson::Document& results) } +inline std::shared_ptr create_cuda_allocator() +{ + std::string flag{FLAGS_allocator}; + if (flag == "default") { + return std::make_shared(); + } else if (flag == "async") { + return std::make_shared(nullptr); + } else if (flag == "unified") { + return std::make_shared(FLAGS_device_id); + } else if (flag == "host") { + return std::make_shared(FLAGS_device_id); + } else { + throw std::runtime_error{"Unknown allocator type " + flag}; + } +} + + +inline std::shared_ptr create_hip_allocator() +{ + std::string flag{FLAGS_allocator}; + if (flag == "default") { + return std::make_shared(); + } else if (flag == "async") { + return std::make_shared(nullptr); + } else if (flag == "unified") { + return std::make_shared(FLAGS_device_id); + } else if (flag == "host") { + return std::make_shared(FLAGS_device_id); + } else { + throw std::runtime_error{"Unknown allocator type " + flag}; + } +} + + // executor mapping const std::map(bool)>> executor_factory{ @@ -337,12 +364,14 @@ const std::map(bool)>> {"cuda", [](bool) { return gko::CudaExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create(), true); + gko::OmpExecutor::create(), + create_cuda_allocator()); }}, {"hip", [](bool) { return gko::HipExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create(), true); + gko::OmpExecutor::create(), + create_hip_allocator()); }}, {"dpcpp", [](bool use_gpu_timer) { auto property = dpcpp_queue_property::in_order; @@ -368,16 +397,17 @@ const std::map compute_max_relative_norm2( } -/** - * A class for controlling the 
number warmup and timed iterations. - * - * The behavior is determined by the following flags - * - 'repetitions' switch between fixed and adaptive number of iterations - * - 'warmup' warmup iterations, applies in fixed and adaptive case - * - 'min_repetitions' minimal number of repetitions (adaptive case) - * - 'max_repetitions' maximal number of repetitions (adaptive case) - * - 'min_runtime' minimal total runtime (adaptive case) - * - 'repetition_growth_factor' controls the increase between two successive - * timings - * - * Usage: - * `IterationControl` exposes the member functions: - * - `warmup_run()`: controls run defined by `warmup` flag - * - `run(bool)`: controls run defined by all other flags - * - `get_timer()`: access to underlying timer - * The first two methods return an object that is to be used in a range-based - * for loop: - * ``` - * IterationControl ic(get_timer(...)); - * - * // warmup run always uses fixed number of iteration and does not issue - * // timings - * for(auto status: ic.warmup_run()){ - * // execute benchmark - * } - * // run may use adaptive number of iterations (depending on cmd line flag) - * // and issues timing (unless manage_timings is false) - * for(auto status: ic.run(manage_timings [default is true])){ - * if(! manage_timings) ic.get_timer->tic(); - * // execute benchmark - * if(! manage_timings) ic.get_timer->toc(); - * } - * - * ``` - * At the beginning of both methods, the timer is reset. - * The `status` object exposes the member - * - `cur_it`, containing the current iteration number, - * and the methods - * - `is_finished`, checks if the benchmark is finished, - */ -class IterationControl { - using IndexType = unsigned int; //!< to be compatible with GFLAGS type - - class run_control; - -public: - /** - * Creates an `IterationControl` object. - * - * Uses the commandline flags to setup the stopping criteria for the - * warmup and timed run. 
- * - * @param timer the timer that is to be used for the timings - */ - explicit IterationControl(const std::shared_ptr& timer) - { - status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup, - FLAGS_warmup, 0., 0}; - if (FLAGS_repetitions == "auto") { - status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions, - FLAGS_max_repetitions, FLAGS_min_runtime}; - } else { - const auto reps = - static_cast(std::stoi(FLAGS_repetitions)); - status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0}; - } - } - - IterationControl() = default; - IterationControl(const IterationControl&) = default; - IterationControl(IterationControl&&) = default; - - /** - * Creates iterable `run_control` object for the warmup run. - * - * This run uses always a fixed number of iterations. - */ - run_control warmup_run() - { - status_warmup_.cur_it = 0; - status_warmup_.managed_timer.clear(); - return run_control{&status_warmup_}; - } - - /** - * Creates iterable `run_control` object for the timed run. - * - * This run may be adaptive, depending on the commandline flags. - * - * @param manage_timings If true, the timer calls (`tic/toc`) are handled - * by the `run_control` object, otherwise they need to be executed outside - */ - run_control run(bool manage_timings = true) - { - status_run_.cur_it = 0; - status_run_.managed_timer.clear(); - status_run_.managed_timer.manage_timings = manage_timings; - return run_control{&status_run_}; - } - - std::shared_ptr get_timer() const - { - return status_run_.managed_timer.timer; - } - - /** - * Compute the time from the given statistical method - * - * @param method the statistical method. If the timer does not have the - * same iteration as the IterationControl, it can only use - * average from the IterationControl. 
- * - * @return the statistical time - */ - double compute_time(const std::string& method = "average") const - { - if (status_run_.managed_timer.timer->get_num_repetitions() == - this->get_num_repetitions()) { - return status_run_.managed_timer.compute_time(method); - } else { - assert(method == "average"); - return status_run_.managed_timer.get_total_time() / - this->get_num_repetitions(); - } - } - - IndexType get_num_repetitions() const { return status_run_.cur_it; } - -private: - struct TimerManager { - std::shared_ptr timer; - bool manage_timings = false; - - void tic() - { - if (manage_timings) { - timer->tic(); - } - } - void toc(unsigned int num = 1) - { - if (manage_timings) { - timer->toc(num); - } - } - - void clear() { timer->clear(); } - - double get_total_time() const { return timer->get_total_time(); } - - double compute_time(const std::string& method = "average") const - { - return timer->compute_time(method); - } - }; - - /** - * Stores stopping criteria of the adaptive benchmark run as well as the - * current iteration number. - */ - struct status { - TimerManager managed_timer{}; - - IndexType min_it = 0; - IndexType max_it = 0; - double max_runtime = 0.; - - IndexType cur_it = 0; - - /** - * checks if the adaptive run is complete - * - * the adaptive run is complete if: - * - the minimum number of iteration is reached - * - and either: - * - the maximum number of repetitions is reached - * - the total runtime is above the threshold - * - * @return completeness state of the adaptive run - */ - bool is_finished() const - { - return cur_it >= min_it && - (cur_it >= max_it || - managed_timer.get_total_time() >= max_runtime); - } - }; - - /** - * Iterable class managing the benchmark iteration. - * - * Has to be used in a range-based for loop. - */ - struct run_control { - struct iterator { - /** - * Increases the current iteration count and finishes timing if - * necessary. 
- * - * As `++it` is the last step of a for-loop, the managed_timer is - * stopped, if enough iterations have passed since the last timing. - * The interval between two timings is steadily increased to - * reduce the timing overhead. - */ - iterator operator++() - { - cur_info->cur_it++; - if (cur_info->cur_it >= next_timing && !stopped) { - cur_info->managed_timer.toc( - static_cast(cur_info->cur_it - start_timing)); - stopped = true; - next_timing = static_cast(std::ceil( - next_timing * FLAGS_repetition_growth_factor)); - // If repetition_growth_factor <= 1, next_timing will be - // next iteration. - if (next_timing <= cur_info->cur_it) { - next_timing = cur_info->cur_it + 1; - } - } - return *this; - } - - status operator*() const { return *cur_info; } - - /** - * Checks if the benchmark is finished and handles timing, if - * necessary. - * - * As `begin != end` is the first step in a for-loop, the - * managed_timer is started, if it was previously stopped. - * Additionally, if the benchmark is complete and the managed_timer - * is still running it is stopped. (This may occur if the maximal - * number of repetitions is surpassed) - * - * Uses only the information from the `status` object, i.e. - * the right hand side is ignored. 
- * - * @return true if benchmark is not finished, else false - */ - bool operator!=(const iterator&) - { - const bool is_finished = cur_info->is_finished(); - if (!is_finished && stopped) { - stopped = false; - cur_info->managed_timer.tic(); - start_timing = cur_info->cur_it; - } else if (is_finished && !stopped) { - cur_info->managed_timer.toc( - static_cast(cur_info->cur_it - start_timing)); - stopped = true; - } - return !is_finished; - } - - status* cur_info; - IndexType next_timing = 1; //!< next iteration to stop timing - IndexType start_timing = 0; //!< iteration for starting timing - bool stopped = true; - }; - - iterator begin() const { return iterator{info}; } - - // not used, could potentially be used in c++17 as a sentinel - iterator end() const { return iterator{}; } - - status* info; - }; - - status status_warmup_; - status status_run_; -}; - - #endif // GKO_BENCHMARK_UTILS_GENERAL_HPP_ diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp new file mode 100644 index 00000000000..914684ce6e4 --- /dev/null +++ b/benchmark/utils/general_matrix.hpp @@ -0,0 +1,160 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ +#define GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ + + +#include + + +#include + + +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/generator.hpp" + + +std::string reordering_algorithm_desc = + "Reordering algorithm to apply to the input matrices:\n" + " none - no reordering\n" + " amd - Approximate Minimum Degree reordering algorithm\n" +#if GKO_HAVE_METIS + " nd - Nested Dissection reordering algorithm\n" +#endif + " rcm - Reverse Cuthill-McKee reordering algorithm\n" + "This is a preprocessing step whose runtime will not be included\n" + "in the measurements."; + + +DEFINE_string(input_matrix, "", + "Filename of a matrix to be used as the single input. 
Overwrites " + "the value of the -input flag"); + + +#ifndef GKO_BENCHMARK_DISTRIBUTED +DEFINE_string(reorder, "none", reordering_algorithm_desc.c_str()); +#endif + + +template +std::unique_ptr> reorder( + gko::matrix_data& data, json& test_case) +{ +#ifndef GKO_BENCHMARK_DISTRIBUTED + if (FLAGS_reorder == "none") { + return nullptr; + } + using Csr = gko::matrix::Csr; + auto ref = gko::ReferenceExecutor::create(); + auto mtx = gko::share(Csr::create(ref)); + mtx->read(data); + std::unique_ptr> perm; + if (FLAGS_reorder == "amd") { + perm = gko::experimental::reorder::Amd::build() + .on(ref) + ->generate(mtx); +#if GKO_HAVE_METIS + } else if (FLAGS_reorder == "nd") { + perm = gko::experimental::reorder::NestedDissection::build() + .on(ref) + ->generate(mtx); +#endif + } else if (FLAGS_reorder == "rcm") { + perm = gko::experimental::reorder::Rcm::build() + .on(ref) + ->generate(mtx); + } else { + throw std::runtime_error{"Unknown reordering algorithm " + + FLAGS_reorder}; + } + auto perm_arr = + gko::array::view(ref, data.size[0], perm->get_permutation()); + gko::as(mtx->permute(&perm_arr))->write(data); + test_case["reordered"] = FLAGS_reorder; + return perm; +#else + // no reordering for distributed benchmarks + return nullptr; +#endif +} + + +template +void permute(std::unique_ptr>& vec, + gko::matrix::Permutation* perm) +{ + auto perm_arr = gko::array::view( + perm->get_executor(), perm->get_size()[0], perm->get_permutation()); + vec = gko::as>(vec->row_permute(&perm_arr)); +} + + +template +void permute( + std::unique_ptr>& vec, + gko::matrix::Permutation* perm) +{} + + +/** + * @copydoc initialize_argument_parsing + * @param additional_matrix_file_json text to be appended to the + * `{"filename":"..."}` JSON object that + * will be used as input for the benchmark + * if the `-input_matrix` flag is used. 
+ */ +void initialize_argument_parsing_matrix( + int* argc, char** argv[], std::string& header, std::string& format, + std::string additional_matrix_file_json = "", bool do_print = true) +{ + initialize_argument_parsing(argc, argv, header, format, do_print); + std::string input_matrix_str{FLAGS_input_matrix}; + if (!input_matrix_str.empty()) { + if (input_stream) { + std::cerr + << "-input and -input_matrix cannot be used simultaneously\n"; + std::exit(1); + } + // create JSON for the filename via nlohmann_json to ensure the string + // is correctly escaped + auto json_template = + R"([{"filename":"")" + additional_matrix_file_json + "}]"; + auto doc = json::parse(json_template); + doc[0]["filename"] = input_matrix_str; + input_stream = std::make_unique(doc.dump()); + } +} + + +#endif // GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_ diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 076d2954980..3491fb0fc2c 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -53,28 +53,48 @@ struct DefaultSystemGenerator { using Vec = vec; static gko::matrix_data generate_matrix_data( - rapidjson::Value& config) + const json& config) { - if (config.HasMember("filename")) { - std::ifstream in(config["filename"].GetString()); - return gko::read_generic_raw(in); - } else if (config.HasMember("stencil")) { - return generate_stencil( - config["stencil"].GetString(), config["size"].GetInt64()); + gko::matrix_data data; + if (config.contains("filename")) { + std::ifstream in(config["filename"].get()); + data = gko::read_generic_raw(in); + } else if (config.contains("stencil")) { + data = generate_stencil( + config["stencil"].get(), + config["size"].get()); } else { throw std::runtime_error( "No known way to generate matrix data found."); } + data.ensure_row_major_order(); + return data; } - static std::string describe_config(rapidjson::Value& config) + static std::string get_example_config() { - if (config.HasMember("filename")) { - 
return config["filename"].GetString(); - } else if (config.HasMember("stencil")) { + return json:: + parse(R"([{"filename": "my_file.mtx"},{"filename": "my_file2.mtx"},{"size": 100, "stencil": "7pt"}])") + .dump(4); + } + + static bool validate_config(const json& test_case) + { + return ((test_case.contains("size") && test_case.contains("stencil") && + test_case["size"].is_number_integer() && + test_case["stencil"].is_string()) || + (test_case.contains("filename") && + test_case["filename"].is_string())); + } + + static std::string describe_config(const json& config) + { + if (config.contains("filename")) { + return config["filename"].get(); + } else if (config.contains("stencil")) { std::stringstream ss; - ss << "stencil(" << config["size"].GetInt64() << "," - << config["stencil"].GetString() << ")"; + ss << "stencil(" << config["size"].get() << ", " + << config["stencil"].get() << ")"; return ss.str(); } else { throw std::runtime_error("No known way to describe config."); @@ -82,30 +102,30 @@ struct DefaultSystemGenerator { } static std::shared_ptr generate_matrix_with_optimal_format( - std::shared_ptr exec, rapidjson::Value& config) + std::shared_ptr exec, json& config) { auto data = generate_matrix_data(config); return generate_matrix_with_format( - std::move(exec), config["optimal"]["spmv"].GetString(), data); + std::move(exec), config["optimal"]["spmv"].get(), + data); } static std::shared_ptr generate_matrix_with_format( std::shared_ptr exec, const std::string& format_name, const gko::matrix_data& data, - rapidjson::Value* spmv_case = nullptr, - rapidjson::MemoryPoolAllocator<>* allocator = nullptr) + json* spmv_case = nullptr) { auto storage_logger = std::make_shared(); - if (spmv_case && allocator) { + if (spmv_case) { exec->add_logger(storage_logger); } auto mtx = gko::share(::formats::matrix_factory(format_name, exec, data)); - if (spmv_case && allocator) { + if (spmv_case) { exec->remove_logger(storage_logger); - storage_logger->write_data(*spmv_case, 
*allocator); + storage_logger->write_data(*spmv_case); } return mtx; @@ -172,62 +192,79 @@ struct DistributedDefaultSystemGenerator { using Vec = dist_vec; gko::matrix_data generate_matrix_data( - rapidjson::Value& config) const + const json& config) const { - if (config.HasMember("filename")) { - std::ifstream in(config["filename"].GetString()); - return gko::read_generic_raw(in); - } else if (config.HasMember("stencil")) { + gko::matrix_data data; + if (config.contains("filename")) { + std::ifstream in(config["filename"].get()); + data = gko::read_generic_raw(in); + } else if (config.contains("stencil")) { auto local_size = static_cast( - config["size"].GetInt64() / comm.size()); - return generate_stencil( - config["stencil"].GetString(), comm, local_size, - config["comm_pattern"].GetString() == std::string("optimal")); + config["size"].get() / comm.size()); + data = generate_stencil( + config["stencil"].get(), comm, local_size, + config["comm_pattern"].get() == + std::string("optimal")); } else { throw std::runtime_error( "No known way to generate matrix data found."); } + data.ensure_row_major_order(); + return data; + } + + static std::string get_example_config() + { + return json:: + parse(R"([{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, {"filename": "my_file.mtx"}])") + .dump(4); } - std::string describe_config(rapidjson::Value& config) const + static bool validate_config(const json& test_case) { - if (config.HasMember("filename")) { - return config["filename"].GetString(); - } else if (config.HasMember("stencil")) { + return ((test_case.contains("size") && test_case.contains("stencil") && + test_case.contains("comm_pattern") && + test_case["size"].is_number_integer() && + test_case["stencil"].is_string() && + test_case["comm_pattern"].is_string()) || + (test_case.contains("filename") && + test_case["filename"].is_string())); + } + + static std::string describe_config(const json& config) + { + if (config.contains("filename")) { + return 
config["filename"].get(); + } else if (config.contains("stencil")) { std::stringstream ss; - ss << "stencil(" << config["size"].GetInt64() << "," - << config["stencil"].GetString() << "," - << config["comm_pattern"].GetString() << ")"; + ss << "stencil(" << config["size"].get() << ", " + << config["stencil"].get() << ", " + << config["comm_pattern"].get() << ")"; return ss.str(); } else { throw std::runtime_error("No known way to describe config."); } } - std::shared_ptr generate_matrix_with_optimal_format( - std::shared_ptr exec, rapidjson::Value& config) const - { - auto data = generate_matrix_data(config); - return generate_matrix_with_format( - std::move(exec), config["optimal"]["spmv"].GetString(), data); - } - std::shared_ptr generate_matrix_with_format( std::shared_ptr exec, const std::string& format_name, const gko::matrix_data& data, - rapidjson::Value* spmv_case = nullptr, - rapidjson::MemoryPoolAllocator<>* allocator = nullptr) const + json* spmv_case = nullptr) const { auto part = gko::experimental::distributed:: Partition::build_from_global_size_uniform( exec, comm.size(), static_cast(data.size[0])); auto formats = split(format_name, '-'); + if (formats.size() != 2) { + throw std::runtime_error{"Invalid distributed format specifier " + + format_name}; + } auto local_mat = formats::matrix_type_factory.at(formats[0])(exec); auto non_local_mat = formats::matrix_type_factory.at(formats[1])(exec); auto storage_logger = std::make_shared(); - if (spmv_case && allocator) { + if (spmv_case) { exec->add_logger(storage_logger); } @@ -235,9 +272,9 @@ struct DistributedDefaultSystemGenerator { exec, comm, local_mat, non_local_mat); dist_mat->read_distributed(data, part); - if (spmv_case && allocator) { + if (spmv_case) { exec->remove_logger(storage_logger); - storage_logger->write_data(comm, *spmv_case, *allocator); + storage_logger->write_data(comm, *spmv_case); } return dist_mat; diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp 
index 627dfad980e..c8664778e02 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -36,9 +36,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include - - #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp new file mode 100644 index 00000000000..295ae7870d6 --- /dev/null +++ b/benchmark/utils/iteration_control.hpp @@ -0,0 +1,326 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ +#define GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ + + +#include + + +#include +#include +#include + + +#include "benchmark/utils/general.hpp" +#include "benchmark/utils/timer.hpp" +#include "benchmark/utils/types.hpp" +#include "core/distributed/helpers.hpp" + + +/** + * A class for controlling the number warmup and timed iterations. + * + * The behavior is determined by the following flags + * - 'repetitions' switch between fixed and adaptive number of iterations + * - 'warmup' warmup iterations, applies in fixed and adaptive case + * - 'min_repetitions' minimal number of repetitions (adaptive case) + * - 'max_repetitions' maximal number of repetitions (adaptive case) + * - 'min_runtime' minimal total runtime (adaptive case) + * - 'repetition_growth_factor' controls the increase between two successive + * timings + * + * Usage: + * `IterationControl` exposes the member functions: + * - `warmup_run()`: controls run defined by `warmup` flag + * - `run(bool)`: controls run defined by all other flags + * - `get_timer()`: access to underlying timer + * The first two methods return an object that is to be used in a range-based + * for loop: + * ``` + * IterationControl ic(get_timer(...)); + * + * // warmup run always uses fixed number of iteration and does not issue + * // timings + * for(auto status: ic.warmup_run()){ + * // execute 
benchmark + * } + * // run may use adaptive number of iterations (depending on cmd line flag) + * // and issues timing (unless manage_timings is false) + * for(auto status: ic.run(manage_timings [default is true])){ + * if(! manage_timings) ic.get_timer->tic(); + * // execute benchmark + * if(! manage_timings) ic.get_timer->toc(); + * } + * + * ``` + * At the beginning of both methods, the timer is reset. + * The `status` object exposes the member + * - `cur_it`, containing the current iteration number, + * and the methods + * - `is_finished`, checks if the benchmark is finished, + */ +class IterationControl { + using IndexType = unsigned int; //!< to be compatible with GFLAGS type + + class run_control; + +public: + /** + * Creates an `IterationControl` object. + * + * Uses the commandline flags to setup the stopping criteria for the + * warmup and timed run. + * + * @param timer the timer that is to be used for the timings + */ + explicit IterationControl(const std::shared_ptr& timer) + { + status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup, + FLAGS_warmup, 0., 0}; + if (FLAGS_repetitions == "auto") { + status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions, + FLAGS_max_repetitions, FLAGS_min_runtime}; + } else { + const auto reps = + static_cast(std::stoi(FLAGS_repetitions)); + status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0}; + } + } + + IterationControl() = default; + IterationControl(const IterationControl&) = default; + IterationControl(IterationControl&&) = default; + + /** + * Creates iterable `run_control` object for the warmup run. + * + * This run uses always a fixed number of iterations. + */ + run_control warmup_run() + { + status_warmup_.cur_it = 0; + status_warmup_.managed_timer.clear(); + return run_control{&status_warmup_}; + } + + /** + * Creates iterable `run_control` object for the timed run. + * + * This run may be adaptive, depending on the commandline flags. 
+ * + * @param manage_timings If true, the timer calls (`tic/toc`) are handled + * by the `run_control` object, otherwise they need to be executed outside + */ + run_control run(bool manage_timings = true) + { + status_run_.cur_it = 0; + status_run_.managed_timer.clear(); + status_run_.managed_timer.manage_timings = manage_timings; + return run_control{&status_run_}; + } + + std::shared_ptr get_timer() const + { + return status_run_.managed_timer.timer; + } + + /** + * Compute the time from the given statistical method + * + * @param method the statistical method. If the timer does not have the + * same iteration as the IterationControl, it can only use + * average from the IterationControl. + * + * @return the statistical time + */ + double compute_time(const std::string& method = "average") const + { + if (status_run_.managed_timer.timer->get_num_repetitions() == + this->get_num_repetitions()) { + return status_run_.managed_timer.compute_time(method); + } else { + assert(method == "average"); + return status_run_.managed_timer.get_total_time() / + this->get_num_repetitions(); + } + } + + IndexType get_num_repetitions() const { return status_run_.cur_it; } + +private: + struct TimerManager { + std::shared_ptr timer; + bool manage_timings = false; + + void tic() + { + if (manage_timings) { + timer->tic(); + } + } + void toc(unsigned int num = 1) + { + if (manage_timings) { + timer->toc(num); + } + } + + void clear() { timer->clear(); } + + double get_total_time() const { return timer->get_total_time(); } + + double compute_time(const std::string& method = "average") const + { + return timer->compute_time(method); + } + }; + + /** + * Stores stopping criteria of the adaptive benchmark run as well as the + * current iteration number. 
+ */ + struct status { + TimerManager managed_timer{}; + + IndexType min_it = 0; + IndexType max_it = 0; + double max_runtime = 0.; + + IndexType cur_it = 0; + + /** + * checks if the adaptive run is complete + * + * the adaptive run is complete if: + * - the minimum number of iteration is reached + * - and either: + * - the maximum number of repetitions is reached + * - the total runtime is above the threshold + * + * @return completeness state of the adaptive run + */ + bool is_finished() const + { + return cur_it >= min_it && + (cur_it >= max_it || + managed_timer.get_total_time() >= max_runtime); + } + }; + + /** + * Iterable class managing the benchmark iteration. + * + * Has to be used in a range-based for loop. + */ + struct run_control { + struct iterator { + /** + * Increases the current iteration count and finishes timing if + * necessary. + * + * As `++it` is the last step of a for-loop, the managed_timer is + * stopped, if enough iterations have passed since the last timing. + * The interval between two timings is steadily increased to + * reduce the timing overhead. + */ + iterator operator++() + { + cur_info->cur_it++; + if (cur_info->cur_it >= next_timing && !stopped) { + cur_info->managed_timer.toc( + static_cast(cur_info->cur_it - start_timing)); + stopped = true; + next_timing = static_cast(std::ceil( + next_timing * FLAGS_repetition_growth_factor)); + // If repetition_growth_factor <= 1, next_timing will be + // next iteration. + if (next_timing <= cur_info->cur_it) { + next_timing = cur_info->cur_it + 1; + } + } + return *this; + } + + status operator*() const { return *cur_info; } + + /** + * Checks if the benchmark is finished and handles timing, if + * necessary. + * + * As `begin != end` is the first step in a for-loop, the + * managed_timer is started, if it was previously stopped. + * Additionally, if the benchmark is complete and the managed_timer + * is still running it is stopped. 
(This may occur if the maximal + * number of repetitions is surpassed) + * + * Uses only the information from the `status` object, i.e. + * the right hand side is ignored. + * + * @return true if benchmark is not finished, else false + */ + bool operator!=(const iterator&) + { + const bool is_finished = cur_info->is_finished(); + if (!is_finished && stopped) { + stopped = false; + cur_info->managed_timer.tic(); + start_timing = cur_info->cur_it; + } else if (is_finished && !stopped) { + cur_info->managed_timer.toc( + static_cast(cur_info->cur_it - start_timing)); + stopped = true; + } + return !is_finished; + } + + status* cur_info; + IndexType next_timing = 1; //!< next iteration to stop timing + IndexType start_timing = 0; //!< iteration for starting timing + bool stopped = true; + }; + + iterator begin() const { return iterator{info}; } + + // not used, could potentially be used in c++17 as a sentinel + iterator end() const { return iterator{}; } + + status* info; + }; + + status status_warmup_; + status status_run_; +}; + + +#endif // GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_ diff --git a/benchmark/utils/json.hpp b/benchmark/utils/json.hpp index b0cd384cae5..684db0229aa 100644 --- a/benchmark/utils/json.hpp +++ b/benchmark/utils/json.hpp @@ -34,69 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_BENCHMARK_UTILS_JSON_HPP_ -#include +#include -#include - - -#include -#include -#include -#include - - -// helper for setting rapidjson object members -template -std::enable_if_t< - !std::is_same::type, gko::size_type>::value, void> -add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value, - Allocator&& allocator) -{ - if (object.HasMember(name)) { - object[name] = std::forward(value); - } else { - auto n = rapidjson::Value(name, allocator); - object.AddMember(n, std::forward(value), allocator); - } -} - - -/** - @internal This is required to fix some MacOS problems (and possibly other - compilers). 
There is no explicit RapidJSON constructor for `std::size_t` so a - conversion to a known constructor is required to solve any ambiguity. See the - last comments of https://github.com/ginkgo-project/ginkgo/issues/270. - */ -template -std::enable_if_t< - std::is_same::type, gko::size_type>::value, void> -add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value, - Allocator&& allocator) -{ - if (object.HasMember(name)) { - object[name] = - std::forward(static_cast(value)); - } else { - auto n = rapidjson::Value(name, allocator); - object.AddMember( - n, std::forward(static_cast(value)), - allocator); - } -} - - -// helper for writing out rapidjson Values -inline std::ostream& operator<<(std::ostream& os, const rapidjson::Value& value) -{ - rapidjson::OStreamWrapper jos(os); - rapidjson::PrettyWriter, - rapidjson::UTF8<>, rapidjson::CrtAllocator, - rapidjson::kWriteNanAndInfFlag> - writer(jos); - value.Accept(writer); - return os; -} +using json = nlohmann::ordered_json; #endif // GKO_BENCHMARK_UTILS_JSON_HPP_ diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp index e3e6228604e..89ea6108eda 100644 --- a/benchmark/utils/loggers.hpp +++ b/benchmark/utils/loggers.hpp @@ -50,10 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter, gko::log::ProfilerHook::NestedSummaryWriter { - JsonSummaryWriter(rapidjson::Value& object, - rapidjson::MemoryPoolAllocator<>& alloc, - gko::uint32 repetitions) - : object{&object}, alloc{&alloc}, repetitions{repetitions} + JsonSummaryWriter(json& object, gko::uint32 repetitions) + : object{&object}, repetitions{repetitions} {} void write( @@ -62,13 +60,11 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter, { for (const auto& entry : entries) { if (entry.name != "total") { - add_or_set_member(*object, entry.name.c_str(), - entry.exclusive.count() * 1e-9 / repetitions, - *alloc); + (*object)[entry.name] = + entry.exclusive.count() * 1e-9 / repetitions; } } - add_or_set_member(*object, "overhead", - overhead.count() * 1e-9 / repetitions, *alloc); + (*object)["overhead"] = overhead.count() * 1e-9 / repetitions; } void write_nested(const gko::log::ProfilerHook::nested_summary_entry& root, @@ -84,27 +80,24 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter, visit(visit, child, new_prefix); exclusive -= child.elapsed; } - add_or_set_member(*object, (prefix + node.name).c_str(), - exclusive.count() * 1e-9 / repetitions, *alloc); + (*object)[prefix + node.name] = + exclusive.count() * 1e-9 / repetitions; }; // we don't need to annotate the total for (const auto& child : root.children) { visit(visit, child, ""); } - add_or_set_member(*object, "overhead", - overhead.count() * 1e-9 / repetitions, *alloc); + (*object)["overhead"] = overhead.count() * 1e-9 / repetitions; } - rapidjson::Value* object; - rapidjson::MemoryPoolAllocator<>* alloc; + json* object; gko::uint32 repetitions; }; inline std::shared_ptr create_operations_logger( bool gpu_timer, bool nested, std::shared_ptr exec, - rapidjson::Value& object, rapidjson::MemoryPoolAllocator<>& alloc, - gko::uint32 repetitions) + json& object, gko::uint32 repetitions) { std::shared_ptr timer; if (gpu_timer) { @@ -114,12 +107,10 
@@ inline std::shared_ptr create_operations_logger( } if (nested) { return gko::log::ProfilerHook::create_nested_summary( - timer, - std::make_unique(object, alloc, repetitions)); + timer, std::make_unique(object, repetitions)); } else { return gko::log::ProfilerHook::create_summary( - timer, - std::make_unique(object, alloc, repetitions)); + timer, std::make_unique(object, repetitions)); } } @@ -140,21 +131,18 @@ struct StorageLogger : gko::log::Logger { storage[location] = 0; } - void write_data(rapidjson::Value& output, - rapidjson::MemoryPoolAllocator<>& allocator) + void write_data(json& output) { const std::lock_guard lock(mutex); gko::size_type total{}; for (const auto& e : storage) { total += e.second; } - add_or_set_member(output, "storage", total, allocator); + output["storage"] = total; } #if GINKGO_BUILD_MPI - void write_data(gko::experimental::mpi::communicator comm, - rapidjson::Value& output, - rapidjson::MemoryPoolAllocator<>& allocator) + void write_data(gko::experimental::mpi::communicator comm, json& output) { const std::lock_guard lock(mutex); gko::size_type total{}; @@ -166,7 +154,7 @@ struct StorageLogger : gko::log::Logger { ? 
static_cast(MPI_IN_PLACE) : &total, &total, 1, MPI_SUM, 0); - add_or_set_member(output, "storage", total, allocator); + output["storage"] = total; } #endif @@ -191,17 +179,16 @@ struct ResidualLogger : gko::log::Logger { const gko::array* status, bool all_stopped) const override { - timestamps.PushBack(std::chrono::duration( - std::chrono::steady_clock::now() - start) - .count(), - alloc); + timestamps->push_back(std::chrono::duration( + std::chrono::steady_clock::now() - start) + .count()); if (residual_norm) { - rec_res_norms.PushBack( - get_norm(gko::as>(residual_norm)), alloc); + rec_res_norms->push_back( + get_norm(gko::as>(residual_norm))); } else { gko::detail::vector_dispatch( residual, [&](const auto v_residual) { - rec_res_norms.PushBack(compute_norm2(v_residual), alloc); + rec_res_norms->push_back(compute_norm2(v_residual)); }); } if (solution) { @@ -209,42 +196,34 @@ struct ResidualLogger : gko::log::Logger { rc_vtype>(solution, [&](auto v_solution) { using concrete_type = std::remove_pointer_t>; - true_res_norms.PushBack( - compute_residual_norm(matrix, gko::as(b), - v_solution), - alloc); + true_res_norms->push_back(compute_residual_norm( + matrix, gko::as(b), v_solution)); }); } else { - true_res_norms.PushBack(-1.0, alloc); + true_res_norms->push_back(-1.0); } if (implicit_sq_residual_norm) { - implicit_res_norms.PushBack( - std::sqrt(get_norm( - gko::as>(implicit_sq_residual_norm))), - alloc); + implicit_res_norms->push_back(std::sqrt( + get_norm(gko::as>(implicit_sq_residual_norm)))); has_implicit_res_norm = true; } else { - implicit_res_norms.PushBack(-1.0, alloc); + implicit_res_norms->push_back(-1.0); } } ResidualLogger(gko::ptr_param matrix, - gko::ptr_param b, - rapidjson::Value& rec_res_norms, - rapidjson::Value& true_res_norms, - rapidjson::Value& implicit_res_norms, - rapidjson::Value& timestamps, - rapidjson::MemoryPoolAllocator<>& alloc) + gko::ptr_param b, json& rec_res_norms, + json& true_res_norms, json& implicit_res_norms, + json& 
timestamps) : gko::log::Logger(gko::log::Logger::iteration_complete_mask), matrix{matrix.get()}, b{b.get()}, start{std::chrono::steady_clock::now()}, - rec_res_norms{rec_res_norms}, - true_res_norms{true_res_norms}, + rec_res_norms{&rec_res_norms}, + true_res_norms{&true_res_norms}, has_implicit_res_norm{}, - implicit_res_norms{implicit_res_norms}, - timestamps{timestamps}, - alloc{alloc} + implicit_res_norms{&implicit_res_norms}, + timestamps{×tamps} {} bool has_implicit_res_norms() const { return has_implicit_res_norm; } @@ -253,12 +232,11 @@ struct ResidualLogger : gko::log::Logger { const gko::LinOp* matrix; const gko::LinOp* b; std::chrono::steady_clock::time_point start; - rapidjson::Value& rec_res_norms; - rapidjson::Value& true_res_norms; + json* rec_res_norms; + json* true_res_norms; mutable bool has_implicit_res_norm; - rapidjson::Value& implicit_res_norms; - rapidjson::Value& timestamps; - rapidjson::MemoryPoolAllocator<>& alloc; + json* implicit_res_norms; + json* timestamps; }; @@ -279,11 +257,7 @@ struct IterationLogger : gko::log::Logger { : gko::log::Logger(gko::log::Logger::iteration_complete_mask) {} - void write_data(rapidjson::Value& output, - rapidjson::MemoryPoolAllocator<>& allocator) - { - add_or_set_member(output, "iterations", this->num_iters, allocator); - } + void write_data(json& output) { output["iterations"] = this->num_iters; } private: mutable gko::size_type num_iters{0}; diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp index 168e650234d..d947b8de38e 100644 --- a/benchmark/utils/overhead_linop.hpp +++ b/benchmark/utils/overhead_linop.hpp @@ -104,27 +104,12 @@ class Overhead : public EnableLinOp>, friend class EnablePolymorphicObject; public: - GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) - { - /** - * Criterion factories. - */ - std::vector> - GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr); - - /** - * Preconditioner factory. 
- */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - preconditioner, nullptr); - - /** - * Already generated preconditioner. If one is provided, the factory - * `preconditioner` will be ignored. - */ - std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( - generated_preconditioner, nullptr); - }; + class Factory; + + struct parameters_type + : public gko::solver:: + enable_preconditioned_iterative_solver_factory_parameters< + parameters_type, Factory> {}; GKO_ENABLE_LIN_OP_FACTORY(Overhead, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp index 466d5f2d3f9..3450eb71b44 100644 --- a/benchmark/utils/preconditioners.hpp +++ b/benchmark/utils/preconditioners.hpp @@ -122,7 +122,7 @@ const std::map( .on(exec)); return gko::preconditioner::Ic, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"parict", @@ -137,7 +137,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"parilu", @@ -150,7 +150,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"parilut", @@ -165,7 +165,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"ic", @@ -174,7 +174,7 @@ const std::map( gko::factorization::Ic::build().on(exec)); return gko::preconditioner::Ic, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"ilu", @@ -184,7 +184,7 @@ const std::map( return gko::preconditioner:: Ilu, gko::solver::UpperTrs, false, itype>::build() - .with_factorization_factory(fact) + .with_factorization(fact) .on(exec); }}, {"paric-isai", @@ -201,8 +201,8 
@@ const std::map( return gko::preconditioner::Ic< gko::preconditioner::LowerIsai, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) + .with_factorization(fact) + .with_l_solver(lisai) .on(exec); }}, {"parict-isai", @@ -221,8 +221,8 @@ const std::map( return gko::preconditioner::Ic< gko::preconditioner::LowerIsai, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) + .with_factorization(fact) + .with_l_solver(lisai) .on(exec); }}, {"parilu-isai", @@ -244,9 +244,9 @@ const std::map( gko::preconditioner::LowerIsai, gko::preconditioner::UpperIsai, false, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) - .with_u_solver_factory(uisai) + .with_factorization(fact) + .with_l_solver(lisai) + .with_u_solver(uisai) .on(exec); }}, {"parilut-isai", @@ -270,9 +270,9 @@ const std::map( gko::preconditioner::LowerIsai, gko::preconditioner::UpperIsai, false, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) - .with_u_solver_factory(uisai) + .with_factorization(fact) + .with_l_solver(lisai) + .with_u_solver(uisai) .on(exec); }}, {"ic-isai", @@ -286,8 +286,8 @@ const std::map( return gko::preconditioner::Ic< gko::preconditioner::LowerIsai, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) + .with_factorization(fact) + .with_l_solver(lisai) .on(exec); }}, {"ilu-isai", @@ -306,9 +306,9 @@ const std::map( gko::preconditioner::LowerIsai, gko::preconditioner::UpperIsai, false, itype>::build() - .with_factorization_factory(fact) - .with_l_solver_factory(lisai) - .with_u_solver_factory(uisai) + .with_factorization(fact) + .with_l_solver(lisai) + .with_u_solver(uisai) .on(exec); }}, {"general-isai", @@ -326,8 +326,7 @@ const std::map( {"overhead", [](std::shared_ptr exec) { return gko::Overhead::build() .with_criteria(gko::stop::ResidualNorm::build() - .with_reduction_factor(rc_etype{}) - .on(exec)) + 
.with_reduction_factor(rc_etype{})) .on(exec); }}}; diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp new file mode 100644 index 00000000000..264dc3965db --- /dev/null +++ b/benchmark/utils/runner.hpp @@ -0,0 +1,203 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_BENCHMARK_UTILS_RUNNER_HPP_ +#define GKO_BENCHMARK_UTILS_RUNNER_HPP_ + + +#include + + +#include +#include +#include + + +#include "benchmark/utils/general.hpp" + + +std::shared_ptr create_profiler_hook( + std::shared_ptr exec, bool do_print = true) +{ + using gko::log::ProfilerHook; + std::map()>> + hook_map{ + {"none", [] { return std::shared_ptr{}; }}, + {"auto", [&] { return ProfilerHook::create_for_executor(exec); }}, + {"nvtx", [] { return ProfilerHook::create_nvtx(); }}, + {"roctx", [] { return ProfilerHook::create_roctx(); }}, + {"tau", [] { return ProfilerHook::create_tau(); }}, + {"vtune", [] { return ProfilerHook::create_vtune(); }}, + {"debug", [do_print] { + return ProfilerHook::create_custom( + [do_print](const char* name, + gko::log::profile_event_category) { + if (do_print) { + std::clog << "DEBUG: begin " << name << '\n'; + } + }, + [do_print](const char* name, + gko::log::profile_event_category) { + if (do_print) { + std::clog << "DEBUG: end " << name << '\n'; + } + }); + }}}; + return hook_map.at(FLAGS_profiler_hook)(); +} + + +template +struct Benchmark { + /** The name to be used in the JSON output. */ + virtual const std::string& get_name() const = 0; + + /** The operations to loop over for each test case. */ + virtual const std::vector& get_operations() const = 0; + + /** Should we write logging output? */ + virtual bool should_print() const = 0; + + /** Example JSON input */ + virtual std::string get_example_config() const = 0; + + /** Is the input test case in the correct format? 
*/ + virtual bool validate_config(const json& value) const = 0; + + /** Textual representation of the test case for profiler annotation */ + virtual std::string describe_config(const json& test_case) const = 0; + + /** Sets up shared state and test case info */ + virtual State setup(std::shared_ptr exec, + json& test_case) const = 0; + + /** Runs a single operation of the benchmark */ + virtual void run(std::shared_ptr exec, + std::shared_ptr timer, annotate_functor annotate, + State& state, const std::string& operation, + json& operation_case) const = 0; + + /** Post-process test case info. */ + virtual void postprocess(json& test_case) const {} +}; + + +template +void run_test_cases(const Benchmark& benchmark, + std::shared_ptr exec, + std::shared_ptr timer, json& test_cases) +{ + if (!test_cases.is_array()) { + if (benchmark.should_print()) { + std::cerr + << "Input has to be a JSON array of benchmark configurations:\n" + << benchmark.get_example_config() << std::endl; + } + std::exit(1); + } + for (const auto& test_case : test_cases) { + if (!test_case.is_object() || !benchmark.validate_config(test_case)) { + if (benchmark.should_print()) { + std::cerr << "Invalid test case:\n" + << std::setw(4) << test_case << "\nInput format:\n" + << benchmark.get_example_config() << std::endl; + } + std::exit(2); + } + } + + auto profiler_hook = create_profiler_hook(exec, benchmark.should_print()); + if (profiler_hook) { + exec->add_logger(profiler_hook); + } + auto annotate = annotate_functor(profiler_hook); + + for (auto& test_case : test_cases) { + try { + // set up benchmark + if (!test_case.contains(benchmark.get_name())) { + test_case[benchmark.get_name()] = json::object(); + } + auto test_case_desc = benchmark.describe_config(test_case); + if (benchmark.should_print()) { + std::clog << "Running test case " << test_case_desc + << std::endl; + } + auto test_case_state = benchmark.setup(exec, test_case); + auto test_case_range = annotate(test_case_desc.c_str()); + auto& 
benchmark_case = test_case[benchmark.get_name()]; + for (const auto& operation_name : benchmark.get_operations()) { + if (benchmark_case.contains(operation_name) && + !FLAGS_overwrite) { + continue; + } + benchmark_case[operation_name] = json::object(); + if (benchmark.should_print()) { + std::clog << "\tRunning " << benchmark.get_name() << ": " + << operation_name << std::endl; + } + auto& operation_case = benchmark_case[operation_name]; + try { + auto operation_range = annotate(operation_name.c_str()); + benchmark.run(exec, timer, annotate, test_case_state, + operation_name, operation_case); + operation_case["completed"] = true; + } catch (const std::exception& e) { + operation_case["completed"] = false; + operation_case["error_type"] = + gko::name_demangling::get_dynamic_type(e); + operation_case["error"] = e.what(); + std::cerr << "Error when processing test case\n" + << test_case_desc << "\n" + << "what(): " << e.what() << std::endl; + } + + if (benchmark.should_print()) { + backup_results(test_cases); + } + } + benchmark.postprocess(test_case); + } catch (const std::exception& e) { + std::cerr << "Error setting up benchmark, what(): " << e.what() + << std::endl; + test_case["error_type"] = gko::name_demangling::get_dynamic_type(e); + test_case["error"] = e.what(); + } + } + + if (profiler_hook) { + exec->remove_logger(profiler_hook); + } +} + + +#endif // GKO_BENCHMARK_UTILS_RUNNER_HPP_ diff --git a/benchmark/utils/timer_impl.hpp b/benchmark/utils/timer_impl.hpp index 888cb496248..a6b9d968713 100644 --- a/benchmark/utils/timer_impl.hpp +++ b/benchmark/utils/timer_impl.hpp @@ -111,7 +111,8 @@ class Timer { return copy.back(); } else if (method == "median") { auto mid = copy.size() / 2; - if (copy.size() % 2) { + if (copy.size() % 2 == 0) { + // contains even elements return (copy.at(mid) + copy.at(mid - 1)) / 2; } else { return copy.at(mid); diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake index 61d53b0442a..81ff86625d1 100644 --- 
a/cmake/CTestScript.cmake +++ b/cmake/CTestScript.cmake @@ -4,7 +4,7 @@ # # Runs our tests through CTest, with support for Coverage or memory checking. # -# This script provides a full CTest run whith result submission to Ginkgo's +# This script provides a full CTest run with result submission to Ginkgo's # CDash dashboard. The supported runs are: # + With or without coverage, requires the gcov tool. # + With or without address sanitizers. diff --git a/cmake/DownloadNonCMakeCMakeLists.txt.in b/cmake/DownloadNonCMakeCMakeLists.txt.in deleted file mode 100644 index c2d848e8d49..00000000000 --- a/cmake/DownloadNonCMakeCMakeLists.txt.in +++ /dev/null @@ -1,14 +0,0 @@ -cmake_minimum_required(VERSION 3.9) -project(${package_name}) - -include(ExternalProject) -ExternalProject_Add(${package_name} - URL "${package_url}" - URL_HASH "${package_hash}" - DOWNLOAD_NO_PROGRESS TRUE - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" - CONFIGURE_COMMAND "${config_command}" "${ARGN}" - INSTALL_COMMAND "" - UPDATE_DISCONNECTED ${GINKGO_SKIP_DEPENDENCY_UPDATE} - ) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index a2857310183..41f3b8f2879 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -37,7 +37,7 @@ set(GINKGO_BUILD_OMP @GINKGO_BUILD_OMP@) set(GINKGO_BUILD_CUDA @GINKGO_BUILD_CUDA@) set(GINKGO_BUILD_HIP @GINKGO_BUILD_HIP@) set(GINKGO_BUILD_MPI @GINKGO_BUILD_MPI@) -set(GINKGO_BUILD_DPCPP @GINKGO_BUILD_DPCPP@) +set(GINKGO_BUILD_SYCL @GINKGO_BUILD_SYCL@) set(GINKGO_DEVEL_TOOLS @GINKGO_DEVEL_TOOLS@) set(GINKGO_BUILD_TESTS @GINKGO_BUILD_TESTS@) @@ -61,27 +61,25 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@) set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) -set(GINKGO_CUDA_ARCHITECTURES @GINKGO_CUDA_ARCHITECTURES@) -set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@) -set(GINKGO_CUDA_HOST_COMPILER @CMAKE_CUDA_HOST_COMPILER@) 
-set(GINKGO_CUDA_ARCH_FLAGS @GINKGO_CUDA_ARCH_FLAGS@) +set(GINKGO_CUDA_ARCHITECTURES "@CMAKE_CUDA_ARCHITECTURES@") +set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") -set(GINKGO_HIP_COMPILER_FLAGS @GINKGO_HIP_COMPILER_FLAGS@) -set(GINKGO_HIP_HCC_COMPILER_FLAGS @GINKGO_HIP_HCC_COMPILER_FLAGS@) -set(GINKGO_HIP_NVCC_COMPILER_FLAGS @GINKGO_HIP_NVCC_COMPILER_FLAGS@) -set(GINKGO_HIP_CLANG_COMPILER_FLAGS @GINKGO_HIP_CLANG_COMPILER_FLAGS@) +set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@") +set(GINKGO_HIP_HCC_COMPILER_FLAGS "@GINKGO_HIP_HCC_COMPILER_FLAGS@") +set(GINKGO_HIP_NVCC_COMPILER_FLAGS "@GINKGO_HIP_NVCC_COMPILER_FLAGS@") +set(GINKGO_HIP_CLANG_COMPILER_FLAGS "@GINKGO_HIP_CLANG_COMPILER_FLAGS@") set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@) -set(GINKGO_HIP_PLATFORM_AMD_REGEX @HIP_PLATFORM_AMD_REGEX@) -set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX @HIP_PLATFORM_NVIDIA_REGEX@) -set(GINKGO_HIP_AMDGPU @GINKGO_HIP_AMDGPU@) +set(GINKGO_HIP_PLATFORM_AMD_REGEX "@HIP_PLATFORM_AMD_REGEX@") +set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX "@HIP_PLATFORM_NVIDIA_REGEX@") +set(GINKGO_HIP_AMDGPU "@GINKGO_HIP_AMDGPU@") set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@) -set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@) +set(GINKGO_AMD_ARCH_FLAGS "@GINKGO_AMD_ARCH_FLAGS@") set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@) -set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@) -set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@) -set(GINKGO_DPL_ROOT @GINKGO_DPL_ROOT@) +set(GINKGO_DPCPP_FLAGS "@GINKGO_DPCPP_FLAGS@") +set(GINKGO_MKL_ROOT "@GINKGO_MKL_ROOT@") +set(GINKGO_DPL_ROOT "@GINKGO_DPL_ROOT@") set(GINKGO_BUILD_MPI @GINKGO_BUILD_MPI@) @@ -91,6 +89,14 @@ set(GINKGO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@) set(GINKGO_HAVE_ROCTX @GINKGO_HAVE_ROCTX@) +# Ginkgo compiler information +set(GINKGO_CXX_COMPILER "@CMAKE_CXX_COMPILER@") +set(GINKGO_CXX_COMPILER_SHORT "@CMAKE_CXX_COMPILER_ID@:@CMAKE_CXX_COMPILER_VERSION@") +set(GINKGO_CUDA_COMPILER 
"@CMAKE_CUDA_COMPILER@") +set(GINKGO_CUDA_COMPILER_SHORT "@CMAKE_CUDA_COMPILER_ID@:@CMAKE_CUDA_COMPILER_VERSION@") +set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") +set(GINKGO_CUDA_HOST_COMPILER_SHORT "") # dummy value to stay consistent + # Ginkgo installation configuration set(GINKGO_INSTALL_PREFIX "@PACKAGE_CMAKE_INSTALL_PREFIX@") set(GINKGO_INSTALL_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_FULL_INCLUDEDIR@") @@ -107,7 +113,6 @@ if(GINKGO_BUILD_HIP) endif() list(APPEND CMAKE_PREFIX_PATH "${GINKGO_INSTALL_PREFIX}") - set(GINKGO_INTERFACE_LINK_LIBRARIES "@GINKGO_INTERFACE_LINK_LIBRARIES@") set(GINKGO_INTERFACE_LINK_FLAGS "@GINKGO_INTERFACE_LINK_FLAGS@") set(GINKGO_INTERFACE_CXX_FLAGS "@GINKGO_INTERFACE_CXX_FLAGS@") @@ -117,11 +122,6 @@ set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(GINKGO_CUDA_COMPILER_VERSION @CMAKE_CUDA_COMPILER_VERSION@) set(GINKGO_CUDA_HOST_LINK_LAUNCHER "@CMAKE_CUDA_HOST_LINK_LAUNCHER@") -set(GINKGO_CUBLAS_LIBRARIES @CUBLAS@) -set(GINKGO_CUSPARSE_LIBRARIES @CUSPARSE@) -set(GINKGO_CUDA_LIBRARIES @CUDA_RUNTIME_LIBS@) -set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@") - set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@") set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG_MODIFY@") set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@") @@ -129,8 +129,8 @@ set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@") # OpenMP set(GINKGO_OPENMP_VERSION @OpenMP_CXX_VERSION@) -set(GINKGO_OPENMP_LIB_NAMES @OpenMP_CXX_LIB_NAMES@) -set(GINKGO_OPENMP_LIBRARIES @OpenMP_CXX_LIBRARIES@) +set(GINKGO_OPENMP_LIB_NAMES "@OpenMP_CXX_LIB_NAMES@") +set(GINKGO_OPENMP_LIBRARIES "@OpenMP_CXX_LIBRARIES@") set(GINKGO_OPENMP_FLAGS "@OpenMP_CXX_FLAGS@") @@ -139,21 +139,14 @@ set(GINKGO_HAVE_VTUNE "@GINKGO_HAVE_VTUNE@") set(GINKGO_HAVE_METIS "@GINKGO_HAVE_METIS@") set(VTune_PATH "@VTune_PATH@") +# ensure Threads settings +set(THREADS_PREFER_PTHREAD_FLAG ON) + # NOTE: we do not export benchmarks, 
examples, tests or devel tools # so `third_party` libraries are currently unneeded. -# propagate CUDA_HOST_COMPILER if needed -if (GINKGO_BUILD_CUDA OR (GINKGO_BUILD_HIP - AND GINKGO_HIP_PLATFORM MATCHES "${GINKGO_HIP_PLATFORM_NVIDIA_REGEX}")) - if (GINKGO_CUDA_HOST_COMPILER AND NOT CMAKE_CUDA_HOST_COMPILER - AND EXISTS "${GINKGO_CUDA_HOST_COMPILER}") - message(STATUS "Ginkgo: Setting CUDA host compiler to ${GINKGO_CUDA_HOST_COMPILER}") - set(CMAKE_CUDA_HOST_COMPILER "${GINKGO_CUDA_HOST_COMPILER}" CACHE STRING "" FORCE) - endif() -endif() - if(GINKGO_HAVE_PAPI_SDE) - find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde) + find_package(PAPI REQUIRED COMPONENTS sde) endif() if(GINKGO_HAVE_HWLOC) @@ -174,6 +167,7 @@ endif() # For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614 if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA) enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) find_package(NVTX REQUIRED) endif() @@ -190,7 +184,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) endif() endif() -if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP) +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_SYCL) find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}") find_package(oneDPL REQUIRED HINTS "${GINKGO_DPL_ROOT}") endif() @@ -207,4 +201,20 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_HAVE_TAU) find_package(PerfStubs REQUIRED) endif() +# Check that the same compilers as for Ginkgo are used +function(_ginkgo_check_compiler lang) + if(NOT ${CMAKE_${lang}_COMPILER} STREQUAL ${GINKGO_${lang}_COMPILER}) + set(_compiler_short "${CMAKE_${lang}_COMPILER_ID}:${CMAKE_${lang}_COMPILER_VERSION}") + if(NOT _compiler_short STREQUAL "${GINKGO_${lang}_COMPILER_SHORT}") + message(WARNING "The currently used ${lang} compiler: ${CMAKE_${lang}_COMPILER} does not match the compiler used to " + "build Ginkgo: ${GINKGO_${lang}_COMPILER}. 
It is encouraged to use the same compiler as Ginkgo to prevent ABI mismatch.") + endif() + endif() +endfunction() +_ginkgo_check_compiler(CXX) +if(GINKGO_BUILD_CUDA) + _ginkgo_check_compiler(CUDA) + _ginkgo_check_compiler(CUDA_HOST) +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/cmake/Modules/CudaArchitectureSelector.cmake b/cmake/Modules/CudaArchitectureSelector.cmake index 63e8c767446..f863b144ab7 100644 --- a/cmake/Modules/CudaArchitectureSelector.cmake +++ b/cmake/Modules/CudaArchitectureSelector.cmake @@ -65,6 +65,15 @@ # The command has the same result as ``cas_target_cuda_architectures``. It does # not add the compiler flags to the target, but stores the compiler flags in # the variable (string). +# +# cas_variable_cmake_cuda_architectures( +# [] # variable for storing architecture list +# [] # list of architecture specifications +# ) +# +# The command prepares an architecture list supported by the CMake +# ``CUDA_ARCHITECTURES`` target property and ``CMAKE_CUDA_ARCHITECTURES`` +# variable. The architecture specification # # # ``ARCHITECTURES`` specification list @@ -119,7 +128,7 @@ # identifiers in this list will be removed from the list specified by the # ``ARCHITECTURES`` list. A warning will be printed for each removed entry. # The list also supports aggregates ``All``, ``Auto`` and GPU generation names -# wich have the same meaning as in the ``ARCHITECTURES'' specification list. +# which have the same meaning as in the ``ARCHITECTURES'' specification list. 
if(NOT DEFINED CMAKE_CUDA_COMPILER) @@ -404,3 +413,34 @@ function(cas_variable_cuda_architectures variable) cas_get_compiler_flags(flags ${ARGN}) set(${variable} "${flags}" PARENT_SCOPE) endfunction() + + +function(cas_variable_cmake_cuda_architectures variable) + cas_get_supported_architectures(supported_archs) + if("${ARGN}" STREQUAL "All") + set(archs "${supported_archs}") + elseif("${ARGN}" STREQUAL "Auto") + cas_get_onboard_architectures(onboard_archs) + if (onboard_archs) + set(archs "${onboard_archs}") + else() + set(archs "${supported_archs}") + endif() + else() + set(archs) + foreach(arch IN LISTS ARGN) + if(arch MATCHES "${cas_spec_regex}") + if(CMAKE_MATCH_1) + list(APPEND archs ${CMAKE_MATCH_1}-real) + endif() + if(CMAKE_MATCH_3) + list(APPEND archs ${CMAKE_MATCH_3}-virtual) + endif() + else() + cas_get_architectures_by_name("${arch}" arch) + list(APPEND archs ${arch}) + endif() + endforeach() + endif() + set("${variable}" "${archs}" PARENT_SCOPE) +endfunction() diff --git a/cmake/Modules/FindNVTX.cmake b/cmake/Modules/FindNVTX.cmake index 7078c9dcb36..879c66f2d59 100644 --- a/cmake/Modules/FindNVTX.cmake +++ b/cmake/Modules/FindNVTX.cmake @@ -27,8 +27,8 @@ # ``NVTX_FOUND`` # If false, do not try to use the NVTX library. 
-find_path(NVTX3_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvtx3) -find_path(NVTX_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +find_path(NVTX3_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CUDAToolkit_INCLUDE_DIRS}/nvtx3) +find_path(NVTX_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CUDAToolkit_INCLUDE_DIRS}) mark_as_advanced(NVTX3_INCLUDE_DIR) mark_as_advanced(NVTX_INCLUDE_DIR) include(FindPackageHandleStandardArgs) diff --git a/cmake/Modules/FindPAPI.cmake b/cmake/Modules/FindPAPI.cmake index 95f26a24684..04962970e35 100644 --- a/cmake/Modules/FindPAPI.cmake +++ b/cmake/Modules/FindPAPI.cmake @@ -57,6 +57,7 @@ if(NOT PAPI_LIBRARY) select_library_configurations(PAPI) endif() +set(WORK_DIR "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/FindPAPI") if(PAPI_INCLUDE_DIR) if(EXISTS "${PAPI_INCLUDE_DIR}/papi.h") file(STRINGS "${PAPI_INCLUDE_DIR}/papi.h" papi_version_str REGEX "^#define[\t ]+PAPI_VERSION[\t ]+.*") @@ -70,7 +71,9 @@ if(PAPI_INCLUDE_DIR) # find the components enable_language(C) foreach(component IN LISTS PAPI_FIND_COMPONENTS) - file(WRITE "${PROJECT_BINARY_DIR}/papi_${component}_detect.c" + set(SRC_FILE "${WORK_DIR}/papi_${component}_detect.c") + set(BIN_FILE "${WORK_DIR}/papi_${component}_detect.bin") + file(WRITE "${SRC_FILE}" " #include int main() { @@ -78,17 +81,18 @@ if(PAPI_INCLUDE_DIR) retval = PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT && retval > 0) return -1; - if (PAPI_get_component_index(\"${component}\") < 0) + if (PAPI_get_component_index(\"${component}\") == PAPI_ENOCMP) return 0; return 1; }" ) try_run(PAPI_${component}_FOUND gko_result_unused - "${PROJECT_BINARY_DIR}" - "${PROJECT_BINARY_DIR}/papi_${component}_detect.c" + "${WORK_DIR}" + "${SRC_FILE}" CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${PAPI_INCLUDE_DIR} LINK_LIBRARIES ${PAPI_LIBRARY} + COPY_FILE ${BIN_FILE} ) if (NOT PAPI_${component}_FOUND EQUAL 1) @@ -105,6 +109,33 @@ 
find_package_handle_standard_args(PAPI VERSION_VAR PAPI_VERSION_STRING HANDLE_COMPONENTS) +if(PAPI_sde_FOUND) + # PAPI SDE is another library and header, let's try to find them + find_path(PAPI_SDE_INCLUDE_DIR NAMES sde_lib.h) + mark_as_advanced(PAPI_SDE_INCLUDE_DIR) + + if(NOT PAPI_SDE_LIBRARY) + find_library(PAPI_SDE_LIBRARY_RELEASE NAMES + sde + ) + mark_as_advanced(PAPI_SDE_LIBRARY_RELEASE) + + find_library(PAPI_SDE_LIBRARY_DEBUG NAMES + sded + sde-d + ) + mark_as_advanced(PAPI_SDE_LIBRARY_DEBUG) + + include(SelectLibraryConfigurations) + select_library_configurations(PAPI_SDE) + endif() + + # FIXME: with CMake>=3.17, use NAME_MISMATCHED to get rid of the warning + find_package_handle_standard_args(PAPI_SDE + REQUIRED_VARS PAPI_SDE_LIBRARY PAPI_SDE_INCLUDE_DIR + VERSION_VAR PAPI_VERSION_STRING) +endif() + if(PAPI_FOUND) set(PAPI_LIBRARIES ${PAPI_LIBRARY}) set(PAPI_INCLUDE_DIRS ${PAPI_INCLUDE_DIR}) @@ -142,3 +173,41 @@ if(PAPI_FOUND) endif() endif() endif() + +if (PAPI_SDE_FOUND AND NOT TARGET PAPI::PAPI_SDE) + set(PAPI_SDE_LIBRARIES ${PAPI_SDE_LIBRARY}) + set(PAPI_SDE_INCLUDE_DIRS ${PAPI_SDE_INCLUDE_DIR}) + unset(PAPI_SDE_LIBRARY) + unset(PAPI_SDE_INCLUDE_DIR) + + if(NOT TARGET PAPI::PAPI_SDE) + add_library(PAPI::PAPI_SDE UNKNOWN IMPORTED) + set_target_properties(PAPI::PAPI_SDE PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${PAPI_SDE_INCLUDE_DIRS}") + + if(EXISTS "${PAPI_SDE_LIBRARIES}") + set_target_properties(PAPI::PAPI_SDE PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + INTERFACE_LINK_LIBRARIES "${PAPI_SDE_LIBRARIES}" + IMPORTED_LOCATION "${PAPI_SDE_LIBRARIES}") + endif() + if(PAPI_SDE_LIBRARY_RELEASE) + set_property(TARGET PAPI::PAPI_SDE APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(PAPI::PAPI_SDE PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + INTERFACE_LINK_LIBRARIES_RELEASE "${PAPI_SDE_LIBRARY_RELEASE}" + IMPORTED_LOCATION_RELEASE "${PAPI_SDE_LIBRARY_RELEASE}") + unset(PAPI_SDE_LIBRARY_RELEASE) + endif() + 
if(PAPI_SDE_LIBRARY_DEBUG) + set_property(TARGET PAPI::PAPI_SDE APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(PAPI::PAPI_SDE PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + INTERFACE_LINK_LIBRARIES_DEBUG "${PAPI_SDE_LIBRARY_DEBUG}" + IMPORTED_LOCATION_DEBUG "${PAPI_SDE_LIBRARY_DEBUG}") + unset(PAPI_SDE_LIBRARY_DEBUG) + endif() + endif() +endif() diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index 315e0eb3e38..757262f1ea1 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -1,7 +1,7 @@ set(GINKGO_HAS_OMP OFF) set(GINKGO_HAS_MPI OFF) set(GINKGO_HAS_CUDA OFF) -set(GINKGO_HAS_DPCPP OFF) +set(GINKGO_HAS_SYCL OFF) set(GINKGO_HAS_HIP OFF) include(CheckLanguage) @@ -37,12 +37,16 @@ if (NOT DEFINED GINKGO_BUILD_HIP) endif() endif() -if (NOT DEFINED GINKGO_BUILD_DPCPP) +if (NOT DEFINED GINKGO_BUILD_DPCPP AND NOT DEFINED GINKGO_BUILD_SYCL) try_compile(GKO_CAN_COMPILE_DPCPP ${PROJECT_BINARY_DIR}/dpcpp SOURCES ${PROJECT_SOURCE_DIR}/dpcpp/test_dpcpp.dp.cpp + # try_compile will pass the project CMAKE_CXX_FLAGS so passing -DCMAKE_CXX_FLAGS does not affect it. + # They append COMPILE_DEFINITIONS into CMAKE_CXX_FLAGS. + # Note. 
it is different from try_compile COMPILE_DEFINITIONS affect + CMAKE_FLAGS -DCOMPILE_DEFINITIONS=-fsycl CXX_STANDARD 17) if (GKO_CAN_COMPILE_DPCPP) message(STATUS "Enabling DPCPP executor") - set(GINKGO_HAS_DPCPP ON) + set(GINKGO_HAS_SYCL ON) endif() endif() diff --git a/cmake/autodetect_system_libs.cmake b/cmake/autodetect_system_libs.cmake new file mode 100644 index 00000000000..6f59a759aa8 --- /dev/null +++ b/cmake/autodetect_system_libs.cmake @@ -0,0 +1,7 @@ +if (NOT DEFINED GINKGO_BUILD_HWLOC) + find_package(HWLOC 2.1) +endif() + +if (NOT DEFINED GINKGO_BUILD_PAPI_SDE) + find_package(PAPI 7.0.1.0 COMPONENTS sde) +endif() diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index a7b8c48acf3..34189a09450 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -139,7 +139,8 @@ function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION MACRO_ "int main() {std::cout << ${MACRO_VAR} << '\\n'\;" "return 0\;}") file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp" ${DPCPP_VERSION_PROG}) - execute_process(COMMAND ${DPCPP_COMPILER} ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp + # we always add -fsycl + execute_process(COMMAND ${DPCPP_COMPILER} -fsycl ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver ERROR_VARIABLE DPCPP_EXTRACT_VER_ERROR) execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 3fbafe35858..522ad5f2ba7 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,26 +1,19 @@ -set(gko_test_single_args "MPI_SIZE") +set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_TYPE") +set(gko_test_single_args "MPI_SIZE;${gko_test_resource_args}") set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") +set(gko_test_option_args "NO_RESOURCES") ## Replaces / by _ to create valid target names from relative paths 
function(ginkgo_build_test_name test_name target_name) file(RELATIVE_PATH REL_BINARY_DIR - ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) -endfunction(ginkgo_build_test_name) - -function(ginkgo_create_gtest_mpi_main) - add_library(gtest_mpi_main "") - target_sources(gtest_mpi_main - PRIVATE - ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp) - find_package(MPI 3.1 COMPONENTS CXX REQUIRED) - target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) -endfunction(ginkgo_create_gtest_mpi_main) +endfunction() ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES ## `MPI_SIZE size` causes the tests to be run with `size` MPI processes. -function(ginkgo_set_test_target_properties test_target_name) +function(ginkgo_set_test_target_properties test_target_name test_library_suffix) cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}") if (GINKGO_FAST_TESTS) target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS) @@ -31,26 +24,56 @@ function(ginkgo_set_test_target_properties test_target_name) if (GINKGO_COMPILING_DPCPP_TEST AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() - if (GINKGO_CHECK_CIRCULAR_DEPS) + if(GINKGO_CHECK_CIRCULAR_DEPS) target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") endif() - if (set_properties_MPI_SIZE) - if(NOT TARGET gtest_mpi_main) - ginkgo_create_gtest_mpi_main() - endif() - set(gtest_main gtest_mpi_main MPI::MPI_CXX) + if(set_properties_MPI_SIZE) + target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main_mpi${test_library_suffix}) else() - set(gtest_main GTest::Main) + target_link_libraries(${test_target_name} PRIVATE 
ginkgo_gtest_main${test_library_suffix}) endif() target_compile_features(${test_target_name} PUBLIC cxx_std_14) target_compile_options(${test_target_name} PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES}) - target_link_libraries(${test_target_name} PRIVATE ginkgo ${gtest_main} GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) + target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) +endfunction() + +function(ginkgo_add_resource_requirement test_name) + cmake_parse_arguments(PARSE_ARGV 1 add_rr "${gko_test_option_args}" "${gko_test_single_args}" "") + if(add_rr_NO_RESOURCES OR (NOT add_rr_RESOURCE_TYPE)) + return() + endif () + + if(add_rr_RESOURCE_TYPE STREQUAL "cpu") + if(NOT add_rr_RESOURCE_LOCAL_CORES) + set(add_rr_RESOURCE_LOCAL_CORES ${GINKGO_CI_TEST_OMP_PARALLELISM}) + endif() + if(NOT add_rr_RESOURCE_LOCAL_CORES MATCHES "^[0-9]+") + message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORES=${add_rr_RESOURCE_LOCAL_CORES}") + endif() + + set(single_resource "cpu:${add_rr_RESOURCE_LOCAL_CORES}") + elseif(add_rr_RESOURCE_TYPE MATCHES "^(cudagpu|hipgpu|sycl)$") + set(single_resource "${add_rr_RESOURCE_TYPE}:1") + else() + message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: cpu, cudagpu, hipgpu, sycl.") + endif() + + if(NOT add_rr_MPI_SIZE) + set(add_rr_MPI_SIZE 1) + endif() + set_property(TEST ${test_name} + PROPERTY + RESOURCE_GROUPS "${add_rr_MPI_SIZE},${single_resource}") endfunction() + ## Adds a test to the list executed by ctest and sets its output binary name ## Possible additional arguments: ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes. 
+## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is +## $GINKGO_CI_TEST_OMP_PARALLELISM +## - `RESOURCE_TYPE` the resource type, can be cpu, cudagpu, hipgpu, sycl ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple) ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths @@ -71,6 +94,9 @@ function(ginkgo_add_test test_name test_target_name) COMMAND ${test_target_name} WORKING_DIRECTORY "$") endif() + + ginkgo_add_resource_requirement(${REL_BINARY_DIR}/${test_name} ${ARGN}) + set(test_preload) if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_CUDA) set(test_preload $:${test_preload}) @@ -87,8 +113,8 @@ endfunction() function(ginkgo_create_test test_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.cpp) - target_link_libraries(${test_target_name} PRIVATE ${create_test_ADDITIONAL_LIBRARIES}) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) + target_link_libraries(${test_target_name}) + ginkgo_set_test_target_properties(${test_target_name} "_cpu" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) endfunction(ginkgo_create_test) @@ -98,9 +124,10 @@ function(ginkgo_create_dpcpp_test test_name) add_executable(${test_target_name} ${test_name}.dp.cpp) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS}) + gko_add_sycl_to_target(TARGET ${test_target_name} SOURCES ${test_name}.dp.cpp) target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_dpcpp" ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE 
sycl) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. if (MKL_ENV) set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") @@ -119,7 +146,6 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA) target_compile_options(${test_target_name} PRIVATE - $<$:${GINKGO_CUDA_ARCH_FLAGS}> $<$:${GINKGO_CUDA_COMPILER_FLAGS}>) if(MSVC) target_compile_options(${test_target_name} @@ -134,8 +160,8 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF) endif() - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_cuda" ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu) endfunction(ginkgo_create_cuda_test_internal) ## Test compiled with HIP @@ -190,10 +216,26 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name add ${hiprand_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS} ) - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name} ${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu) endfunction(ginkgo_create_hip_test_internal) + +## Test compiled with OpenMP +function(ginkgo_create_omp_test test_name) + ginkgo_build_test_name(${test_name} test_target_name) + ginkgo_create_omp_test_internal(${test_name} ${test_name}.cpp ${test_target_name} "" ${ARGN}) +endfunction() + +function(ginkgo_create_omp_test_internal test_name filename test_target_name) + ginkgo_build_test_name(${test_name} test_target_name) + add_executable(${test_target_name} 
${test_name}.cpp) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP) + target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) + ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN}) + ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu) +endfunction() + ## Common test compiled with the host compiler, one target for each enabled backend function(ginkgo_create_common_test test_name) if(GINKGO_BUILD_OMP) @@ -205,7 +247,7 @@ function(ginkgo_create_common_test test_name) if(GINKGO_BUILD_CUDA) ginkgo_create_common_test_internal(${test_name} CudaExecutor cuda ${ARGN}) endif() - if(GINKGO_BUILD_DPCPP) + if(GINKGO_BUILD_SYCL) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) endif() endfunction(ginkgo_create_common_test) @@ -215,11 +257,29 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) if(exec IN_LIST common_test_DISABLE_EXECUTORS) return() endif() + if (exec STREQUAL reference) + set(test_resource_type "") + elseif (exec STREQUAL omp) + set(test_resource_type cpu) + elseif (exec STREQUAL cuda) + set(test_resource_type cudagpu) + elseif (exec STREQUAL hip) + set(test_resource_type hipgpu) + else () + set(test_resource_type sycl) + endif () ginkgo_build_test_name(${test_name} test_target_name) string(TOUPPER ${exec} exec_upper) + # set up actual test set(test_target_name ${test_target_name}_${exec}) add_executable(${test_target_name} ${test_name}.cpp) + + # also need to add runtime libraries for other backends + if (exec STREQUAL omp) + target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) + endif () + target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES}) # use float for DPC++ if necessary @@ -227,18 +287,21 @@ function(ginkgo_create_common_test_internal test_name 
exec_type exec) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() - ginkgo_set_test_target_properties(${test_target_name} ${ARGN}) - ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN}) + ginkgo_set_test_target_properties(${test_target_name} "_${exec}" ${ARGN}) + ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN} RESOURCE_TYPE ${test_resource_type}) endfunction(ginkgo_create_common_test_internal) ## Common test compiled with the device compiler, one target for each enabled backend function(ginkgo_create_common_device_test test_name) cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name) - if(GINKGO_BUILD_DPCPP) + if(GINKGO_BUILD_SYCL) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17) target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS}) + # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property. 
+ configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY) + gko_add_sycl_to_target(TARGET ${test_target_name}_dpcpp SOURCES ${test_name}.dp.cpp) target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel) endif() if(GINKGO_BUILD_OMP) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c5ba334e983..2e1c82db6b0 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -1,82 +1,16 @@ enable_language(CUDA) -if(MSVC) - # MSVC can not find CUDA automatically - # Use CUDA_COMPILER PATH to define the CUDA TOOLKIT ROOT DIR - string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER}) - if("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" STREQUAL "") - set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/include") - endif() - if("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" STREQUAL "") - set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/lib/x64") - endif() -endif() +find_package(CUDAToolkit REQUIRED) include(cmake/Modules/CudaArchitectureSelector.cmake) -set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - -# Detect the CUDA architecture flags and propagate to all the project -cas_variable_cuda_architectures(GINKGO_CUDA_ARCH_FLAGS - ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} - UNSUPPORTED "20" "21") - -if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") - find_package(NVHPC REQUIRED - HINTS - $ENV{NVIDIA_PATH} - ${CMAKE_CUDA_COMPILER}/../../.. 
- ) - - set(CUDA_RUNTIME_LIBS_DYNAMIC ${NVHPC_CUDART_LIBRARY}) - set(CUDA_RUNTIME_LIBS_STATIC ${NVHPC_CUDART_LIBRARY_STATIC}) - set(CUBLAS ${NVHPC_CUBLAS_LIBRARY}) - set(CUSPARSE ${NVHPC_CUSPARSE_LIBRARY}) - set(CURAND ${NVHPC_CURAND_LIBRARY}) - set(CUFFT ${NVHPC_CUFFT_LIBRARY}) -else() - find_library(CUDA_RUNTIME_LIBS_DYNAMIC cudart - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - find_library(CUDA_RUNTIME_LIBS_STATIC cudart_static - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - - # CUDA 10.1/10.2 put cublas, cublasLt, cudnn in /usr/lib/-linux-gnu/, but - # others (<= 10.0 or >= 11) put them in cuda own directory - # If the environment installs several cuda including 10.1/10.2, cmake will find - # the 10.1/10.2 .so files when searching others cuda in the default path. - # CMake already puts /usr/lib/-linux-gnu/ after cuda own directory in the - # `CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES`, so we always put NO_DEFAULT_PATH here. - find_library(CUBLAS cublas - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} NO_DEFAULT_PATH) - find_library(CUSPARSE cusparse - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - find_library(CURAND curand - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - find_library(CUFFT cufft - HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +if(NOT CMAKE_CUDA_ARCHITECTURES) + # Detect the CUDA architecture and propagate it to the entire project + cas_variable_cmake_cuda_architectures(CMAKE_CUDA_ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}) endif() find_package(NVTX REQUIRED) -# MSVC nvcc uses static cudartlibrary by default, and other platforms use shared cudartlibrary. -# add `-cudart shared` or `-cudart=shared` according system into CMAKE_CUDA_FLAGS -# to force nvcc to use dynamic cudart library in MSVC. 
-if(MSVC) - if("${CMAKE_CUDA_FLAGS}" MATCHES "-cudart(=| )shared") - set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE) - else() - set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_STATIC}" CACHE STRING "Path to a library" FORCE) - endif() -else() - set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE) -endif() - -if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) -elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) - unset(CMAKE_CUDA_HOST_COMPILER CACHE) -endif() - if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER) message(WARNING "The CMake CXX compiler and CUDA host compiler do not match. " "If you encounter any build error, especially while linking, try to use " @@ -84,13 +18,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_ "The CXX compiler is ${CMAKE_CXX_COMPILER} with version ${CMAKE_CXX_COMPILER_VERSION}.\n" "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.") endif() - -if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION - MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) - ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) - - if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*") - message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue." 
- "Consider using a different CUDA host compiler or CUDA version.") - endif() -endif() diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 2cf8dd06c3f..6b904189151 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -127,7 +127,7 @@ foreach(log_type ${log_types}) ginkgo_print_module_footer(${${log_type}} "User configuration:") ginkgo_print_module_footer(${${log_type}} " Enabled modules:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") + "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL") ginkgo_print_module_footer(${${log_type}} " Enabled features:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI") @@ -167,7 +167,7 @@ IF(GINKGO_BUILD_HIP) include(hip/get_info.cmake) ENDIF() -IF(GINKGO_BUILD_DPCPP) +IF(GINKGO_BUILD_SYCL) include(dpcpp/get_info.cmake) ENDIF() @@ -190,16 +190,21 @@ ginkgo_print_module_footer(${detailed_log} "") ginkgo_print_generic_header(${minimal_log} " Components:") ginkgo_print_generic_header(${detailed_log} " Components:") -if(PAPI_sde_FOUND) +ginkgo_print_variable(${minimal_log} "GINKGO_BUILD_PAPI_SDE") +ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_PAPI_SDE") +if(TARGET PAPI::PAPI) ginkgo_print_variable(${detailed_log} "PAPI_VERSION") ginkgo_print_variable(${detailed_log} "PAPI_INCLUDE_DIR") ginkgo_print_flags(${detailed_log} "PAPI_LIBRARY") endif() + ginkgo_print_variable(${minimal_log} "GINKGO_BUILD_HWLOC") ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_HWLOC") -ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") -ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") -ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +if(TARGET hwloc) + ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") + ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") + 
ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +endif() _minimal( " diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 1b9aa0e8723..72a7a3a86d8 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21) set(CMAKE_HIP_ARCHITECTURES OFF) endif() -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" - AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2) - message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.") -endif() - if(NOT DEFINED ROCM_PATH) if(DEFINED ENV{ROCM_PATH}) set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") @@ -185,11 +180,6 @@ endif() set(GINKGO_HIP_NVCC_ARCH "") if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) - set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE) - elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER) - unset(CMAKE_CUDA_HOST_COMPILER CACHE) - endif() if (CMAKE_CUDA_HOST_COMPILER) list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS "-ccbin=${CMAKE_CUDA_HOST_COMPILER}") endif() @@ -197,16 +187,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") # Remove false positive CUDA warnings when calling one() and zero() list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda) - if (GINKGO_HIP_PLATFROM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" - AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2" - AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) - ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) - - if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*") - message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue." 
- "Consider using a different CUDA host compiler or CUDA version.") - endif() - endif() # select GPU architecture include(cmake/Modules/CudaArchitectureSelector.cmake) cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH @@ -227,7 +207,7 @@ set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILIN set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5) - list(APPEND GINKGO_HIP_CLANG_OPTIONS -munsafe-fp-atomics) + list(APPEND GINKGO_HIP_CLANG_OPTIONS "-munsafe-fp-atomics -Wno-unused-command-line-argument") endif() # HIP's cmake support secretly carries around global state to remember # whether we created any shared libraries, and sets PIC flags accordingly. diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake index 8bed7320caa..7ac7fdfeda5 100644 --- a/cmake/information_helpers.cmake +++ b/cmake/information_helpers.cmake @@ -78,11 +78,7 @@ macro(ginkgo_interface_libraries_recursively INTERFACE_LIBS) list(TRANSFORM GINKGO_LIBS_INTERFACE_LIBS REPLACE "\\$" "\\1") ginkgo_interface_libraries_recursively("${GINKGO_LIBS_INTERFACE_LIBS}") elseif(EXISTS "${_libs}") - if ("${_libs}" MATCHES "${PROJECT_BINARY_DIR}.*hwloc.so") - list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${CMAKE_INSTALL_FULL_LIBDIR}/libhwloc.so") - else() - list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}") - endif() + list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}") elseif("${_libs}" STREQUAL "${CMAKE_DL_LIBS}") list(APPEND GINKGO_INTERFACE_LIBS_FOUND "-l${_libs}") endif() @@ -103,7 +99,7 @@ macro(ginkgo_interface_information) get_target_property(GINKGO_INTERFACE_LINK_LIBRARIES ginkgo INTERFACE_LINK_LIBRARIES) ginkgo_interface_libraries_recursively("${GINKGO_INTERFACE_LINK_LIBRARIES}") # Format and store the interface libraries found 
- # remove duplicates on the reversed list to keep the dependecy in the end of list. + # remove duplicates on the reversed list to keep the dependency in the end of list. list(REVERSE GINKGO_INTERFACE_LIBS_FOUND) list(REMOVE_DUPLICATES GINKGO_INTERFACE_LIBS_FOUND) list(REVERSE GINKGO_INTERFACE_LIBS_FOUND) diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake index 58cc730bb14..601fc89a3db 100644 --- a/cmake/install_helpers.cmake +++ b/cmake/install_helpers.cmake @@ -30,10 +30,6 @@ function(ginkgo_add_install_rpath name) endif() if (GINKGO_INSTALL_RPATH_DEPENDENCIES) set(RPATH_DEPENDENCIES "${ARGN}") - if(GINKGO_HAVE_HWLOC AND HWLOC_FOUND) - get_filename_component(HWLOC_LIB_PATH ${HWLOC_LIBRARIES} DIRECTORY) - list(APPEND RPATH_DEPENDENCIES "${HWLOC_LIBRARIES}") - endif() endif() if (GINKGO_INSTALL_RPATH) set_property(TARGET "${name}" PROPERTY INSTALL_RPATH @@ -80,11 +76,6 @@ function(ginkgo_install) install(FILES "${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp" DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}/ginkgo" ) - if (GINKGO_HAVE_PAPI_SDE) - install(FILES "${Ginkgo_SOURCE_DIR}/third_party/papi_sde/papi_sde_interface.h" - DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}/third_party/papi_sde" - ) - endif() if (GINKGO_HAVE_HWLOC AND NOT HWLOC_FOUND) get_filename_component(HWLOC_LIB_PATH ${HWLOC_LIBRARIES} DIRECTORY) diff --git a/cmake/openmpi_test.cpp b/cmake/openmpi_test.cpp index 3b6f33dd5d0..94b2774503b 100644 --- a/cmake/openmpi_test.cpp +++ b/cmake/openmpi_test.cpp @@ -38,11 +38,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int main() { -#if defined(OPEN_MPI) && OPEN_MPI - std::printf("%d.%d.%d", OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION); - return 1; +#if CHECK_HAS_OPEN_MPI && defined(OPEN_MPI) && OPEN_MPI + static_assert(true, "Check availability of OpenMPI"); +#elif CHECK_OPEN_MPI_VERSION && defined(OPEN_MPI) && OPEN_MPI + static_assert(OMPI_MAJOR_VERSION > 4 || + (OMPI_MAJOR_VERSION == 4 && OMPI_MINOR_VERSION >= 1), + "Check OpenMPI version."); #else - return 0; + static_assert(false, "No OpenMPI available"); #endif } diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake deleted file mode 100644 index e1d196ad553..00000000000 --- a/cmake/package_helpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -set(NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT - "${CMAKE_CURRENT_LIST_DIR}/DownloadNonCMakeCMakeLists.txt.in") - - -# Load a package from the url provided and run configure (Non-CMake projects) -# -# \param package_name Name of the package -# \param package_url Url of the package -# \param package_tag Tag or version of the package to be downloaded. -# \param config_command The command for the configuration step. -# -function(ginkgo_load_and_configure_package package_name package_url package_hash config_command) - set(GINKGO_THIRD_PARTY_BUILD_TYPE "Debug") - if (CMAKE_BUILD_TYPE MATCHES "[Rr][Ee][Ll][Ee][Aa][Ss][Ee]") - set(GINKGO_THIRD_PARTY_BUILD_TYPE "Release") - endif() - configure_file(${NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT} - download/CMakeLists.txt) - set(TOOLSET "") - if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "") - set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}") - endif() - execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" "${TOOLSET}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/download) - if(result) - message(FATAL_ERROR - "CMake step for ${package_name}/download failed: ${result}") - return() - endif() - execute_process(COMMAND ${CMAKE_COMMAND} --build . 
- RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/download) - if(result) - message(FATAL_ERROR - "Build step for ${package_name}/download failed: ${result}") - return() - endif() -endfunction() - - -# Download a file and verify the download -# -# \param url The url of file to be downloaded -# \param filename The name of the file -# \param hash_type The type of hash, See CMake file() documentation for more details. -# \param hash The hash itself, See CMake file() documentation for more details. -# -function(ginkgo_download_file url filename hash_type hash) - file(DOWNLOAD ${url} ${filename} - TIMEOUT 60 # seconds - EXPECTED_HASH "${hash_type}=${hash}" - TLS_VERIFY ON) - if(EXISTS ${filename}) - message(STATUS "${filename} downloaded from ${url}") - else() - message(FATAL_ERROR "Download of ${filename} failed.") - endif() -endfunction(ginkgo_download_file) diff --git a/cmake/rename.cmake b/cmake/rename.cmake new file mode 100644 index 00000000000..6c386bc24c6 --- /dev/null +++ b/cmake/rename.cmake @@ -0,0 +1,20 @@ +# Only for CACHE variable (option) +macro(gko_rename_cache deprecated actual type doc_string) + if(DEFINED ${deprecated}) + if(DEFINED ${actual}) + message("actual ${actual} and deprecated ${deprecated}") + if("${${actual}}" STREQUAL "${${deprecated}}") + # They are the same, so only throw warning + message(WARNING "${deprecated} was deprecated, please only use ${actual} instead.") + else() + # They are different + message(FATAL_ERROR "Both ${deprecated} and ${actual} were specified differently, please only use ${actual} instead.") + endif() + else() + # Only set `deprecated`, move it to `actual`. + message(WARNING "${deprecated} was deprecated, please use ${actual} instead. 
" + "We copy ${${deprecated}} to ${actual}") + set(${actual} ${${deprecated}} CACHE ${type} "${doc_string}") + endif() + endif() +endmacro() \ No newline at end of file diff --git a/cmake/sycl.cmake b/cmake/sycl.cmake new file mode 100644 index 00000000000..b0f4eab91f1 --- /dev/null +++ b/cmake/sycl.cmake @@ -0,0 +1,33 @@ +# IntelSYCL for dpcpp and icpx if the config is existed and cmake reaches the requirement +if(CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx") + if(CMAKE_HOST_WIN32 AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.25) + find_package(IntelSYCL QUIET) + elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20.5) + find_package(IntelSYCL QUIET) + endif() +endif() +# If we do not have the config from compiler, try to set components to make it work. +if(NOT COMMAND add_sycl_to_target) + if(NOT DEFINED SYCL_FLAGS) + set(SYCL_FLAGS "-fsycl" CACHE STRING "SYCL flags for compiler") + endif() +endif() + +# Provide a uniform way for those package without add_sycl_to_target +function(gko_add_sycl_to_target) + if(COMMAND add_sycl_to_target) + add_sycl_to_target(${ARGN}) + return() + endif() + # We handle them by adding SYCL_FLAGS to compile and link to the target + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SYCL + "" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + target_compile_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}") + target_link_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}") +endfunction() + diff --git a/cmake/template_instantiation.cmake b/cmake/template_instantiation.cmake new file mode 100644 index 00000000000..f77527e0092 --- /dev/null +++ b/cmake/template_instantiation.cmake @@ -0,0 +1,81 @@ +function(add_instantiation_files source_dir source_file output_files_var) + # if instantiation is disabled, compile the file directly + if(NOT GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) + set(${output_files_var} "${source_dir}/${source_file}" PARENT_SCOPE) + return() + endif() + # read full file into variable + set(source_path 
"${source_dir}/${source_file}") + file(READ "${source_path}" file_contents) + # escape semicolons and use them for line separation + string(REPLACE ";" "" file_contents "${file_contents}") + string(REGEX REPLACE "[\r\n]" ";" file_contents "${file_contents}") + # find location of // begin|split|end comments + set(begin_location) + set(end_location) + set(split_locations) + list(LENGTH file_contents total_length) + set(counter 0) + foreach(line IN LISTS file_contents) + if(line MATCHES "// begin") + if(begin_location) + message(FATAL_ERROR "Duplicate begin in line ${counter}, first found in ${begin_location}") + endif() + set(begin_location ${counter}) + elseif(line MATCHES "// split") + if((NOT begin_location) OR end_location) + message(FATAL_ERROR "Found split outside begin/end in line ${counter}") + endif() + list(APPEND split_locations ${counter}) + elseif(line MATCHES "// end") + if(end_location) + message(FATAL_ERROR "Duplicate end in line ${counter}, first found in ${end_location}") + endif() + set(end_location ${counter}) + endif() + math(EXPR counter "${counter} + 1") + endforeach() + if (NOT (begin_location AND end_location AND split_locations)) + message(FATAL_ERROR "Nothing to split") + endif() + if (begin_location GREATER_EQUAL end_location) + message(FATAL_ERROR "Incorrect begin/end order") + endif() + # determine which lines belong to the header and footer + set(range_begins ${begin_location} ${split_locations}) + set(range_ends ${split_locations} ${end_location}) + list(LENGTH split_locations range_count_minus_one) + math(EXPR length_header "${begin_location}") + math(EXPR end_location_past "${end_location} + 1") + math(EXPR length_footer "${total_length} - ${end_location_past}") + list(SUBLIST file_contents 0 ${length_header} header) + list(SUBLIST file_contents ${end_location_past} ${length_footer} footer) + set(output_files) + # for each range between // begin|split|end pairs + foreach(range RANGE 0 ${range_count_minus_one}) + # create an output 
filename + string(REGEX REPLACE "(\.hip\.cpp|\.dp\.cpp|\.cpp|\.cu)$" ".${range}\\1" target_file "${source_file}") + set(target_path "${CMAKE_CURRENT_BINARY_DIR}/${target_file}") + list(APPEND output_files "${target_path}") + # extract the range between the comments + list(GET range_begins ${range} begin) + list(GET range_ends ${range} end) + math(EXPR begin "${begin} + 1") + math(EXPR length "${end} - ${begin}") + list(SUBLIST file_contents ${begin} ${length} content) + # concatenate header, content and footer and turn semicolons into newlines + string(REPLACE ";" "\n" content "${header};${content};${footer}") + # and escaped semicolons into regular semicolons again + string(REPLACE "" ";" content "${content}") + # create a .tmp file, but only copy it over if source file changed + # this way, we don't rebuild unnecessarily + file(WRITE "${target_path}.tmp" "${content}") + add_custom_command( + OUTPUT "${target_path}" + COMMAND ${CMAKE_COMMAND} -E copy "${target_path}.tmp" "${target_path}" + MAIN_DEPENDENCY "${source_path}") + endforeach() + # make sure cmake gets called when the source file was updated + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${source_path}") + set(${output_files_var} ${output_files} PARENT_SCOPE) +endfunction() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3a7cb1ceb15..77bdd7230b9 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,31 +1,2 @@ -set(UNIFIED_SOURCES - base/device_matrix_data_kernels.cpp - base/index_set_kernels.cpp - components/absolute_array_kernels.cpp - components/fill_array_kernels.cpp - components/format_conversion_kernels.cpp - components/precision_conversion_kernels.cpp - components/reduce_array_kernels.cpp - distributed/partition_kernels.cpp - matrix/coo_kernels.cpp - matrix/csr_kernels.cpp - matrix/dense_kernels.cpp - matrix/ell_kernels.cpp - matrix/hybrid_kernels.cpp - matrix/sellp_kernels.cpp - matrix/sparsity_csr_kernels.cpp - matrix/diagonal_kernels.cpp - 
multigrid/pgm_kernels.cpp - preconditioner/jacobi_kernels.cpp - solver/bicg_kernels.cpp - solver/bicgstab_kernels.cpp - solver/cg_kernels.cpp - solver/cgs_kernels.cpp - solver/common_gmres_kernels.cpp - solver/fcg_kernels.cpp - solver/gcr_kernels.cpp - solver/gmres_kernels.cpp - solver/ir_kernels.cpp - ) -list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/unified/) -set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE) +add_subdirectory(unified) +set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE) diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc new file mode 100644 index 00000000000..6c0c5363baa --- /dev/null +++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc @@ -0,0 +1,150 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +void scale(std::shared_ptr exec, + const batch::MultiVector* const alpha, + batch::MultiVector* const x) +{ + const auto num_blocks = x->get_num_batch_items(); + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + if (alpha->get_common_size()[1] == 1) { + scale_kernel<<get_stream()>>>( + alpha_ub, x_ub, [] __device__(int col) { return 0; }); + } else { + scale_kernel<<get_stream()>>>( + alpha_ub, x_ub, [] __device__(int col) { return col; }); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const batch::MultiVector* const alpha, + const batch::MultiVector* const x, + batch::MultiVector* const y) +{ + const auto num_blocks = x->get_num_batch_items(); + const size_type nrhs = x->get_common_size()[1]; + const auto alpha_ub = get_batch_struct(alpha); + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + if (alpha->get_common_size()[1] == 1) { + add_scaled_kernel<<get_stream()>>>( + alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; }); + } else { + add_scaled_kernel<<get_stream()>>>( + alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; }); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL); + + +template +void compute_dot(std::shared_ptr exec, + const 
batch::MultiVector* x, + const batch::MultiVector* y, + batch::MultiVector* result) +{ + const auto num_blocks = x->get_num_batch_items(); + const auto num_rhs = x->get_common_size()[1]; + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + compute_gen_dot_product_kernel<<get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return val; }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL); + + +template +void compute_conj_dot(std::shared_ptr exec, + const batch::MultiVector* x, + const batch::MultiVector* y, + batch::MultiVector* result) +{ + const auto num_blocks = x->get_num_batch_items(); + const auto num_rhs = x->get_common_size()[1]; + const auto x_ub = get_batch_struct(x); + const auto y_ub = get_batch_struct(y); + const auto res_ub = get_batch_struct(result); + compute_gen_dot_product_kernel<<get_stream()>>>( + x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const batch::MultiVector* const x, + batch::MultiVector>* const result) +{ + const auto num_blocks = x->get_num_batch_items(); + const auto num_rhs = x->get_common_size()[1]; + const auto x_ub = get_batch_struct(x); + const auto res_ub = get_batch_struct(result); + compute_norm2_kernel<<get_stream()>>>(x_ub, res_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL); + + +template +void copy(std::shared_ptr exec, + const batch::MultiVector* x, + batch::MultiVector* result) +{ + const auto num_blocks = x->get_num_batch_items(); + const auto result_ub = get_batch_struct(result); + const auto x_ub = get_batch_struct(x); + copy_kernel<<get_stream()>>>( + x_ub, result_ub); +} + 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL); diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc new file mode 100644 index 00000000000..cb157d80fd5 --- /dev/null +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -0,0 +1,327 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +__device__ __forceinline__ void scale( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, Mapping map) +{ + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + x.values[row * x.stride + col] = + alpha.values[map(col)] * x.values[row * x.stride + col]; + } +} + +template +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::multi_vector::uniform_batch x, Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + scale(alpha_b, x_b, map); + } +} + + +template +__device__ __forceinline__ void add_scaled( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, Mapping map) +{ + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + y.values[row * y.stride + col] += + alpha.values[map(col)] * x.values[row * x.stride + col]; + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch::multi_vector:: + uniform_batch< + const ValueType> + x, + const gko::batch::multi_vector:: + uniform_batch + y, + Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto alpha_b = 
gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + add_scaled(alpha_b, x_b, y_b, map); + } +} + + +template +__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup, + const int num_rows, + const ValueType* x, + const ValueType* y, + ValueType& result) + +{ + ValueType val = zero(); + for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { + val += conj(x[r]) * y[r]; + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + result = val; + } +} + + +template +__device__ __forceinline__ void gen_one_dot( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const int rhs_index, + const gko::batch::multi_vector::batch_item& result, + Group subgroup, Mapping conj_map) +{ + ValueType val = zero(); + + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += conj_map(x.values[r * x.stride + rhs_index]) * + y.values[r * y.stride + rhs_index]; + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + result.values[rhs_index] = val; + } +} + + +template +__device__ __forceinline__ void compute_gen_dot_product( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result, + Mapping conj_map) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + gen_one_dot(x, y, rhs_index, result, 
subgroup, conj_map); + } +} + + +template +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, + const gko::batch::multi_vector::uniform_batch result, + Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); + compute_gen_dot_product(x_b, y_b, r_b, map); + } +} + + +template +__device__ __forceinline__ void single_rhs_compute_norm2( + Group subgroup, const int num_rows, const ValueType* x, + remove_complex& result) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { + val += squared_norm(x[r]); + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); + + if (subgroup.thread_rank() == 0) { + result = sqrt(val); + } +} + + +template +__device__ __forceinline__ void one_norm2( + const gko::batch::multi_vector::batch_item& x, + const int rhs_index, + const gko::batch::multi_vector::batch_item>& + result, + Group subgroup) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += squared_norm(x.values[r * x.stride + rhs_index]); + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); + + if (subgroup.thread_rank() == 0) { + result.values[rhs_index] = sqrt(val); + } +} + + +/** + * Computes the 2-norms of some column vectors in global or shared memory. + * + * @param x A row-major multivector with nrhs columns. + * @param result Holds norm value for each vector in x. 
+ */ +template +__device__ __forceinline__ void compute_norm2( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item>& + result) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + one_norm2(x, rhs_index, result, subgroup); + } +} + + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void compute_norm2_kernel(const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + x, + const gko::batch:: + multi_vector:: + uniform_batch< + remove_complex< + ValueType>> + result) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); + compute_norm2(x_b, r_b); + } +} + + +template +__device__ __forceinline__ void single_rhs_copy(const int num_rows, + const ValueType* in, + ValueType* out) +{ + for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { + out[iz] = in[iz]; + } +} + + +/** + * Copies the values of one multi-vector into another. + * + * Note that the output multi-vector should already have memory allocated + * and stride set. 
+ */ +template +__device__ __forceinline__ void copy( + const gko::batch::multi_vector::batch_item& in, + const gko::batch::multi_vector::batch_item& out) +{ + for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; + iz += blockDim.x) { + const int i = iz / in.num_rhs; + const int j = iz % in.num_rhs; + out.values[i * out.stride + j] = in.values[i * in.stride + j]; + } +} + + +template +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( + const gko::batch::multi_vector::uniform_batch src, + const gko::batch::multi_vector::uniform_batch dst) +{ + for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; + batch_id += gridDim.x) { + const auto dst_b = gko::batch::extract_batch_item(dst, batch_id); + const auto src_b = gko::batch::extract_batch_item(src, batch_id); + copy(src_b, dst_b); + } +} diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc index 5930902ed37..faf0ad15146 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc @@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, array& col_idxs) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. 
So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto value_ptr = - reinterpret_cast(values.get_const_data()); + using device_value_type = device_type; + auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_num_elems(); // count nonzeros - auto nnz = - thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { - return is_nonzero(fake_complex_unpack(value)); - }); + auto nnz = thrust::count_if( + thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { return is_nonzero(value); }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr exec, // copy nonzeros auto it = thrust::make_zip_iterator(thrust::make_tuple( row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr)); - auto out_it = thrust::make_zip_iterator(thrust::make_tuple( - new_row_idxs.get_data(), new_col_idxs.get_data(), - reinterpret_cast(new_values.get_data()))); - thrust::copy_if( - thrust_policy(exec), it, it + size, out_it, - [] __device__(tuple_type entry) { - return is_nonzero(fake_complex_unpack(thrust::get<2>(entry))); - }); + auto out_it = thrust::make_zip_iterator( + thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(), + as_device_type(new_values.get_data()))); + thrust::copy_if(thrust_policy(exec), it, it + size, out_it, + [] __device__(tuple_type entry) { + return is_nonzero(thrust::get<2>(entry)); + }); // swap out storage values = std::move(new_values); row_idxs = std::move(new_row_idxs); @@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr exec, size_type, array& values, array& row_idxs, array& col_idxs) { - using device_value_type = device_member_type; const auto size = values.get_num_elems(); const auto rows = row_idxs.get_const_data(); const auto cols = col_idxs.get_const_data(); @@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr exec, size_type, 
// reduce duplicates auto in_locs = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - auto in_vals = - reinterpret_cast(values.get_const_data()); + auto in_vals = as_device_type(values.get_const_data()); auto out_locs = thrust::make_zip_iterator(thrust::make_tuple( new_row_idxs.get_data(), new_col_idxs.get_data())); - auto out_vals = - reinterpret_cast(new_values.get_data()); + auto out_vals = as_device_type(new_values.get_data()); thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size, in_vals, out_locs, out_vals); // swap out storage @@ -127,13 +117,9 @@ template void sort_row_major(std::shared_ptr exec, device_matrix_data& data) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; auto it = thrust::make_zip_iterator( thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs())); - auto vals = reinterpret_cast(data.get_values()); + auto vals = as_device_type(data.get_values()); thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(), vals); } diff --git a/common/cuda_hip/base/executor.hpp.inc b/common/cuda_hip/base/executor.hpp.inc index 7e71a3e24c0..ad641ecea5b 100644 --- a/common/cuda_hip/base/executor.hpp.inc +++ b/common/cuda_hip/base/executor.hpp.inc @@ -40,7 +40,7 @@ inline int convert_sm_ver_to_cores(int major, int minor) // Defines for GPU Architecture types (using the SM version to determine // the # of cores per SM typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, + int SM; // 0xMm (hexadecimal notation), M = SM Major version, // and m = SM minor version int Cores; } sSMtoCores; diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/memory.hpp.inc similarity index 100% rename from common/cuda_hip/components/volatile.hpp.inc rename to 
common/cuda_hip/components/memory.hpp.inc diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 947c2c3afd7..584f44b6415 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * @internal * - * Compute a segement scan using add operation (+) of a subwarp. Each segment + * Compute a segment scan using add operation (+) of a subwarp. Each segment * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp.inc index 6b6dcc70f24..a8fa767e4dd 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp.inc @@ -93,18 +93,18 @@ public: const auto dep_block = dependency / (block_size / subwarp_size); const auto dep_local = dependency % (block_size / subwarp_size); // assert(dependency < work_id); - if (dep_block == block_id) { - // wait for a local dependency - while (!load(local.status, dep_local)) { - __threadfence(); - } - } else { - // wait for a global dependency - while (!load(global.status, dependency)) { - __threadfence(); + if (get_lane() == 0) { + if (dep_block == block_id) { + // wait for a local dependency + while (!load_acquire_shared(local.status + dep_local)) { + } + } else { + // wait for a global dependency + while (!load_acquire(global.status + dependency)) { + } } } - __threadfence(); + group::tiled_partition(group::this_thread_block()).sync(); } __device__ __forceinline__ bool peek(IndexType dependency) @@ -114,27 +114,22 @@ public: // assert(dependency < work_id); if (dep_block == block_id) { // peek at a local dependency - auto finished = load(local.status, dep_local) != 0; - __threadfence(); - return finished; + return 
load_acquire_shared(local.status + dep_local); } else { // peek at a global dependency - auto finished = load(global.status, dependency); - __threadfence(); - return finished; + return load_acquire(global.status + dependency); } } __device__ __forceinline__ void mark_ready() { group::tiled_partition(group::this_thread_block()).sync(); - __threadfence(); if (get_lane() == 0) { const auto sh_id = get_work_id() % (block_size / subwarp_size); // notify local warps - store(local.status, sh_id, 1); + store_release_shared(local.status + sh_id, 1); // notify other blocks - store(global.status, get_work_id(), 1); + store_release(global.status + get_work_id(), 1); } } diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc new file mode 100644 index 00000000000..f92794ec138 --- /dev/null +++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +template +void sort_by_range_start( + std::shared_ptr exec, + array& range_start_ends, + array& part_ids) +{ + auto num_ranges = range_start_ends.get_num_elems() / 2; + auto strided_indices = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __host__ __device__(const int i) { return 2 * i; }); + auto start_it = thrust::make_permutation_iterator( + range_start_ends.get_data(), strided_indices); + auto end_it = thrust::make_permutation_iterator( + range_start_ends.get_data() + 1, strided_indices); + auto zip_it = thrust::make_zip_iterator( + thrust::make_tuple(end_it, part_ids.get_data())); + thrust::stable_sort_by_key(thrust_policy(exec), start_it, + start_it + num_ranges, zip_it); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc index f87969a7ad0..eb90127a8ca 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc +++ b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc @@ -149,8 +149,6 @@ __global__ 
__launch_bounds__(default_block_size) void symbolic_factorize( template __global__ __launch_bounds__(default_block_size) void factorize( const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols, - const IndexType* __restrict__ elim_tree_child_ptrs, - const IndexType* __restrict__ elim_tree_children, const IndexType* __restrict__ storage_offsets, const int32* __restrict__ storage, const int64* __restrict__ row_descs, const IndexType* __restrict__ diag_idxs, @@ -171,32 +169,21 @@ __global__ __launch_bounds__(default_block_size) void factorize( const auto row_begin = row_ptrs[row]; const auto row_diag = diag_idxs[row]; const auto row_end = row_ptrs[row + 1]; - const auto child_begin = elim_tree_child_ptrs[row]; - const auto child_end = elim_tree_child_ptrs[row + 1]; gko::matrix::csr::device_sparsity_lookup lookup{ row_ptrs, cols, storage_offsets, storage, row_descs, static_cast(row)}; - for (auto child = child_begin; child < child_end; child++) { - const auto dep = elim_tree_children[child]; - scheduler.wait(dep); - // TODO evaluate parallel waiting with __all_sync - } - // for each lower triangular entry: eliminate with corresponding row + // for each lower triangular entry: eliminate with corresponding column for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) { const auto dep = cols[lower_nz]; - auto val = vals[lower_nz]; + scheduler.wait(dep); + const auto scale = vals[lower_nz]; const auto diag_idx = diag_idxs[dep]; const auto dep_end = row_ptrs[dep + 1]; - const auto diag = vals[diag_idx]; - const auto scale = val / diag; - if (lane == 0) { - vals[lower_nz] = scale; - } - // subtract all entries past the diagonal - for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end; + // subtract column dep from current column + for (auto upper_nz = diag_idx + lane; upper_nz < dep_end; upper_nz += config::warp_size) { const auto upper_col = cols[upper_nz]; - if (upper_col < row) { + if (upper_col >= row) { const auto upper_val = 
vals[upper_nz]; const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin; @@ -204,17 +191,16 @@ __global__ __launch_bounds__(default_block_size) void factorize( } } } - ValueType sum{}; - for (auto lower_nz = row_begin + lane; lower_nz < row_diag; - lower_nz += config::warp_size) { - sum += squared_norm(vals[lower_nz]); - // copy the lower triangular entries to the transpose - vals[transpose_idxs[lower_nz]] = conj(vals[lower_nz]); + auto diag_val = sqrt(vals[row_diag]); + for (auto upper_nz = row_diag + 1 + lane; upper_nz < row_end; + upper_nz += config::warp_size) { + vals[upper_nz] /= diag_val; + // copy the upper triangular entries to the transpose + vals[transpose_idxs[upper_nz]] = conj(vals[upper_nz]); } - sum = reduce(warp, sum, thrust::plus{}); if (lane == 0) { // store computed diagonal - vals[row_diag] = sqrt(vals[row_diag] - sum); + vals[row_diag] = diag_val; } scheduler.mark_ready(); } @@ -365,10 +351,9 @@ void factorize(std::shared_ptr exec, kernel::factorize<<get_stream()>>>( factors->get_const_row_ptrs(), factors->get_const_col_idxs(), - forest.child_ptrs.get_const_data(), - forest.children.get_const_data(), lookup_offsets, lookup_storage, - lookup_descs, diag_idxs, transpose_idxs, - as_device_type(factors->get_values()), storage, num_rows); + lookup_offsets, lookup_storage, lookup_descs, diag_idxs, + transpose_idxs, as_device_type(factors->get_values()), storage, + num_rows); } } diff --git a/common/cuda_hip/factorization/lu_kernels.hpp.inc b/common/cuda_hip/factorization/lu_kernels.hpp.inc index f3db34b3631..1503ede4be3 100644 --- a/common/cuda_hip/factorization/lu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/lu_kernels.hpp.inc @@ -106,7 +106,10 @@ __global__ __launch_bounds__(default_block_size) void factorize( // for each lower triangular entry: eliminate with corresponding row for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) { const auto dep = cols[lower_nz]; - auto val = vals[lower_nz]; + // we can load the 
value before synchronizing because the following + // updates only go past the diagonal of the dependency row, i.e. at + // least column dep + 1 + const auto val = vals[lower_nz]; const auto diag_idx = diag_idxs[dep]; const auto dep_end = row_ptrs[dep + 1]; scheduler.wait(dep); @@ -128,6 +131,88 @@ __global__ __launch_bounds__(default_block_size) void factorize( } +template +__global__ __launch_bounds__(default_block_size) void symbolic_factorize_simple( + const IndexType* __restrict__ mtx_row_ptrs, + const IndexType* __restrict__ mtx_cols, + const IndexType* __restrict__ factor_row_ptrs, + const IndexType* __restrict__ factor_cols, + const IndexType* __restrict__ storage_offsets, + const int32* __restrict__ storage, const int64* __restrict__ row_descs, + IndexType* __restrict__ diag_idxs, ValueType* __restrict__ factor_vals, + IndexType* __restrict__ out_row_nnz, syncfree_storage dep_storage, + size_type num_rows) +{ + using scheduler_t = + syncfree_scheduler; + __shared__ typename scheduler_t::shared_storage sh_dep_storage; + scheduler_t scheduler(dep_storage, sh_dep_storage); + const auto row = scheduler.get_work_id(); + if (row >= num_rows) { + return; + } + const auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = warp.thread_rank(); + const auto factor_begin = factor_row_ptrs[row]; + const auto factor_end = factor_row_ptrs[row + 1]; + const auto mtx_begin = mtx_row_ptrs[row]; + const auto mtx_end = mtx_row_ptrs[row + 1]; + gko::matrix::csr::device_sparsity_lookup lookup{ + factor_row_ptrs, factor_cols, storage_offsets, + storage, row_descs, static_cast(row)}; + const auto row_diag = lookup.lookup_unsafe(row) + factor_begin; + // fill with zeros first + for (auto nz = factor_begin + lane; nz < factor_end; + nz += config::warp_size) { + factor_vals[nz] = zero(); + } + warp.sync(); + // then fill in the system matrix + for (auto nz = mtx_begin + lane; nz < mtx_end; nz += config::warp_size) { + const auto col = mtx_cols[nz]; + 
factor_vals[lookup.lookup_unsafe(col) + factor_begin] = one(); + } + // finally set diagonal and store diagonal index + if (lane == 0) { + diag_idxs[row] = row_diag; + factor_vals[row_diag] = one(); + } + warp.sync(); + // for each lower triangular entry: eliminate with corresponding row + for (auto lower_nz = factor_begin; lower_nz < row_diag; lower_nz++) { + const auto dep = factor_cols[lower_nz]; + const auto dep_end = factor_row_ptrs[dep + 1]; + scheduler.wait(dep); + // read the diag entry after we are sure it was written. + const auto diag_idx = diag_idxs[dep]; + if (factor_vals[lower_nz] == one()) { + // eliminate with upper triangle/entries past the diagonal + for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end; + upper_nz += config::warp_size) { + const auto upper_col = factor_cols[upper_nz]; + const auto upper_val = factor_vals[upper_nz]; + const auto output_pos = + lookup.lookup_unsafe(upper_col) + factor_begin; + if (upper_val == one()) { + factor_vals[output_pos] = one(); + } + } + } + } + scheduler.mark_ready(); + IndexType row_nnz{}; + for (auto nz = factor_begin + lane; nz < factor_end; + nz += config::warp_size) { + row_nnz += factor_vals[nz] == one() ? 
1 : 0; + } + row_nnz = reduce(warp, row_nnz, thrust::plus{}); + if (lane == 0) { + out_row_nnz[row] = row_nnz; + } +} + + } // namespace kernel @@ -177,3 +262,70 @@ void factorize(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); + + +template +void symbolic_factorize_simple( + std::shared_ptr exec, const IndexType* row_ptrs, + const IndexType* col_idxs, const IndexType* lookup_offsets, + const int64* lookup_descs, const int32* lookup_storage, + matrix::Csr* factors, IndexType* out_row_nnz) +{ + const auto num_rows = factors->get_size()[0]; + const auto factor_row_ptrs = factors->get_const_row_ptrs(); + const auto factor_cols = factors->get_const_col_idxs(); + const auto factor_vals = factors->get_values(); + array diag_idx_array{exec, num_rows}; + array tmp_storage{exec}; + const auto diag_idxs = diag_idx_array.get_data(); + if (num_rows > 0) { + syncfree_storage dep_storage(exec, tmp_storage, num_rows); + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::symbolic_factorize_simple<<get_stream()>>>( + row_ptrs, col_idxs, factor_row_ptrs, factor_cols, lookup_offsets, + lookup_storage, lookup_descs, diag_idxs, factor_vals, out_row_nnz, + dep_storage, num_rows); + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE); + + +struct first_eq_one_functor { + template + __device__ __forceinline__ bool operator()(Pair pair) const + { + return thrust::get<0>(pair) == one(); + } +}; + + +struct return_second_functor { + template + __device__ __forceinline__ auto operator()(Pair pair) const + { + return thrust::get<1>(pair); + } +}; + + +template +void symbolic_factorize_simple_finalize( + std::shared_ptr exec, + const matrix::Csr* factors, IndexType* out_col_idxs) +{ + const auto col_idxs = factors->get_const_col_idxs(); + const auto vals = factors->get_const_values(); + const auto input_it = + thrust::make_zip_iterator(thrust::make_tuple(vals, 
col_idxs)); + const auto output_it = thrust::make_transform_output_iterator( + out_col_idxs, return_second_functor{}); + thrust::copy_if(thrust_policy(exec), input_it, + input_it + factors->get_num_stored_elements(), output_it, + first_eq_one_functor{}); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE); diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.hpp.inc index 9a4d605c6a3..7a3b3da8e32 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ic_kernels.hpp.inc @@ -78,16 +78,18 @@ __global__ __launch_bounds__(default_block_size) void ic_sweep( auto l_col = l_col_idxs[l_row_begin]; auto lh_row = l_col_idxs[lh_col_begin]; if (l_col == lh_row && l_col < last_entry) { - sum += l_vals[l_row_begin] * conj(l_vals[lh_col_begin]); + sum += load_relaxed(l_vals + l_row_begin) * + conj(load_relaxed(l_vals + lh_col_begin)); } l_row_begin += l_col <= lh_row; lh_col_begin += l_col >= lh_row; } - auto to_write = row == col - ? sqrt(a_val - sum) - : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; + auto to_write = + row == col + ? 
sqrt(a_val - sum) + : (a_val - sum) / load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_vals[l_nz] = to_write; + store_relaxed(l_vals + l_nz, to_write); } } diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc index 7eccbda61d2..d54fe3c6c77 100644 --- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc @@ -75,8 +75,8 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( // we don't need to use the `bool valid` because last_entry is // already a smaller sentinel value than the one used in group_merge if (l_col == lh_row && l_col < last_entry) { - sum += l_vals[l_idx + l_row_begin] * - conj(l_vals[lh_idx + lh_col_begin]); + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + conj(load_relaxed(l_vals + (lh_idx + lh_col_begin))); } // remember the transposed element auto found_transp = subwarp.ballot(lh_row == row); @@ -90,11 +90,12 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); if (subwarp.thread_rank() == 0) { - auto to_write = row == col - ? sqrt(a_val - sum) - : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; + auto to_write = + row == col ? 
sqrt(a_val - sum) + : (a_val - sum) / + load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_vals[l_nz] = to_write; + store_relaxed(l_vals + l_nz, to_write); } } } diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc index 08bd5bf8b4e..6785c161674 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc @@ -57,7 +57,8 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors( const auto u_col = u_col_idxs[u_idx]; last_operation = zero(); if (l_col == u_col) { - last_operation = l_values[l_idx] * u_values[u_idx]; + last_operation = load_relaxed(l_values + l_idx) * + load_relaxed(u_values + u_idx); sum -= last_operation; } l_idx += (l_col <= u_col); @@ -65,14 +66,15 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors( } sum += last_operation; // undo the last operation if (row > col) { - auto to_write = sum / u_values[u_row_ptrs[col + 1] - 1]; + auto to_write = + sum / load_relaxed(u_values + (u_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_values[l_idx - 1] = to_write; + store_relaxed(l_values + (l_idx - 1), to_write); } } else { auto to_write = sum; if (is_finite(to_write)) { - u_values[u_idx - 1] = to_write; + store_relaxed(u_values + (u_idx - 1), to_write); } } } diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc index e99888b35b3..d3cc4330c39 100644 --- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc @@ -87,8 +87,8 @@ __global__ __launch_bounds__(default_block_size) void sweep( // we don't need to use the `bool valid` because last_entry is // already a smaller sentinel value than the one used in group_merge if (l_col == ut_row && l_col < last_entry) { - sum += l_vals[l_idx + 
l_row_begin] * - ut_vals[ut_idx + ut_col_begin]; + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + load_relaxed(ut_vals + (ut_idx + ut_col_begin)); } // remember the transposed element auto found_transp = subwarp.ballot(ut_row == row); @@ -103,15 +103,16 @@ __global__ __launch_bounds__(default_block_size) void sweep( if (subwarp.thread_rank() == 0) { if (lower) { - auto to_write = (a_val - sum) / ut_vals[ut_col_ptrs[col + 1] - 1]; + auto to_write = (a_val - sum) / + load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_vals[l_nz] = to_write; + store_relaxed(l_vals + l_nz, to_write); } } else { auto to_write = a_val - sum; if (is_finite(to_write)) { - u_vals[u_nz] = to_write; - ut_vals[ut_nz] = to_write; + store_relaxed(u_vals + u_nz, to_write); + store_relaxed(ut_vals + ut_nz, to_write); } } } diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp.inc new file mode 100644 index 00000000000..e8cf77960ef --- /dev/null +++ b/common/cuda_hip/log/batch_logger.hpp.inc @@ -0,0 +1,56 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +/** + * @see reference/log/batch_logger.hpp + */ +template +class SimpleFinalLogger final { +public: + using real_type = RealType; + + SimpleFinalLogger(real_type* const batch_residuals, int* const batch_iters) + : final_residuals_{batch_residuals}, final_iters_{batch_iters} + {} + + __device__ __forceinline__ void log_iteration(const size_type batch_idx, + const int iter, + const real_type res_norm) + { + final_iters_[batch_idx] = iter; + final_residuals_[batch_idx] = res_norm; + } + +private: + real_type* const final_residuals_; + int* const final_iters_; +}; diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc new file mode 100644 index 00000000000..23ae8ebd5f0 --- /dev/null +++ b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::Dense* mat, + const batch::MultiVector* b, + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::Dense* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, + beta_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL); diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc new file mode 100644 index 00000000000..7a38cfea215 --- /dev/null +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -0,0 +1,164 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +__device__ __forceinline__ void simple_apply( + const gko::batch::matrix::dense::batch_item& mat, + const ValueType* const __restrict__ b, ValueType* const __restrict__ x) +{ + constexpr auto tile_size = config::warp_size; + + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int row = subgroup_id; row < mat.num_rows; + row += num_subgroups_per_block) { + ValueType temp = zero(); + for (int j = subgroup.thread_rank(); j < mat.num_cols; + j += subgroup.size()) { + const ValueType val = mat.values[row * mat.stride + j]; + temp += val * b[j]; + } + + // subgroup level reduction + temp = reduce(subgroup, temp, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + x[row] = temp; + } + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: + dense::uniform_batch< + const ValueType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + simple_apply(mat_b, b_b.values, x_b.values); + } +} + + +template +__device__ __forceinline__ void advanced_apply( + const ValueType alpha, + const gko::batch::matrix::dense::batch_item& mat, + const ValueType* const __restrict__ b, const ValueType beta, + ValueType* const __restrict__ x) +{ + constexpr auto tile_size = config::warp_size; + + auto thread_block = 
group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int row = subgroup_id; row < mat.num_rows; + row += num_subgroups_per_block) { + ValueType temp = zero(); + for (int j = subgroup.thread_rank(); j < mat.num_cols; + j += subgroup.size()) { + const ValueType val = mat.values[row * mat.stride + j]; + temp += alpha * val * b[j]; + } + + // subgroup level reduction + temp = reduce(subgroup, temp, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + x[row] = temp + beta * x[row]; + } + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void advanced_apply_kernel(const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch::matrix:: + dense::uniform_batch< + const ValueType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + beta, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto beta_b = gko::batch::extract_batch_item(beta, batch_id); + advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values); + } +} diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc new file mode 100644 index 00000000000..f8da432aa4d --- /dev/null +++ b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc @@ -0,0 +1,78 
@@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +template +void simple_apply(std::shared_ptr exec, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + simple_apply_kernel<<get_stream()>>>(mat_ub, b_ub, x_ub); +} + + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( + GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL); + + +template +void advanced_apply(std::shared_ptr exec, + const batch::MultiVector* alpha, + const batch::matrix::Ell* mat, + const batch::MultiVector* b, + const batch::MultiVector* beta, + batch::MultiVector* x) +{ + const auto num_blocks = mat->get_num_batch_items(); + const auto b_ub = get_batch_struct(b); + const auto x_ub = get_batch_struct(x); + const auto mat_ub = get_batch_struct(mat); + const auto alpha_ub = get_batch_struct(alpha); + const auto beta_ub = get_batch_struct(beta); + if (b->get_common_size()[1] > 1) { + GKO_NOT_IMPLEMENTED; + } + advanced_apply_kernel<<get_stream()>>>(alpha_ub, mat_ub, b_ub, + beta_ub, x_ub); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE( + GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL); diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc new file mode 100644 index 00000000000..de6ca879890 --- /dev/null +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -0,0 +1,156 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + + +template +__device__ __forceinline__ void simple_apply( + const gko::batch::matrix::ell::batch_item& mat, + const ValueType* const __restrict__ b, ValueType* const __restrict__ x) +{ + const auto num_rows = mat.num_rows; + const auto num_stored_elements_per_row = mat.num_stored_elems_per_row; + const auto stride = mat.stride; + const auto val = mat.values; + const auto col = mat.col_idxs; + for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) { + auto temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx == invalid_index()) { + break; + } else { + temp += val[ind] * b[col_idx]; + } + } + x[tidx] = temp; + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix:: + ell::uniform_batch< + const ValueType, + IndexType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + simple_apply(mat_b, b_b.values, x_b.values); + } +} + + +template +__device__ __forceinline__ void advanced_apply( + const ValueType alpha, + const gko::batch::matrix::ell::batch_item& mat, + const ValueType* const __restrict__ b, const ValueType beta, + ValueType* const __restrict__ x) +{ + const auto num_rows = mat.num_rows; + const auto num_stored_elements_per_row = mat.num_stored_elems_per_row; + const auto stride = mat.stride; + const auto val = mat.values; + const auto col = mat.col_idxs; + for (int tidx = 
threadIdx.x; tidx < num_rows; tidx += blockDim.x) { + auto temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx == invalid_index()) { + break; + } else { + temp += alpha * val[ind] * b[col_idx]; + } + } + x[tidx] = temp + beta * x[tidx]; + } +} + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void advanced_apply_kernel(const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch::matrix:: + ell::uniform_batch< + const ValueType, + IndexType> + mat, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + b, + const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + beta, + const gko::batch:: + multi_vector:: + uniform_batch< + ValueType> + x) +{ + for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items; + batch_id += gridDim.x) { + const auto mat_b = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const auto b_b = gko::batch::extract_batch_item(b, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto beta_b = gko::batch::extract_batch_item(beta, batch_id); + advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0], + x_b.values); + } +} diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc index 0fce02aecfa..35718464c42 100644 --- a/common/cuda_hip/matrix/csr_common.hpp.inc +++ b/common/cuda_hip/matrix/csr_common.hpp.inc @@ -102,7 +102,6 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries( if (tile_grp.thread_rank() == 0) { *has_all_diags = false; } - return; } } } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index e73dfde00fb..9687678dc58 100644 --- 
a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -606,7 +606,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam( } // advance by the number of merged elements // in theory, we would need to mask by `valid`, but this - // would only be false somwhere in the last iteration, where + // would only be false somewhere in the last iteration, where // we don't need the value of c_begin afterwards, anyways. c_begin += popcnt(~prev_equal_mask & lanemask_full); return true; @@ -665,8 +665,8 @@ __global__ __launch_bounds__(default_block_size) void row_ptr_permute( if (tid >= num_rows) { return; } - auto in_row = permutation[tid]; - auto out_row = tid; + const auto in_row = permutation[tid]; + const auto out_row = tid; out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row]; } @@ -680,8 +680,8 @@ __global__ __launch_bounds__(default_block_size) void inv_row_ptr_permute( if (tid >= num_rows) { return; } - auto in_row = tid; - auto out_row = permutation[tid]; + const auto in_row = tid; + const auto out_row = permutation[tid]; out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row]; } @@ -699,12 +699,12 @@ __global__ __launch_bounds__(default_block_size) void row_permute( if (tid >= num_rows) { return; } - auto lane = threadIdx.x % subwarp_size; - auto in_row = permutation[tid]; - auto out_row = tid; - auto in_begin = in_row_ptrs[in_row]; - auto in_size = in_row_ptrs[in_row + 1] - in_begin; - auto out_begin = out_row_ptrs[out_row]; + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = permutation[tid]; + const auto out_row = tid; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { out_cols[out_begin + i] = in_cols[in_begin + i]; out_vals[out_begin + i] = in_vals[in_begin + i]; @@ -725,12 +725,12 @@ __global__ 
__launch_bounds__(default_block_size) void inv_row_permute( if (tid >= num_rows) { return; } - auto lane = threadIdx.x % subwarp_size; - auto in_row = tid; - auto out_row = permutation[tid]; - auto in_begin = in_row_ptrs[in_row]; - auto in_size = in_row_ptrs[in_row + 1] - in_begin; - auto out_begin = out_row_ptrs[out_row]; + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = tid; + const auto out_row = permutation[tid]; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { out_cols[out_begin + i] = in_cols[in_begin + i]; out_vals[out_begin + i] = in_vals[in_begin + i]; @@ -751,12 +751,12 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute( if (tid >= num_rows) { return; } - auto lane = threadIdx.x % subwarp_size; - auto in_row = tid; - auto out_row = permutation[tid]; - auto in_begin = in_row_ptrs[in_row]; - auto in_size = in_row_ptrs[in_row + 1] - in_begin; - auto out_begin = out_row_ptrs[out_row]; + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = tid; + const auto out_row = permutation[tid]; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; for (IndexType i = lane; i < in_size; i += subwarp_size) { out_cols[out_begin + i] = permutation[in_cols[in_begin + i]]; out_vals[out_begin + i] = in_vals[in_begin + i]; @@ -764,6 +764,147 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute( } +template +__global__ __launch_bounds__(default_block_size) void inv_nonsymm_permute( + size_type num_rows, const IndexType* __restrict__ row_permutation, + const IndexType* __restrict__ col_permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* 
__restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = tid; + const auto out_row = row_permutation[tid]; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + out_cols[out_begin + i] = col_permutation[in_cols[in_begin + i]]; + out_vals[out_begin + i] = in_vals[in_begin + i]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void row_scale_permute( + size_type num_rows, const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = permutation[tid]; + const auto out_row = tid; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + out_cols[out_begin + i] = in_cols[in_begin + i]; + out_vals[out_begin + i] = in_vals[in_begin + i] * scale[in_row]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void inv_row_scale_permute( + size_type num_rows, const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* 
__restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = tid; + const auto out_row = permutation[tid]; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + out_cols[out_begin + i] = in_cols[in_begin + i]; + out_vals[out_begin + i] = in_vals[in_begin + i] / scale[out_row]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void inv_symm_scale_permute( + size_type num_rows, const ValueType* __restrict__ scale, + const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, + const IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = tid; + const auto out_row = permutation[tid]; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + const auto out_col = permutation[in_cols[in_begin + i]]; + out_cols[out_begin + i] = out_col; + out_vals[out_begin + i] = + in_vals[in_begin + i] / (scale[out_row] * scale[out_col]); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void inv_nonsymm_scale_permute( + size_type num_rows, const ValueType* __restrict__ row_scale, + const IndexType* __restrict__ row_permutation, + const ValueType* __restrict__ col_scale, + const IndexType* __restrict__ col_permutation, + const IndexType* __restrict__ in_row_ptrs, + const 
IndexType* __restrict__ in_cols, + const ValueType* __restrict__ in_vals, + const IndexType* __restrict__ out_row_ptrs, + IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals) +{ + auto tid = thread::get_subwarp_id_flat(); + if (tid >= num_rows) { + return; + } + const auto lane = threadIdx.x % subwarp_size; + const auto in_row = tid; + const auto out_row = row_permutation[tid]; + const auto in_begin = in_row_ptrs[in_row]; + const auto in_size = in_row_ptrs[in_row + 1] - in_begin; + const auto out_begin = out_row_ptrs[out_row]; + for (IndexType i = lane; i < in_size; i += subwarp_size) { + const auto out_col = col_permutation[in_cols[in_begin + i]]; + out_cols[out_begin + i] = out_col; + out_vals[out_begin + i] = + in_vals[in_begin + i] / (row_scale[out_row] * col_scale[out_col]); + } +} + + template __global__ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( @@ -826,15 +967,19 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity( auto tile_grp = group::tiled_partition(group::this_thread_block()); const auto warpid = thread::get_subwarp_id_flat(); - const auto num_warps = thread::get_subwarp_num_flat(); if (warpid < num_rows) { const auto tid_in_warp = tile_grp.thread_rank(); const IndexType row_start = row_ptrs[warpid]; const IndexType num_nz = row_ptrs[warpid + 1] - row_start; + const auto beta_val = beta[0]; + const auto alpha_val = alpha[0]; for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { - values[iz + row_start] *= beta[0]; - if (col_idxs[iz + row_start] == warpid) { - values[iz + row_start] += alpha[0]; + if (beta_val != one()) { + values[iz + row_start] *= beta_val; + } + if (col_idxs[iz + row_start] == warpid && + alpha_val != zero()) { + values[iz + row_start] += alpha_val; } } } @@ -872,11 +1017,7 @@ void convert_to_fbcsr(std::shared_ptr exec, } auto in_rows = in_row_idxs.get_data(); auto in_cols = in_col_idxs.get_data(); - // workaround for CUDA 9.2 Thrust: Their complex<> 
implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - auto in_vals = - reinterpret_cast*>(in_values.get_data()); + auto in_vals = as_device_type(in_values.get_data()); auto in_loc_it = thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols)); thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz, @@ -924,22 +1065,19 @@ void convert_to_fbcsr(std::shared_ptr exec, // fill in values components::fill_array(exec, block_value_array.get_data(), num_blocks * bs * bs, zero()); - thrust::for_each_n( - thrust_policy(exec), iota, num_blocks, - [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, - values] __device__(size_type i) { - const auto block_begin = block_ptrs[i]; - const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; - for (auto nz = block_begin; nz < block_end; nz++) { - values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); - } - }); + thrust::for_each_n(thrust_policy(exec), iota, num_blocks, + [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, + in_vals, values] __device__(size_type i) { + const auto block_begin = block_ptrs[i]; + const auto block_end = + i < num_blocks - 1 ? 
block_ptrs[i + 1] : nnz; + for (auto nz = block_begin; nz < block_end; nz++) { + values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = in_vals[nz]; + } + }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - namespace kernel { @@ -1122,7 +1260,407 @@ void build_lookup(std::shared_ptr exec, storage); } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + +namespace { + + +template +void spgeam(syn::value_list, + std::shared_ptr exec, const ValueType* alpha, + const IndexType* a_row_ptrs, const IndexType* a_col_idxs, + const ValueType* a_vals, const ValueType* beta, + const IndexType* b_row_ptrs, const IndexType* b_col_idxs, + const ValueType* b_vals, matrix::Csr* c) +{ + auto m = static_cast(c->get_size()[0]); + auto c_row_ptrs = c->get_row_ptrs(); + // count nnz for alpha * A + beta * B + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(m, subwarps_per_block); + if (num_blocks > 0) { + kernel::spgeam_nnz + <<get_stream()>>>( + a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + } + + // build row pointers + components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1); + + // accumulate non-zeros for alpha * A + beta * B + matrix::CsrBuilder c_builder{c}; + auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); + c_builder.get_col_idx_array().resize_and_reset(c_nnz); + c_builder.get_value_array().resize_and_reset(c_nnz); + auto c_col_idxs = c->get_col_idxs(); + auto c_vals = c->get_values(); + if (num_blocks > 0) { + kernel::spgeam + <<get_stream()>>>( + as_device_type(alpha), a_row_ptrs, a_col_idxs, + as_device_type(a_vals), as_device_type(beta), b_row_ptrs, + b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs, + as_device_type(c_vals)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam); + + +} // namespace + + +template +void spgeam(std::shared_ptr exec, + const matrix::Dense* alpha, + const 
matrix::Csr* a, + const matrix::Dense* beta, + const matrix::Csr* b, + matrix::Csr* c) +{ + auto total_nnz = + a->get_num_stored_elements() + b->get_num_stored_elements(); + auto nnz_per_row = total_nnz / a->get_size()[0]; + select_spgeam( + spgeam_kernels(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= nnz_per_row || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, + alpha->get_const_values(), a->get_const_row_ptrs(), + a->get_const_col_idxs(), a->get_const_values(), + beta->get_const_values(), b->get_const_row_ptrs(), + b->get_const_col_idxs(), b->get_const_values(), c); +} + + +template +void fill_in_dense(std::shared_ptr exec, + const matrix::Csr* source, + matrix::Dense* result) +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto stride = result->get_stride(); + const auto row_ptrs = source->get_const_row_ptrs(); + const auto col_idxs = source->get_const_col_idxs(); + const auto vals = source->get_const_values(); + + auto grid_dim = ceildiv(num_rows, default_block_size); + if (grid_dim > 0) { + kernel::fill_in_dense<<get_stream()>>>( + num_rows, as_device_type(row_ptrs), as_device_type(col_idxs), + as_device_type(vals), stride, as_device_type(result->get_values())); + } +} + + +template +void inv_symm_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_symm_permute + <<get_stream()>>>( + num_rows, perm, 
orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_perm, const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, row_perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_nonsymm_permute + <<get_stream()>>>( + num_rows, row_perm, col_perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::row_permute + <<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + 
as_device_type(row_permuted->get_values())); + } +} + + +template +void inv_row_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_row_permute + <<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_symm_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(scale), perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* 
row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Csr* orig, + matrix::Csr* permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, row_perm, orig->get_const_row_ptrs(), + permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_nonsymm_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(row_scale), row_perm, + as_device_type(col_scale), col_perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + permuted->get_row_ptrs(), permuted->get_col_idxs(), + as_device_type(permuted->get_values())); + } +} + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::row_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(scale), perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + 
const matrix::Csr* orig, + matrix::Csr* row_permuted) +{ + auto num_rows = orig->get_size()[0]; + auto count_num_blocks = ceildiv(num_rows, default_block_size); + if (count_num_blocks > 0) { + kernel::inv_row_ptr_permute<<get_stream()>>>( + num_rows, perm, orig->get_const_row_ptrs(), + row_permuted->get_row_ptrs()); + } + components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(), + num_rows + 1); + auto copy_num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + if (copy_num_blocks > 0) { + kernel::inv_row_scale_permute + <<get_stream()>>>( + num_rows, as_device_type(scale), perm, + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), + as_device_type(orig->get_const_values()), + row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(), + as_device_type(row_permuted->get_values())); + } +} + + +template +void calculate_nonzeros_per_row_in_span( + std::shared_ptr exec, + const matrix::Csr* source, const span& row_span, + const span& col_span, array* row_nnz) +{ + const auto num_rows = source->get_size()[0]; + auto row_ptrs = source->get_const_row_ptrs(); + auto col_idxs = source->get_const_col_idxs(); + auto grid_dim = ceildiv(row_span.length(), default_block_size); + if (grid_dim > 0) { + kernel::calculate_nnz_per_row_in_span<<get_stream()>>>( + row_span, col_span, as_device_type(row_ptrs), + as_device_type(col_idxs), as_device_type(row_nnz->get_data())); + } +} + + +template +void compute_submatrix(std::shared_ptr exec, + const matrix::Csr* source, + gko::span row_span, gko::span col_span, + matrix::Csr* result) +{ + auto row_offset = row_span.begin; + auto col_offset = col_span.begin; + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto row_ptrs = source->get_const_row_ptrs(); + auto grid_dim = ceildiv(num_rows, default_block_size); + if (grid_dim > 0) { + kernel::compute_submatrix_idxs_and_vals<<get_stream()>>>( + num_rows, num_cols, row_offset, col_offset, + 
as_device_type(source->get_const_row_ptrs()), + as_device_type(source->get_const_col_idxs()), + as_device_type(source->get_const_values()), + as_device_type(result->get_const_row_ptrs()), + as_device_type(result->get_col_idxs()), + as_device_type(result->get_values())); + } +} + + +template +void calculate_nonzeros_per_row_in_index_set( + std::shared_ptr exec, + const matrix::Csr* source, + const gko::index_set& row_index_set, + const gko::index_set& col_index_set, + IndexType* row_nnz) GKO_NOT_IMPLEMENTED; + + +template +void compute_submatrix_from_index_set( + std::shared_ptr exec, + const matrix::Csr* source, + const gko::index_set& row_index_set, + const gko::index_set& col_index_set, + matrix::Csr* result) GKO_NOT_IMPLEMENTED; template @@ -1135,13 +1673,10 @@ void fallback_transpose(std::shared_ptr exec, const auto nnz = output->get_num_stored_elements(); const auto in_row_ptrs = input->get_const_row_ptrs(); const auto in_col_idxs = input->get_const_col_idxs(); - // workaround for CUDA 9.2 Thrust unconstrained constructor issues - const auto in_vals = reinterpret_cast*>( - input->get_const_values()); + const auto in_vals = as_device_type(input->get_const_values()); const auto out_row_ptrs = output->get_row_ptrs(); const auto out_col_idxs = output->get_col_idxs(); - const auto out_vals = - reinterpret_cast*>(output->get_values()); + const auto out_vals = as_device_type(output->get_values()); array out_row_idxs{exec, nnz}; components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows, out_col_idxs); @@ -1161,8 +1696,7 @@ void fallback_sort(std::shared_ptr exec, { const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - const auto vals = - reinterpret_cast*>(to_sort->get_values()); + const auto vals = as_device_type(to_sort->get_values()); const auto nnz = to_sort->get_num_stored_elements(); const auto num_rows = to_sort->get_size()[0]; array row_idx_array(exec, nnz); @@ -1178,3 +1712,91 @@ void 
fallback_sort(std::shared_ptr exec, thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, col_val_it); } + + +template +void is_sorted_by_column_index( + std::shared_ptr exec, + const matrix::Csr* to_check, bool* is_sorted) +{ + *is_sorted = true; + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); + auto gpu_array = array{exec, cpu_array}; + auto block_size = default_block_size; + auto num_rows = static_cast(to_check->get_size()[0]); + auto num_blocks = ceildiv(num_rows, block_size); + if (num_blocks > 0) { + kernel:: + check_unsorted<<get_stream()>>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_rows, gpu_array.get_data()); + } + cpu_array = gpu_array; +} + + +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Csr* orig, + matrix::Diagonal* diag) +{ + const auto nnz = orig->get_num_stored_elements(); + const auto diag_size = diag->get_size()[0]; + const auto num_blocks = + ceildiv(config::warp_size * diag_size, default_block_size); + + const auto orig_values = orig->get_const_values(); + const auto orig_row_ptrs = orig->get_const_row_ptrs(); + const auto orig_col_idxs = orig->get_const_col_idxs(); + auto diag_values = diag->get_values(); + if (num_blocks > 0) { + kernel::extract_diagonal<<get_stream()>>>( + diag_size, nnz, as_device_type(orig_values), + as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs), + as_device_type(diag_values)); + } +} + + +template +void check_diagonal_entries_exist( + std::shared_ptr exec, + const matrix::Csr* const mtx, bool& has_all_diags) +{ + const auto num_diag = static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])); + if (num_diag > 0) { + const IndexType num_blocks = + ceildiv(num_diag, default_block_size / config::warp_size); + array has_diags(exec, {true}); + kernel::check_diagonal_entries<<get_stream()>>>( + num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + has_diags.get_data()); + has_all_diags = 
exec->copy_val_to_host(has_diags.get_const_data()); + } else { + has_all_diags = true; + } +} + + +template +void add_scaled_identity(std::shared_ptr exec, + const matrix::Dense* const alpha, + const matrix::Dense* const beta, + matrix::Csr* const mtx) +{ + const auto nrows = mtx->get_size()[0]; + if (nrows == 0) { + return; + } + const auto nthreads = nrows * config::warp_size; + const auto nblocks = ceildiv(nthreads, default_block_size); + kernel::add_scaled_identity<<get_stream()>>>( + as_device_type(alpha->get_const_values()), + as_device_type(beta->get_const_values()), static_cast(nrows), + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_device_type(mtx->get_values())); +} diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc index 6c81fb4964c..e7bcac351cb 100644 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/ell_kernels.hpp.inc @@ -43,13 +43,14 @@ __device__ void spmv_kernel( acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(); const decltype(tidx) column_id = blockIdx.y; if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. 
It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -69,13 +70,13 @@ __device__ void spmv_kernel( const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; __shared__ uninitialized_array< - OutputValueType, default_block_size / num_thread_per_worker> + arithmetic_type, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { storage[threadIdx.x] = 0; } __syncthreads(); - auto temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -114,7 +115,9 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }); + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }); } @@ -128,7 +131,8 @@ __global__ __launch_bounds__(default_block_size) void spmv( const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -139,16 +143,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); 
}); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }); } } diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 27314c06a59..607ec5046ea 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } auto in_rows = data.get_row_idxs(); auto in_cols = data.get_col_idxs(); - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - auto in_vals = - reinterpret_cast*>(data.get_values()); + auto in_vals = as_device_type(data.get_values()); auto in_loc_it = thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols)); thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz, @@ -232,15 +228,11 @@ void fill_in_matrix_data(std::shared_ptr exec, const auto block_end = i < num_blocks - 1 ? 
block_ptrs[i + 1] : nnz; for (auto nz = block_begin; nz < block_end; nz++) { block_values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = - fake_complex_unpack(in_vals[nz]); + (in_rows[nz] % bs)] = in_vals[nz]; } }); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL); - namespace kernel { @@ -323,9 +315,6 @@ void fill_in_dense(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); - template void convert_to_csr(const std::shared_ptr exec, @@ -345,9 +334,6 @@ void convert_to_csr(const std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); - template void is_sorted_by_column_index( @@ -372,23 +358,14 @@ void is_sorted_by_column_index( *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); - template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); - template void extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Diagonal* diag) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc index dddd7946a04..2d2ca9a5183 100644 --- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc @@ -121,19 +121,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void sort_by_column_index(std::shared_ptr exec, - matrix::SparsityCsr* to_sort) - GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); - - -template -void 
is_sorted_by_column_index( - std::shared_ptr exec, - const matrix::SparsityCsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); +void fallback_sort(std::shared_ptr exec, + matrix::SparsityCsr* to_sort) +{ + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto col_idxs = to_sort->get_col_idxs(); + const auto nnz = to_sort->get_num_nonzeros(); + const auto num_rows = to_sort->get_size()[0]; + array row_idx_array(exec, nnz); + const auto row_idxs = row_idx_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs); + // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort + thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz, + row_idxs); + thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, + col_idxs); +} diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc index 30cce92b8de..b08e86efaaa 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc +++ b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc @@ -45,15 +45,11 @@ template void sort_row_major(std::shared_ptr exec, size_type nnz, IndexType* row_idxs, IndexType* col_idxs, ValueType* vals) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. 
So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto vals_it = reinterpret_cast(vals); + auto vals_it = as_device_type(vals); auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); - // Because reduce_by_key is not determinstic, so we do not need + // Because reduce_by_key is not deterministic, so we do not need // stable_sort_by_key - // TODO: If we have determinstic reduce_by_key, it should be + // TODO: If we have deterministic reduce_by_key, it should be // stable_sort_by_key thrust::sort_by_key(thrust_policy(exec), it, it + nnz, vals_it); } @@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr exec, const IndexType* col_idxs, const ValueType* vals, matrix::Coo* coarse_coo) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto vals_it = reinterpret_cast(vals); + auto vals_it = as_device_type(vals); auto key_it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); - auto coarse_vals_it = - reinterpret_cast(coarse_coo->get_values()); + auto coarse_vals_it = as_device_type(coarse_coo->get_values()); auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple( coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs())); diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp.inc b/common/cuda_hip/preconditioner/batch_identity.hpp.inc new file mode 100644 index 00000000000..923ed4ce946 --- /dev/null +++ b/common/cuda_hip/preconditioner/batch_identity.hpp.inc @@ -0,0 +1,61 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +/** + * @see reference/preconditioner/batch_identity.hpp + */ +template +class Identity final { +public: + using value_type = ValueType; + + static constexpr int work_size = 0; + + __host__ __device__ static constexpr int dynamic_work_size(int, int) + { + return 0; + } + + template + __device__ __forceinline__ void generate(size_type, const batch_item_type&, + ValueType*) + {} + + __device__ __forceinline__ void apply(const int num_rows, + const ValueType* const r, + ValueType* const z) const + { + for (int li = threadIdx.x; li < num_rows; li += blockDim.x) { + z[li] = r[li]; + } + } +}; diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc index 8827a47620b..2a0f7bd0dd7 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc @@ -197,23 +197,23 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi( const auto block_stride = storage_scheme.get_stride(); const auto rank = subwarp.thread_rank(); - if (rank < block_size) { - GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( - ValueType, block_precisions[block_id], - auto local_block = - reinterpret_cast( - blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id); - auto local_out_block = - reinterpret_cast( - out_blocks + storage_scheme.get_group_offset(block_id)) + - storage_scheme.get_block_offset(block_id); - for (IndexType i = 0; i < block_size; ++i) { - auto val = local_block[i * block_stride + rank]; - local_out_block[i + rank * block_stride] = - conjugate ? 
conj(val) : val; - }); - } + GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( + ValueType, block_precisions[block_id], + auto local_block = + reinterpret_cast( + blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + auto local_out_block = + reinterpret_cast( + out_blocks + storage_scheme.get_group_offset(block_id)) + + storage_scheme.get_block_offset(block_id); + for (int i = rank; i < block_size * block_size; i += subwarp_size) { + int row = i % block_size; + int col = i / block_size; + auto val = local_block[row + col * block_stride]; + local_out_block[row * block_stride + col] = + conjugate ? conj(val) : val; + }); } diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc new file mode 100644 index 00000000000..faee2e069a7 --- /dev/null +++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc @@ -0,0 +1,382 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +__device__ __forceinline__ void initialize( + Group subgroup, const int num_rows, const BatchMatrixType_entry& mat_entry, + const ValueType* const b_global_entry, + const ValueType* const x_global_entry, ValueType& rho_old, ValueType& omega, + ValueType& alpha, ValueType* const x_shared_entry, + ValueType* const r_shared_entry, ValueType* const r_hat_shared_entry, + ValueType* const p_shared_entry, ValueType* const p_hat_shared_entry, + ValueType* const v_shared_entry, + typename gko::remove_complex& rhs_norm, + typename gko::remove_complex& res_norm) +{ + rho_old = one(); + omega = one(); + alpha = one(); + + // copy x from global to shared memory + // r = b + for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { + x_shared_entry[iz] = x_global_entry[iz]; + r_shared_entry[iz] = b_global_entry[iz]; + } + __syncthreads(); + + // r = b - A*x + advanced_apply(static_cast(-1.0), mat_entry, x_shared_entry, + static_cast(1.0), r_shared_entry); + __syncthreads(); + + if (threadIdx.x / config::warp_size == 0) { + single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm); + } else if (threadIdx.x / config::warp_size == 1) 
{ + // Compute norms of rhs + single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm); + } + __syncthreads(); + + for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { + r_hat_shared_entry[iz] = r_shared_entry[iz]; + p_shared_entry[iz] = zero(); + p_hat_shared_entry[iz] = zero(); + v_shared_entry[iz] = zero(); + } +} + + +template +__device__ __forceinline__ void update_p( + const int num_rows, const ValueType& rho_new, const ValueType& rho_old, + const ValueType& alpha, const ValueType& omega, + const ValueType* const r_shared_entry, + const ValueType* const v_shared_entry, ValueType* const p_shared_entry) +{ + const ValueType beta = (rho_new / rho_old) * (alpha / omega); + for (int r = threadIdx.x; r < num_rows; r += blockDim.x) { + p_shared_entry[r] = + r_shared_entry[r] + + beta * (p_shared_entry[r] - omega * v_shared_entry[r]); + } +} + +template +__device__ __forceinline__ void compute_alpha( + Group subgroup, const int num_rows, const ValueType& rho_new, + const ValueType* const r_hat_shared_entry, + const ValueType* const v_shared_entry, ValueType& alpha) +{ + if (threadIdx.x / config::warp_size == 0) { + single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry, + v_shared_entry, alpha); + } + __syncthreads(); + if (threadIdx.x == 0) { + alpha = rho_new / alpha; + } +} + + +template +__device__ __forceinline__ void update_s(const int num_rows, + const ValueType* const r_shared_entry, + const ValueType& alpha, + const ValueType* const v_shared_entry, + ValueType* const s_shared_entry) +{ + for (int r = threadIdx.x; r < num_rows; r += blockDim.x) { + s_shared_entry[r] = r_shared_entry[r] - alpha * v_shared_entry[r]; + } +} + + +template +__device__ __forceinline__ void compute_omega( + Group subgroup, const int num_rows, const ValueType* const t_shared_entry, + const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega) +{ + if (threadIdx.x / config::warp_size == 0) { + single_rhs_compute_conj_dot(subgroup, 
num_rows, t_shared_entry, + s_shared_entry, omega); + } else if (threadIdx.x / config::warp_size == 1) { + single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry, + t_shared_entry, temp); + } + + __syncthreads(); + if (threadIdx.x == 0) { + omega /= temp; + } +} + +template +__device__ __forceinline__ void update_x_and_r( + const int num_rows, const ValueType* const p_hat_shared_entry, + const ValueType* const s_hat_shared_entry, const ValueType& alpha, + const ValueType& omega, const ValueType* const s_shared_entry, + const ValueType* const t_shared_entry, ValueType* const x_shared_entry, + ValueType* const r_shared_entry) +{ + for (int r = threadIdx.x; r < num_rows; r += blockDim.x) { + x_shared_entry[r] = x_shared_entry[r] + alpha * p_hat_shared_entry[r] + + omega * s_hat_shared_entry[r]; + r_shared_entry[r] = s_shared_entry[r] - omega * t_shared_entry[r]; + } +} + + +template +__device__ __forceinline__ void update_x_middle( + const int num_rows, const ValueType& alpha, + const ValueType* const p_hat_shared_entry, ValueType* const x_shared_entry) +{ + for (int r = threadIdx.x; r < num_rows; r += blockDim.x) { + x_shared_entry[r] = x_shared_entry[r] + alpha * p_hat_shared_entry[r]; + } +} + + +template +__global__ void apply_kernel( + const gko::kernels::batch_bicgstab::storage_config sconf, + const int max_iter, const gko::remove_complex tol, + LogType logger, PrecType prec_shared, const BatchMatrixType mat, + const ValueType* const __restrict__ b, ValueType* const __restrict__ x, + ValueType* const __restrict__ workspace = nullptr) +{ + using real_type = typename gko::remove_complex; + const auto num_batch_items = mat.num_batch_items; + const auto num_rows = mat.num_rows; + + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + + for (int batch_id = blockIdx.x; batch_id < num_batch_items; + batch_id += gridDim.x) { + const int gmem_offset = + batch_id * 
sconf.gmem_stride_bytes / sizeof(ValueType); + extern __shared__ char local_mem_sh[]; + + ValueType* p_hat_sh; + ValueType* s_hat_sh; + ValueType* p_sh; + ValueType* s_sh; + ValueType* r_sh; + ValueType* r_hat_sh; + ValueType* v_sh; + ValueType* t_sh; + ValueType* x_sh; + ValueType* prec_work_sh; + + if (n_shared >= 1) { + p_hat_sh = reinterpret_cast(local_mem_sh); + } else { + p_hat_sh = workspace + gmem_offset; + } + if (n_shared == 1) { + s_hat_sh = workspace + gmem_offset; + } else { + s_hat_sh = p_hat_sh + sconf.padded_vec_len; + } + if (n_shared == 2) { + v_sh = workspace + gmem_offset; + } else { + v_sh = s_hat_sh + sconf.padded_vec_len; + } + if (n_shared == 3) { + t_sh = workspace + gmem_offset; + } else { + t_sh = v_sh + sconf.padded_vec_len; + } + if (n_shared == 4) { + p_sh = workspace + gmem_offset; + } else { + p_sh = t_sh + sconf.padded_vec_len; + } + if (n_shared == 5) { + s_sh = workspace + gmem_offset; + } else { + s_sh = p_sh + sconf.padded_vec_len; + } + if (n_shared == 6) { + r_sh = workspace + gmem_offset; + } else { + r_sh = s_sh + sconf.padded_vec_len; + } + if (n_shared == 7) { + r_hat_sh = workspace + gmem_offset; + } else { + r_hat_sh = r_sh + sconf.padded_vec_len; + } + if (n_shared == 8) { + x_sh = workspace + gmem_offset; + } else { + x_sh = r_hat_sh + sconf.padded_vec_len; + } + if (!prec_shared_bool && n_shared == 9) { + prec_work_sh = workspace + gmem_offset; + } else { + prec_work_sh = x_sh + sconf.padded_vec_len; + } + + __shared__ uninitialized_array rho_old_sh; + __shared__ uninitialized_array rho_new_sh; + __shared__ uninitialized_array omega_sh; + __shared__ uninitialized_array alpha_sh; + __shared__ uninitialized_array temp_sh; + __shared__ real_type norms_rhs_sh[1]; + __shared__ real_type norms_res_sh[1]; + + const auto mat_entry = + gko::batch::matrix::extract_batch_item(mat, batch_id); + const ValueType* const b_entry_ptr = + gko::batch::multi_vector::batch_item_ptr(b, 1, num_rows, batch_id); + ValueType* const 
x_gl_entry_ptr = + gko::batch::multi_vector::batch_item_ptr(x, 1, num_rows, batch_id); + + // generate preconditioner + prec_shared.generate(batch_id, mat_entry, prec_work_sh); + + // initialization + // rho_old = 1, omega = 1, alpha = 1 + // compute b norms + // copy x from global to shared memory + // r = b - A*x + // compute residual norms + // r_hat = r + // p = 0 + // p_hat = 0 + // v = 0 + initialize(subgroup, num_rows, mat_entry, b_entry_ptr, x_gl_entry_ptr, + rho_old_sh[0], omega_sh[0], alpha_sh[0], x_sh, r_sh, + r_hat_sh, p_sh, p_hat_sh, v_sh, norms_rhs_sh[0], + norms_res_sh[0]); + __syncthreads(); + + // stopping criterion object + StopType stop(tol, norms_rhs_sh); + + int iter = 0; + for (; iter < max_iter; iter++) { + if (stop.check_converged(norms_res_sh)) { + logger.log_iteration(batch_id, iter, norms_res_sh[0]); + break; + } + + // rho_new = < r_hat , r > = (r_hat)' * (r) + if (threadIdx.x / config::warp_size == 0) { + single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh, + rho_new_sh[0]); + } + __syncthreads(); + + // beta = (rho_new / rho_old)*(alpha / omega) + // p = r + beta*(p - omega * v) + update_p(num_rows, rho_new_sh[0], rho_old_sh[0], alpha_sh[0], + omega_sh[0], r_sh, v_sh, p_sh); + __syncthreads(); + + // p_hat = precond * p + prec_shared.apply(num_rows, p_sh, p_hat_sh); + __syncthreads(); + + // v = A * p_hat + simple_apply(mat_entry, p_hat_sh, v_sh); + __syncthreads(); + + // alpha = rho_new / < r_hat , v> + compute_alpha(subgroup, num_rows, rho_new_sh[0], r_hat_sh, v_sh, + alpha_sh[0]); + __syncthreads(); + + // s = r - alpha*v + update_s(num_rows, r_sh, alpha_sh[0], v_sh, s_sh); + __syncthreads(); + + // an estimate of residual norms + if (threadIdx.x / config::warp_size == 0) { + single_rhs_compute_norm2(subgroup, num_rows, s_sh, + norms_res_sh[0]); + } + __syncthreads(); + + // if (norms_res_sh[0] / norms_rhs_sh[0] < tol) { + if (stop.check_converged(norms_res_sh)) { + update_x_middle(num_rows, alpha_sh[0], p_hat_sh, x_sh); 
+ logger.log_iteration(batch_id, iter, norms_res_sh[0]); + break; + } + + // s_hat = precond * s + prec_shared.apply(num_rows, s_sh, s_hat_sh); + __syncthreads(); + + // t = A * s_hat + simple_apply(mat_entry, s_hat_sh, t_sh); + __syncthreads(); + + // omega = / + compute_omega(subgroup, num_rows, t_sh, s_sh, temp_sh[0], + omega_sh[0]); + __syncthreads(); + + // x = x + alpha*p_hat + omega *s_hat + // r = s - omega * t + update_x_and_r(num_rows, p_hat_sh, s_hat_sh, alpha_sh[0], + omega_sh[0], s_sh, t_sh, x_sh, r_sh); + __syncthreads(); + + if (threadIdx.x / config::warp_size == 0) { + single_rhs_compute_norm2(subgroup, num_rows, r_sh, + norms_res_sh[0]); + } + //__syncthreads(); + + if (threadIdx.x == blockDim.x - 1) { + rho_old_sh[0] = rho_new_sh[0]; + } + __syncthreads(); + } + + logger.log_iteration(batch_id, iter, norms_res_sh[0]); + + // copy x back to global memory + single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr); + __syncthreads(); + } +} diff --git a/benchmark/utils/spmv_validation.hpp b/common/cuda_hip/stop/batch_criteria.hpp.inc similarity index 59% rename from benchmark/utils/spmv_validation.hpp rename to common/cuda_hip/stop/batch_criteria.hpp.inc index 83ea2085ec2..d9ca9d10487 100644 --- a/benchmark/utils/spmv_validation.hpp +++ b/common/cuda_hip/stop/batch_criteria.hpp.inc @@ -30,54 +30,51 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ -#define GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ - - -#include - - -#include -#include - - -#include - - -std::string example_config = R"( - [ - {"filename": "my_file.mtx"}, - {"filename": "my_file2.mtx"}, - {"size": 100, "stencil": "7pt"}, - ] -)"; - /** - * Function which outputs the input format for benchmarks similar to the spmv. 
+ * @see reference/stop/batch_criteria.hpp */ -[[noreturn]] void print_config_error_and_exit() -{ - std::cerr << "Input has to be a JSON array of matrix configurations:\n" - << example_config << std::endl; - std::exit(1); -} +template +class SimpleRelResidual { +public: + using real_type = remove_complex; + + __device__ __forceinline__ SimpleRelResidual( + const real_type rel_res_tol, const real_type* const rhs_b_norms) + : rel_tol_{rel_res_tol}, rhs_norms_{rhs_b_norms} + {} + + __device__ __forceinline__ bool check_converged( + const real_type* const residual_norms) const + { + return residual_norms[0] <= (rel_tol_ * rhs_norms_[0]); + } + +private: + const real_type rel_tol_; + const real_type* const rhs_norms_; +}; /** - * Validates whether the input format is correct for spmv-like benchmarks. - * - * @param value the JSON value to test. + * @see reference/stop/batch_criteria.hpp */ -void validate_option_object(const rapidjson::Value& value) -{ - if (!value.IsObject() || - !((value.HasMember("size") && value.HasMember("stencil") && - value["size"].IsInt64() && value["stencil"].IsString()) || - (value.HasMember("filename") && value["filename"].IsString()))) { - print_config_error_and_exit(); +template +class SimpleAbsResidual { +public: + using real_type = remove_complex; + + __device__ __forceinline__ SimpleAbsResidual(const real_type tol, + const real_type*) + : abs_tol_{tol} + {} + + __device__ __forceinline__ bool check_converged( + const real_type* const residual_norms) const + { + return (residual_norms[0] <= abs_tol_); } -} - -#endif // GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_ +private: + const real_type abs_tol_; +}; diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt new file mode 100644 index 00000000000..7ac6b3df40c --- /dev/null +++ b/common/unified/CMakeLists.txt @@ -0,0 +1,33 @@ +set(UNIFIED_SOURCES + base/device_matrix_data_kernels.cpp + base/index_set_kernels.cpp + components/absolute_array_kernels.cpp + 
components/fill_array_kernels.cpp + components/format_conversion_kernels.cpp + components/precision_conversion_kernels.cpp + components/reduce_array_kernels.cpp + distributed/partition_helpers_kernels.cpp + distributed/partition_kernels.cpp + matrix/coo_kernels.cpp + matrix/csr_kernels.cpp + matrix/ell_kernels.cpp + matrix/hybrid_kernels.cpp + matrix/permutation_kernels.cpp + matrix/scaled_permutation_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + matrix/diagonal_kernels.cpp + multigrid/pgm_kernels.cpp + preconditioner/jacobi_kernels.cpp + solver/bicg_kernels.cpp + solver/bicgstab_kernels.cpp + solver/cg_kernels.cpp + solver/cgs_kernels.cpp + solver/common_gmres_kernels.cpp + solver/fcg_kernels.cpp + solver/gcr_kernels.cpp + solver/gmres_kernels.cpp + solver/ir_kernels.cpp + ) +list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) +set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE) \ No newline at end of file diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp new file mode 100644 index 00000000000..3c041dd7e4b --- /dev/null +++ b/common/unified/distributed/partition_helpers_kernels.cpp @@ -0,0 +1,102 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/partition_helpers_kernels.hpp" + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace partition_helpers { + + +template +void check_consecutive_ranges(std::shared_ptr exec, + const array& range_start_ends, + bool& result) +{ + array result_uint32{exec, 1}; + auto num_ranges = range_start_ends.get_num_elems() / 2; + // need additional guard because DPCPP doesn't return the initial value for + // empty inputs + if (num_ranges > 1) { + run_kernel_reduction( + exec, + [] GKO_KERNEL(const auto i, const auto* ranges) { + return ranges[2 * i] == ranges[2 * i + 1]; + }, + [] GKO_KERNEL(const auto a, const auto b) { + return static_cast(a && b); + }, + [] GKO_KERNEL(auto x) { return x; }, static_cast(true), + result_uint32.get_data(), num_ranges - 1, + range_start_ends.get_const_data() + 1); + 
result = + static_cast(exec->copy_val_to_host(result_uint32.get_data())); + } else { + result = true; + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES); + + +template +void compress_ranges(std::shared_ptr exec, + const array& range_start_ends, + array& range_offsets) +{ + run_kernel( + exec, + [] GKO_KERNEL(const auto i, const auto* start_ends, auto* offsets) { + if (i == 0) { + offsets[0] = start_ends[0]; + } + offsets[i + 1] = start_ends[2 * i + 1]; + }, + range_offsets.get_num_elems() - 1, range_start_ends.get_const_data(), + range_offsets.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES); + + +} // namespace partition_helpers +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp index cb0f4813da5..dc13fec9f1b 100644 --- a/common/unified/distributed/partition_kernels.cpp +++ b/common/unified/distributed/partition_kernels.cpp @@ -66,19 +66,22 @@ void count_ranges(std::shared_ptr exec, template void build_from_contiguous(std::shared_ptr exec, const array& ranges, + const array& part_id_mapping, GlobalIndexType* range_bounds, comm_index_type* part_ids) { run_kernel( exec, - [] GKO_KERNEL(auto i, auto ranges, auto bounds, auto ids) { + [] GKO_KERNEL(auto i, auto ranges, auto mapping, auto bounds, auto ids, + bool uses_mapping) { if (i == 0) { bounds[0] = 0; } bounds[i + 1] = ranges[i + 1]; - ids[i] = i; + ids[i] = uses_mapping ? 
mapping[i] : i; }, - ranges.get_num_elems() - 1, ranges, range_bounds, part_ids); + ranges.get_num_elems() - 1, ranges, part_id_mapping, range_bounds, + part_ids, part_id_mapping.get_num_elems() > 0); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_CONTIGUOUS); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index f4e034998bd..d1abb043c44 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -54,53 +54,71 @@ namespace GKO_DEVICE_NAMESPACE { namespace csr { -template -void invert_permutation(std::shared_ptr exec, - size_type size, const IndexType* permutation_indices, - IndexType* inv_permutation) +template +void inv_col_permute(std::shared_ptr exec, + const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* col_permuted) { + auto num_rows = orig->get_size()[0]; + auto nnz = orig->get_num_stored_elements(); + auto size = std::max(num_rows + 1, nnz); run_kernel( exec, - [] GKO_KERNEL(auto tid, auto permutation, auto inv_permutation) { - inv_permutation[permutation[tid]] = tid; + [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, + auto permutation, auto in_row_ptrs, auto in_col_idxs, + auto in_vals, auto out_row_ptrs, auto out_col_idxs, + auto out_vals) { + if (tid < num_nonzeros) { + out_col_idxs[tid] = permutation[in_col_idxs[tid]]; + out_vals[tid] = in_vals[tid]; + } + if (tid <= num_rows) { + out_row_ptrs[tid] = in_row_ptrs[tid]; + } }, - size, permutation_indices, inv_permutation); + size, num_rows, nnz, perm, orig->get_const_row_ptrs(), + orig->get_const_col_idxs(), orig->get_const_values(), + col_permuted->get_row_ptrs(), col_permuted->get_col_idxs(), + col_permuted->get_values()); } -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL); template -void inverse_column_permute(std::shared_ptr exec, - const IndexType* perm, - const 
matrix::Csr* orig, - matrix::Csr* column_permuted) +void inv_col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Csr* orig, + matrix::Csr* col_permuted) { auto num_rows = orig->get_size()[0]; auto nnz = orig->get_num_stored_elements(); - auto size = std::max(num_rows, nnz); + auto size = std::max(num_rows + 1, nnz); run_kernel( exec, - [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, + [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, auto scale, auto permutation, auto in_row_ptrs, auto in_col_idxs, auto in_vals, auto out_row_ptrs, auto out_col_idxs, auto out_vals) { if (tid < num_nonzeros) { - out_col_idxs[tid] = permutation[in_col_idxs[tid]]; - out_vals[tid] = in_vals[tid]; + const auto out_col = permutation[in_col_idxs[tid]]; + out_col_idxs[tid] = out_col; + out_vals[tid] = in_vals[tid] / scale[out_col]; } if (tid <= num_rows) { out_row_ptrs[tid] = in_row_ptrs[tid]; } }, - size, num_rows, nnz, perm, orig->get_const_row_ptrs(), + size, num_rows, nnz, scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), orig->get_const_values(), - column_permuted->get_row_ptrs(), column_permuted->get_col_idxs(), - column_permuted->get_values()); + col_permuted->get_row_ptrs(), col_permuted->get_col_idxs(), + col_permuted->get_values()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL); + GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL); template @@ -154,8 +172,8 @@ void convert_to_sellp(std::shared_ptr exec, for (auto i = row_begin; i < row_begin + slice_length; i++) { cols[out_idx] = i < row_end ? in_cols[i] : invalid_index(); - values[out_idx] = i < row_end ? unpack_member(in_values[i]) - : zero(values[out_idx]); + values[out_idx] = + i < row_end ? 
in_values[i] : zero(values[out_idx]); out_idx += slice_size; } }, @@ -185,8 +203,8 @@ void convert_to_ell(std::shared_ptr exec, for (auto i = row_begin; i < row_begin + ell_length; i++) { cols[out_idx] = i < row_end ? in_cols[i] : invalid_index(); - values[out_idx] = i < row_end ? unpack_member(in_values[i]) - : zero(values[out_idx]); + values[out_idx] = + i < row_end ? in_values[i] : zero(values[out_idx]); out_idx += ell_stride; } }, diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp new file mode 100644 index 00000000000..73e06385f54 --- /dev/null +++ b/common/unified/matrix/dense_kernels.instantiate.cpp @@ -0,0 +1,130 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "common/unified/matrix/dense_kernels.template.cpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace dense { + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY( + GKO_DECLARE_DENSE_COPY_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_INV_SCALE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +// split 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL); +// end + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.template.cpp similarity index 68% rename from common/unified/matrix/dense_kernels.cpp rename to common/unified/matrix/dense_kernels.template.cpp index 18d2fbabe6c..ed508066ba8 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -67,9 +67,6 @@ void copy(std::shared_ptr exec, input->get_size(), input, output); } -GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY( - GKO_DECLARE_DENSE_COPY_KERNEL); - template void fill(std::shared_ptr exec, @@ -83,8 +80,6 @@ void fill(std::shared_ptr exec, mat->get_size(), mat, value); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL); - template void fill_in_matrix_data(std::shared_ptr exec, @@ -100,9 +95,6 @@ void fill_in_matrix_data(std::shared_ptr exec, data.get_const_col_idxs(), data.get_const_values(), output); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); - template void scale(std::shared_ptr exec, @@ -125,8 +117,6 @@ void scale(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); - template void inv_scale(std::shared_ptr exec, @@ -150,9 +140,6 @@ void inv_scale(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_INV_SCALE_KERNEL); - template void add_scaled(std::shared_ptr exec, @@ -176,9 +163,6 @@ void add_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); - template void sub_scaled(std::shared_ptr exec, @@ -202,9 +186,6 @@ void sub_scaled(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); - template void add_scaled_diag(std::shared_ptr exec, @@ -221,8 +202,6 @@ void add_scaled_diag(std::shared_ptr exec, x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); - template void sub_scaled_diag(std::shared_ptr exec, @@ -239,8 +218,6 @@ void sub_scaled_diag(std::shared_ptr exec, x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); - template void compute_dot(std::shared_ptr exec, @@ -257,8 +234,6 @@ void compute_dot(std::shared_ptr exec, tmp, x, y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - template void compute_conj_dot(std::shared_ptr exec, @@ -275,8 +250,6 @@ void compute_conj_dot(std::shared_ptr exec, tmp, x, y); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - template void compute_norm2(std::shared_ptr exec, @@ -292,8 +265,6 @@ void compute_norm2(std::shared_ptr exec, result->get_values(), x->get_size(), tmp, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - template void compute_norm1(std::shared_ptr exec, const matrix::Dense* x, @@ -306,7 +277,21 @@ void 
compute_norm1(std::shared_ptr exec, x->get_size(), tmp, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); + +template +void compute_mean(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense* result, array& tmp) +{ + using ValueType_nc = gko::remove_complex; + run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto inv_total_size) { + return x(i, j) * inv_total_size; + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + tmp, x, ValueType_nc{1.} / x->get_size()[0]); +} template @@ -325,9 +310,6 @@ void compute_max_nnz_per_row(std::shared_ptr exec, source->get_size()[0]); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); - template void compute_slice_sets(std::shared_ptr exec, @@ -357,9 +339,6 @@ void compute_slice_sets(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); - template void count_nonzeros_per_row(std::shared_ptr exec, @@ -374,11 +353,6 @@ void count_nonzeros_per_row(std::shared_ptr exec, GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); - template void compute_squared_norm2(std::shared_ptr exec, @@ -393,9 +367,6 @@ void compute_squared_norm2(std::shared_ptr exec, x->get_size(), tmp, x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); - template void compute_sqrt(std::shared_ptr exec, @@ -409,12 +380,10 @@ void compute_sqrt(std::shared_ptr exec, x->get_size(), x); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); - template void symm_permute(std::shared_ptr exec, - const array* permutation_indices, + const 
IndexType* permutation_indices, const matrix::Dense* orig, matrix::Dense* permuted) { @@ -423,16 +392,13 @@ void symm_permute(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(row, col) = orig(perm[row], perm[col]); }, - orig->get_size(), orig, *permutation_indices, permuted); + orig->get_size(), orig, permutation_indices, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); - template void inv_symm_permute(std::shared_ptr exec, - const array* permutation_indices, + const IndexType* permutation_indices, const matrix::Dense* orig, matrix::Dense* permuted) { @@ -441,17 +407,49 @@ void inv_symm_permute(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(perm[row], perm[col]) = orig(row, col); }, - orig->get_size(), orig, *permutation_indices, permuted); + orig->get_size(), orig, permutation_indices, permuted); +} + + +template +void nonsymm_permute(std::shared_ptr exec, + const IndexType* row_permutation_indices, + const IndexType* column_permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto row_perm, + auto col_perm, auto permuted) { + permuted(row, col) = orig(row_perm[row], col_perm[col]); + }, + orig->get_size(), orig, row_permutation_indices, + column_permutation_indices, permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); + +template +void inv_nonsymm_permute(std::shared_ptr exec, + const IndexType* row_permutation_indices, + const IndexType* column_permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto row_perm, + auto col_perm, auto permuted) { + permuted(row_perm[row], col_perm[col]) = orig(row, col); + }, + orig->get_size(), orig, row_permutation_indices, 
+ column_permutation_indices, permuted); +} template void row_gather(std::shared_ptr exec, - const array* row_idxs, - const matrix::Dense* orig, + const IndexType* row_idxs, const matrix::Dense* orig, matrix::Dense* row_collection) { run_kernel( @@ -459,18 +457,14 @@ void row_gather(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { gathered(row, col) = orig(rows[row], col); }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, - row_collection); + row_collection->get_size(), orig, row_idxs, row_collection); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); - template void advanced_row_gather(std::shared_ptr exec, const matrix::Dense* alpha, - const array* row_idxs, + const IndexType* row_idxs, const matrix::Dense* orig, const matrix::Dense* beta, matrix::Dense* row_collection) @@ -485,67 +479,202 @@ void advanced_row_gather(std::shared_ptr exec, static_cast(beta[0]) * static_cast(gathered(row, col)); }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, - alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), - row_collection); + row_collection->get_size(), alpha->get_const_values(), orig, row_idxs, + beta->get_const_values(), row_collection); } -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); - template -void column_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) +void col_permute(std::shared_ptr exec, + const IndexType* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* col_permuted) { run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(row, col) = orig(row, perm[col]); }, - orig->get_size(), orig, *permutation_indices, column_permuted); + orig->get_size(), orig, permutation_indices, col_permuted); } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); - template -void inverse_row_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* row_permuted) +void inv_row_permute(std::shared_ptr exec, + const IndexType* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* row_permuted) { run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(perm[row], col) = orig(row, col); }, - orig->get_size(), orig, *permutation_indices, row_permuted); + orig->get_size(), orig, permutation_indices, row_permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); - template -void inverse_column_permute(std::shared_ptr exec, - const array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) +void inv_col_permute(std::shared_ptr exec, + const IndexType* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* col_permuted) { run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { permuted(row, perm[col]) = orig(row, col); }, - orig->get_size(), orig, *permutation_indices, column_permuted); + orig->get_size(), orig, permutation_indices, col_permuted); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); + +template +void symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + const auto row = perm[i]; + const auto col = perm[j]; + permuted(i, j) = scale[row] * scale[col] * orig(row, col); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void inv_symm_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const 
matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + const auto row = perm[i]; + const auto col = perm[j]; + permuted(row, col) = orig(i, j) / (scale[row] * scale[col]); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm, + auto col_scale, auto col_perm, auto orig, auto permuted) { + const auto row = row_perm[i]; + const auto col = col_perm[j]; + permuted(i, j) = row_scale[row] * col_scale[col] * orig(row, col); + }, + orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, + permuted); +} + + +template +void inv_nonsymm_scale_permute(std::shared_ptr exec, + const ValueType* row_scale, + const IndexType* row_perm, + const ValueType* col_scale, + const IndexType* col_perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm, + auto col_scale, auto col_perm, auto orig, auto permuted) { + const auto row = row_perm[i]; + const auto col = col_perm[j]; + permuted(row, col) = orig(i, j) / (row_scale[row] * col_scale[col]); + }, + orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig, + permuted); +} + + +template +void row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + const auto row = perm[i]; + permuted(i, j) = scale[row] * orig(row, j); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void 
inv_row_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + const auto row = perm[i]; + permuted(row, j) = orig(i, j) / scale[row]; + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + const auto col = perm[j]; + permuted(i, j) = scale[col] * orig(i, col); + }, + orig->get_size(), scale, perm, orig, permuted); +} + + +template +void inv_col_scale_permute(std::shared_ptr exec, + const ValueType* scale, const IndexType* perm, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig, + auto permuted) { + const auto col = perm[j]; + permuted(i, col) = orig(i, j) / scale[col]; + }, + orig->get_size(), scale, perm, orig, permuted); +} template @@ -559,8 +688,6 @@ void extract_diagonal(std::shared_ptr exec, diag->get_size()[0], orig, diag->get_values()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); - template void inplace_absolute_dense(std::shared_ptr exec, @@ -574,8 +701,6 @@ void inplace_absolute_dense(std::shared_ptr exec, source->get_size(), source); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); - template void outplace_absolute_dense(std::shared_ptr exec, @@ -590,8 +715,6 @@ void outplace_absolute_dense(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); - template void make_complex(std::shared_ptr exec, @@ -606,8 +729,6 @@ void 
make_complex(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); - template void get_real(std::shared_ptr exec, @@ -622,8 +743,6 @@ void get_real(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); - template void get_imag(std::shared_ptr exec, @@ -638,8 +757,6 @@ void get_imag(std::shared_ptr exec, source->get_size(), source, result); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); - template void add_scaled_identity(std::shared_ptr exec, @@ -659,9 +776,6 @@ void add_scaled_identity(std::shared_ptr exec, mtx); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); - } // namespace dense } // namespace GKO_DEVICE_NAMESPACE diff --git a/common/unified/matrix/permutation_kernels.cpp b/common/unified/matrix/permutation_kernels.cpp new file mode 100644 index 00000000000..e437737c524 --- /dev/null +++ b/common/unified/matrix/permutation_kernels.cpp @@ -0,0 +1,85 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/permutation_kernels.hpp" + + +#include + + +#include "common/unified/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace permutation { + + +template +void invert(std::shared_ptr exec, + const IndexType* permutation_indices, size_type size, + IndexType* inv_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto permutation, auto inv_permutation) { + inv_permutation[permutation[i]] = i; + }, + size, permutation_indices, inv_permutation); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL); + + +template +void compose(std::shared_ptr exec, + const IndexType* first_permutation, + const IndexType* second_permutation, size_type size, + IndexType* output_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto first_permutation, auto second_permutation, + auto output_permutation) { + output_permutation[i] = first_permutation[second_permutation[i]]; + }, + size, first_permutation, second_permutation, output_permutation); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL); + + +} // namespace 
permutation +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp new file mode 100644 index 00000000000..ff3bb55becb --- /dev/null +++ b/common/unified/matrix/scaled_permutation_kernels.cpp @@ -0,0 +1,98 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/scaled_permutation_kernels.hpp" + + +#include + + +#include "common/unified/base/kernel_launch.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace scaled_permutation { + + +template +void invert(std::shared_ptr exec, + const ValueType* input_scale, const IndexType* input_permutation, + size_type size, ValueType* output_scale, + IndexType* output_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto input_scale, auto input_permutation, + auto output_scale, auto output_permutation) { + const auto ip = input_permutation[i]; + output_permutation[ip] = i; + output_scale[i] = one(input_scale[ip]) / input_scale[ip]; + }, + size, input_scale, input_permutation, output_scale, output_permutation); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL); + + +template +void compose(std::shared_ptr exec, + const ValueType* first_scale, const IndexType* first_permutation, + const ValueType* second_scale, const IndexType* second_permutation, + size_type size, ValueType* output_scale, + IndexType* output_permutation) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto first_scale, auto first_permutation, + auto second_scale, auto second_permutation, + auto output_permutation, auto output_scale) { + const auto second_permuted = second_permutation[i]; + const auto combined_permuted = first_permutation[second_permuted]; + output_permutation[i] = combined_permuted; + output_scale[combined_permuted] = + first_scale[combined_permuted] * second_scale[second_permuted]; + }, + size, first_scale, first_permutation, second_scale, second_permutation, + output_permutation, output_scale); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL); + + +} // namespace scaled_permutation +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko 
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 5836486f2a6..a61b32dacbd 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -135,7 +135,7 @@ void map_row(std::shared_ptr exec, exec, [] GKO_KERNEL(auto tidx, auto fine_row_ptrs, auto agg, auto row_idxs) { const auto coarse_row = agg[tidx]; - // TODO: when it is neccessary, it can use warp per row to improve. + // TODO: when it is necessary, it can use warp per row to improve. for (auto i = fine_row_ptrs[tidx]; i < fine_row_ptrs[tidx + 1]; i++) { row_idxs[i] = coarse_row; @@ -232,7 +232,7 @@ void find_strongest_neighbor( // all neighbor is agg, connect to the strongest agg // Also, no others will use this item as their // strongest_neighbor because they are already aggregated. Thus, - // it is determinstic behavior + // it is deterministic behavior agg[row] = agg[strongest_agg]; } else if (strongest_unagg != -1) { // set the strongest neighbor in the unagg group @@ -260,7 +260,7 @@ void assign_to_exist_agg(std::shared_ptr exec, { const auto num = agg.get_num_elems(); if (intermediate_agg.get_num_elems() > 0) { - // determinstic kernel + // deterministic kernel run_kernel( exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, diff --git a/contributors.txt b/contributors.txt index 1f1259bc082..aec120d93dd 100644 --- a/contributors.txt +++ b/contributors.txt @@ -20,6 +20,7 @@ Kashi Aditya Karlsruhe Institute of Technology Koch Marcel Karlsruhe Institute of Technology Maier Matthias Texas A&M University Nayak Pratik Karlsruhe Institute of Technology +Nguyen Phuong University of Tennessee, Knoxville Olenik Gregor HPSim Ribizel Tobias Karlsruhe Institute of Technology Riemer Lukas Karlsruhe Institute of Technology diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 2f9643115c9..ce4a52037b9 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -4,12 +4,14 @@ add_library(ginkgo "") 
target_sources(ginkgo PRIVATE base/array.cpp + base/batch_multi_vector.cpp base/combination.cpp base/composition.cpp base/dense_cache.cpp base/device_matrix_data.cpp base/executor.cpp base/index_set.cpp + base/memory.cpp base/mpi.cpp base/mtx_io.cpp base/perturbation.cpp @@ -27,6 +29,7 @@ target_sources(ginkgo factorization/par_ilu.cpp factorization/par_ilut.cpp factorization/symbolic.cpp + log/batch_logger.cpp log/convergence.cpp log/logger.cpp log/performance_hint.cpp @@ -37,6 +40,9 @@ target_sources(ginkgo log/vtune.cpp log/record.cpp log/stream.cpp + matrix/batch_dense.cpp + matrix/batch_ell.cpp + matrix/batch_identity.cpp matrix/coo.cpp matrix/csr.cpp matrix/dense.cpp @@ -47,16 +53,19 @@ target_sources(ginkgo matrix/hybrid.cpp matrix/identity.cpp matrix/permutation.cpp + matrix/row_gatherer.cpp + matrix/scaled_permutation.cpp matrix/sellp.cpp matrix/sparsity_csr.cpp - matrix/row_gatherer.cpp multigrid/pgm.cpp multigrid/fixed_coarsening.cpp preconditioner/isai.cpp preconditioner/jacobi.cpp reorder/amd.cpp + reorder/mc64.cpp reorder/rcm.cpp reorder/scaled_reordered.cpp + solver/batch_bicgstab.cpp solver/bicg.cpp solver/bicgstab.cpp solver/cb_gmres.cpp @@ -93,6 +102,7 @@ if(GINKGO_BUILD_MPI) PRIVATE mpi/exception.cpp distributed/matrix.cpp + distributed/partition_helpers.cpp distributed/vector.cpp distributed/preconditioner/schwarz.cpp) endif() @@ -111,10 +121,7 @@ target_link_libraries(ginkgo set(GKO_RPATH_ADDITIONS "") if(GINKGO_HAVE_PAPI_SDE) - target_link_libraries(ginkgo PUBLIC PAPI::PAPI) - list(GET PAPI_LIBRARIES 0 PAPI_FIRST_LIB) - get_filename_component(GKO_PAPI_LIBDIR "${PAPI_FIRST_LIB}" DIRECTORY) - list(APPEND GKO_RPATH_ADDITIONS "${GKO_PAPI_LIBDIR}") + target_link_libraries(ginkgo PUBLIC PAPI::PAPI_SDE) endif() if(GINKGO_HAVE_TAU) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp new file mode 100644 index 00000000000..6dcf8dd90b5 --- /dev/null +++ b/core/base/batch_multi_vector.cpp @@ -0,0 +1,299 @@ 
+/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/base/batch_multi_vector_kernels.hpp" + + +namespace gko { +namespace batch { +namespace multi_vector { +namespace { + + +GKO_REGISTER_OPERATION(scale, batch_multi_vector::scale); +GKO_REGISTER_OPERATION(add_scaled, batch_multi_vector::add_scaled); +GKO_REGISTER_OPERATION(compute_dot, batch_multi_vector::compute_dot); +GKO_REGISTER_OPERATION(compute_conj_dot, batch_multi_vector::compute_conj_dot); +GKO_REGISTER_OPERATION(compute_norm2, batch_multi_vector::compute_norm2); +GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy); + + +} // namespace +} // namespace multi_vector + + +namespace detail { + + +template +batch_dim<2> compute_batch_size( + const std::vector*>& matrices) +{ + auto common_size = matrices[0]->get_size(); + for (size_type i = 1; i < matrices.size(); ++i) { + GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size()); + } + return batch_dim<2>{matrices.size(), common_size}; +} + + +} // namespace detail + + +template +std::unique_ptr> +MultiVector::create_view_for_item(size_type item_id) +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create( + exec, this->get_common_size(), + make_array_view(exec, num_rows * stride, + this->get_values_for_item(item_id)), + stride); + return mat; +} + + +template +std::unique_ptr> +MultiVector::create_const_view_for_item(size_type item_id) const +{ + auto exec = this->get_executor(); + auto num_rows = this->get_common_size()[0]; + auto stride = this->get_common_size()[1]; + auto mat = unbatch_type::create_const( + exec, this->get_common_size(), + make_const_array_view(exec, num_rows * stride, + this->get_const_values_for_item(item_id)), + stride); + return mat; +} + + +template 
+MultiVector::MultiVector(std::shared_ptr exec, + const batch_dim<2>& size) + : EnablePolymorphicObject>(exec), + batch_size_(size), + values_(exec, compute_num_elems(size)) +{} + + +template +std::unique_ptr> +MultiVector::create_with_config_of( + ptr_param other) +{ + // De-referencing `other` before calling the functions (instead of + // using operator `->`) is currently required to be compatible with + // CUDA 10.1. + // Otherwise, it results in a compile error. + return (*other).create_with_same_config(); +} + + +template +std::unique_ptr> +MultiVector::create_const( + std::shared_ptr exec, const batch_dim<2>& sizes, + gko::detail::const_array_view&& values) +{ + // cast const-ness away, but return a const object afterwards, + // so we can ensure that no modifications take place. + return std::unique_ptr(new MultiVector{ + exec, sizes, gko::detail::array_const_cast(std::move(values))}); +} + + +template +void MultiVector::fill(ValueType value) +{ + GKO_ASSERT(this->values_.get_num_elems() > 0); + this->values_.fill(value); +} + + +template +void MultiVector::set_size(const batch_dim<2>& value) noexcept +{ + batch_size_ = value; +} + + +template +std::unique_ptr> +MultiVector::create_with_same_config() const +{ + return MultiVector::create(this->get_executor(), + this->get_size()); +} + + +template +void MultiVector::scale( + ptr_param> alpha) +{ + GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); + if (alpha->get_common_size()[1] != 1) { + // different alpha for each column + GKO_ASSERT_EQUAL_COLS(this->get_common_size(), + alpha->get_common_size()); + } + auto exec = this->get_executor(); + exec->run(multi_vector::make_scale(make_temporary_clone(exec, alpha).get(), + this)); +} + + +template +void MultiVector::add_scaled( + ptr_param> alpha, + ptr_param> b) +{ + GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items()); + 
GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1)); + if (alpha->get_common_size()[1] != 1) { + // different alpha for each column + GKO_ASSERT_EQUAL_COLS(this->get_common_size(), + alpha->get_common_size()); + } + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); + + auto exec = this->get_executor(); + exec->run(multi_vector::make_add_scaled( + make_temporary_clone(exec, alpha).get(), + make_temporary_clone(exec, b).get(), this)); +} + + +inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes) +{ + return batch_dim<2>(sizes.get_num_batch_items(), + dim<2>(1, sizes.get_common_size()[1])); +} + + +template +void MultiVector::compute_conj_dot( + ptr_param> b, + ptr_param> result) const +{ + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items()); + GKO_ASSERT_EQUAL_DIMENSIONS( + result->get_common_size(), + get_col_sizes(this->get_size()).get_common_size()); + auto exec = this->get_executor(); + exec->run(multi_vector::make_compute_conj_dot( + this, make_temporary_clone(exec, b).get(), + make_temporary_output_clone(exec, result).get())); +} + + +template +void MultiVector::compute_dot( + ptr_param> b, + ptr_param> result) const +{ + GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items()); + GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size()); + GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items()); + GKO_ASSERT_EQUAL_DIMENSIONS( + result->get_common_size(), + get_col_sizes(this->get_size()).get_common_size()); + auto exec = this->get_executor(); + exec->run(multi_vector::make_compute_dot( + this, make_temporary_clone(exec, b).get(), + make_temporary_output_clone(exec, result).get())); +} + + +template +void 
MultiVector::compute_norm2( + ptr_param>> result) const +{ + GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items()); + GKO_ASSERT_EQUAL_DIMENSIONS( + result->get_common_size(), + get_col_sizes(this->get_size()).get_common_size()); + + auto exec = this->get_executor(); + exec->run(multi_vector::make_compute_norm2( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void MultiVector::convert_to( + MultiVector>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void MultiVector::move_to( + MultiVector>* result) +{ + this->convert_to(result); +} + + +#define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); + + +} // namespace batch +} // namespace gko diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp new file mode 100644 index 00000000000..5a39567f470 --- /dev/null +++ b/core/base/batch_multi_vector_kernels.hpp @@ -0,0 +1,111 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ +#define GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ + + +#include + + +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(_type) \ + void scale(std::shared_ptr exec, \ + const batch::MultiVector<_type>* alpha, \ + batch::MultiVector<_type>* x) + +#define GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(_type) \ + void add_scaled(std::shared_ptr exec, \ + const batch::MultiVector<_type>* alpha, \ + const batch::MultiVector<_type>* x, \ + batch::MultiVector<_type>* y) + +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(_type) \ + void compute_dot(std::shared_ptr exec, \ + const batch::MultiVector<_type>* x, \ + const batch::MultiVector<_type>* y, \ + batch::MultiVector<_type>* result) + +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(_type) \ + void compute_conj_dot(std::shared_ptr exec, \ + const batch::MultiVector<_type>* x, \ + const batch::MultiVector<_type>* y, \ + batch::MultiVector<_type>* result) + +#define 
GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(_type) \ + void compute_norm2(std::shared_ptr exec, \ + const batch::MultiVector<_type>* x, \ + batch::MultiVector>* result) + +#define GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(_type) \ + void copy(std::shared_ptr exec, \ + const batch::MultiVector<_type>* x, \ + batch::MultiVector<_type>* result) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(ValueType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_multi_vector, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_ diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp new file mode 100644 index 00000000000..041630af66e --- /dev/null +++ b/core/base/batch_struct.hpp @@ -0,0 +1,142 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_BATCH_STRUCT_HPP_ +#define GKO_CORE_BASE_BATCH_STRUCT_HPP_ + + +#include +#include +#include + + +namespace gko { +namespace batch { +namespace multi_vector { + + +/** + * Encapsulates one matrix from a batch of multi-vectors. + */ +template +struct batch_item { + using value_type = ValueType; + ValueType* values; + int32 stride; + int32 num_rows; + int32 num_rhs; +}; + + +/** + * A 'simple' structure to store a global uniform batch of multi-vectors. 
+ */ +template +struct uniform_batch { + using value_type = ValueType; + using entry_type = batch_item; + + ValueType* values; + size_type num_batch_items; + int32 stride; + int32 num_rows; + int32 num_rhs; + + inline size_type get_single_item_num_nnz() const + { + return static_cast(stride * num_rows); + } +}; + + +template +GKO_ATTRIBUTES GKO_INLINE ValueType* batch_item_ptr( + ValueType* const batch_start, const size_type stride, const int num_rows, + const size_type batch_idx) +{ + return batch_start + batch_idx * stride * num_rows; +} + + +} // namespace multi_vector + + +template +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item to_const( + const multi_vector::batch_item& b) +{ + return {b.values, b.stride, b.num_rows, b.num_rhs}; +} + + +template +GKO_ATTRIBUTES GKO_INLINE multi_vector::uniform_batch to_const( + const multi_vector::uniform_batch& ub) +{ + return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs}; +} + + +/** + * Extract one object (matrix, vector etc.) from a batch of objects + * + * This overload is for batch multi-vectors. + * These overloads are intended to be called from within a kernel. 
+ * + * @param batch The batch of objects to extract from + * @param batch_idx The position of the desired object in the batch + */ +template +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item +extract_batch_item(const multi_vector::uniform_batch& batch, + const size_type batch_idx) +{ + return {batch.values + batch_idx * batch.stride * batch.num_rows, + batch.stride, batch.num_rows, batch.num_rhs}; +} + +template +GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item +extract_batch_item(ValueType* const batch_values, const int32 stride, + const int32 num_rows, const int32 num_rhs, + const size_type batch_idx) +{ + return {batch_values + batch_idx * stride * num_rows, stride, num_rows, + num_rhs}; +} + + +} // namespace batch +} // namespace gko + + +#endif // GKO_CORE_BASE_BATCH_STRUCT_HPP_ diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp new file mode 100644 index 00000000000..cc92d294173 --- /dev/null +++ b/core/base/batch_utilities.hpp @@ -0,0 +1,451 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_BATCH_UTILITIES_HPP_ +#define GKO_CORE_BASE_BATCH_UTILITIES_HPP_ + + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace batch { + + +/** + * Duplicate a given input batch object. + */ +template +std::unique_ptr duplicate(std::shared_ptr exec, + size_type num_duplications, + const OutputType* input, + TArgs&&... create_args) +{ + auto num_batch_items = input->get_num_batch_items(); + auto tmp = + OutputType::create(exec, + batch_dim<2>(num_batch_items * num_duplications, + input->get_common_size()), + std::forward(create_args)...); + + for (size_type i = 0; i < num_duplications; ++i) { + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(i * num_batch_items + b) + ->copy_from(input->create_const_view_for_item(b).get()); + } + } + + return std::move(tmp); +} + + +/** + * Duplicate a monolithic matrix and create a batch object. + */ +template +std::unique_ptr create_from_item( + std::shared_ptr exec, const size_type num_duplications, + const typename OutputType::unbatch_type* input, TArgs&&... 
create_args) +{ + auto num_batch_items = num_duplications; + auto tmp = OutputType::create( + exec, batch_dim<2>(num_batch_items, input->get_size()), + std::forward(create_args)...); + + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(b)->copy_from(input); + } + + return std::move(tmp); +} + + +/** + * Create a batch object from a vector of monolithic object that share the same + * sparsity pattern. + * + * @note The sparsity of the elements in the input vector of matrices needs to + * be the same. TODO: Check for same sparsity among the different input items + */ +template +std::unique_ptr create_from_item( + std::shared_ptr exec, + const std::vector& input, + TArgs&&... create_args) +{ + auto num_batch_items = input.size(); + auto tmp = OutputType::create( + exec, batch_dim<2>(num_batch_items, input[0]->get_size()), + std::forward(create_args)...); + + for (size_type b = 0; b < num_batch_items; ++b) { + tmp->create_view_for_item(b)->copy_from(input[b]); + } + + return std::move(tmp); +} + + +/** + * Unbatch a batched object into a vector of items of its unbatch_type. 
+ */ +template +auto unbatch(const InputType* batch_object) +{ + auto unbatched_mats = + std::vector>{}; + for (size_type b = 0; b < batch_object->get_num_batch_items(); ++b) { + unbatched_mats.emplace_back( + batch_object->create_const_view_for_item(b)->clone()); + } + return unbatched_mats; +} + + +namespace detail { + + +template +void assert_same_sparsity_in_batched_data( + const std::vector>& data) +{ + if (data.empty()) { + return; + } + auto num_nnz = data.at(0).nonzeros.size(); + auto base_data = data.at(0); + base_data.ensure_row_major_order(); + for (int b = 1; b < data.size(); ++b) { + if (data[b].nonzeros.size() != num_nnz) { + GKO_NOT_IMPLEMENTED; + } + auto temp_data = data.at(b); + temp_data.ensure_row_major_order(); + for (int nnz = 0; nnz < num_nnz; ++nnz) { + if (temp_data.nonzeros.at(nnz).row != + base_data.nonzeros.at(nnz).row || + temp_data.nonzeros.at(nnz).column != + base_data.nonzeros.at(nnz).column) { + GKO_NOT_IMPLEMENTED; + } + } + } +} + + +} // namespace detail + + +/** + * Create a batch object from a vector of gko::matrix_data objects. Each item of + * the vector needs to store the same sparsity pattern. + */ +template +std::unique_ptr read( + std::shared_ptr exec, + const std::vector>& data, + TArgs&&... create_args) +{ + auto num_batch_items = data.size(); + // Throw if all the items in the batch dont have same sparsity. + if (!std::is_same>::value && + !std::is_same>::value) { + detail::assert_same_sparsity_in_batched_data(data); + } + auto tmp = + OutputType::create(exec, batch_dim<2>(num_batch_items, data.at(0).size), + std::forward(create_args)...); + + for (size_type b = 0; b < num_batch_items; ++b) { + if (data.at(b).size != data.at(0).size) { + GKO_INVALID_STATE("Incorrect data passed in"); + } + tmp->create_view_for_item(b)->read(data[b]); + } + + return std::move(tmp); +} + + +/** + * Write a vector of matrix data objects from an input batch object. 
+ */ +template +std::vector> write( + const OutputType* mvec) +{ + auto data = std::vector>( + mvec->get_num_batch_items()); + + for (size_type b = 0; b < mvec->get_num_batch_items(); ++b) { + data[b] = {mvec->get_common_size(), {}}; + mvec->create_const_view_for_item(b)->write(data[b]); + } + + return data; +} + + +/** + * Creates and initializes a batch of the specified Matrix type from a series of + * single column-vectors. + * + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param vals values used to initialize the batch vector + * @param exec Executor associated to the vector + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + size_type num_batch_items = vals.size(); + GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); + auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin ? 
vals_begin->size() : 0; + auto common_size = dim<2>(common_num_rows, 1); + for (auto& val : vals) { + GKO_ASSERT_EQ(common_num_rows, val.size()); + } + auto b_size = batch_dim<2>(num_batch_items, common_size); + size_type batch = 0; + std::vector input_mat_data(num_batch_items, common_size); + for (const auto& b : vals) { + input_mat_data[batch].nonzeros.reserve(b.size()); + size_type idx = 0; + for (const auto& elem : b) { + if (elem != zero()) { + input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem); + } + ++idx; + } + ++batch; + } + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +/** + * Creates and initializes a batch of matrices. + * + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param vals values used to initialize the matrix + * @param exec Executor associated with the matrix + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + std::initializer_list>> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + size_type num_batch_items = vals.size(); + GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty"); + auto vals_begin = begin(vals); + size_type common_num_rows = vals_begin ? vals_begin->size() : 0; + size_type common_num_cols = + vals_begin->begin() ? 
vals_begin->begin()->size() : 0; + auto common_size = dim<2>(common_num_rows, common_num_cols); + for (const auto& b : vals) { + auto num_rows = b.size(); + auto num_cols = begin(b)->size(); + auto b_size = dim<2>(num_rows, num_cols); + GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size); + } + + auto b_size = batch_dim<2>(num_batch_items, common_size); + size_type batch = 0; + std::vector input_mat_data(num_batch_items, common_size); + for (const auto& b : vals) { + size_type ridx = 0; + for (const auto& row : b) { + size_type cidx = 0; + for (const auto& elem : row) { + if (elem != zero()) { + input_mat_data[batch].nonzeros.emplace_back(ridx, cidx, + elem); + } + ++cidx; + } + ++ridx; + } + ++batch; + } + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +/** + * Creates and initializes a batch of specified Matrix type with a single + * column-vector by making copies of the single input column vector. + * + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param num_batch_items The number of times the input vector is to be + * duplicated + * @param vals values used to initialize each vector in the temp. batch + * @param exec Executor associated with the matrix + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + const size_type num_batch_items, + std::initializer_list vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, + "Input data is empty"); + auto num_rows = begin(vals) ? 
vals.size() : 0; + auto common_size = dim<2>(num_rows, 1); + auto b_size = batch_dim<2>(num_batch_items, common_size); + mat_data single_mat_data(common_size); + single_mat_data.nonzeros.reserve(num_rows); + size_type idx = 0; + for (const auto& elem : vals) { + if (elem != zero()) { + single_mat_data.nonzeros.emplace_back(idx, 0, elem); + } + ++idx; + } + std::vector input_mat_data(num_batch_items, single_mat_data); + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +/** + * Creates and initializes a matrix from copies of a given matrix. + * + * @tparam Matrix matrix type to initialize (It has to implement the + * read function) + * @tparam TArgs argument types for Matrix::create method + * (not including the implied Executor as the first argument) + * + * @param num_batch_items The number of times the input matrix is duplicated + * @param vals values used to initialize each matrix in the temp. batch + * @param exec Executor associated to the matrix + * @param create_args additional arguments passed to Matrix::create, not + * including the Executor, which is passed as the first + * argument + * + * @ingroup mat_formats + */ +template +std::unique_ptr initialize( + const size_type num_batch_items, + std::initializer_list> + vals, + std::shared_ptr exec, TArgs&&... create_args) +{ + using value_type = typename Matrix::value_type; + using index_type = typename Matrix::index_type; + using mat_data = gko::matrix_data; + GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0, + "Input data is empty"); + auto common_size = dim<2>(begin(vals) ? vals.size() : 0, + begin(vals) ? 
begin(vals)->size() : 0); + batch_dim<2> b_size(num_batch_items, common_size); + mat_data single_mat_data(common_size); + size_type ridx = 0; + for (const auto& row : vals) { + size_type cidx = 0; + for (const auto& elem : row) { + if (elem != zero()) { + single_mat_data.nonzeros.emplace_back(ridx, cidx, elem); + } + ++cidx; + } + ++ridx; + } + std::vector input_mat_data(num_batch_items, single_mat_data); + return read( + exec, input_mat_data, std::forward(create_args)...); +} + + +} // namespace batch +} // namespace gko + + +#endif // GKO_CORE_BASE_BATCH_UTILITIES_HPP_ diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp new file mode 100644 index 00000000000..7f5e4125e10 --- /dev/null +++ b/core/base/copy_assignable.hpp @@ -0,0 +1,130 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_ +#define GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_ + + +#include + + +namespace gko { +namespace detail { + + +template +class copy_assignable; + + +/** + * Helper class to make a type copy assignable. + * + * This class wraps an object of a type that has a copy constructor, but not + * a copy assignment. This is most often the case for lambdas. The wrapped + * object can then be copy assigned, by relying on the copy constructor. 
+ * + * @tparam T type with a copy constructor + */ +template +class copy_assignable< + T, typename std::enable_if::value>::type> { +public: + copy_assignable() = default; + + copy_assignable(const copy_assignable& other) + { + if (this != &other) { + *this = other; + } + } + + copy_assignable(copy_assignable&& other) noexcept + { + if (this != &other) { + *this = std::move(other); + } + } + + copy_assignable(const T& obj) : obj_{new (buf)(T)(obj)} {} + + copy_assignable(T&& obj) : obj_{new (buf)(T)(std::move(obj))} {} + + copy_assignable& operator=(const copy_assignable& other) + { + if (this != &other) { + if (obj_) { + obj_->~T(); + } + obj_ = new (buf)(T)(*other.obj_); + } + return *this; + } + + copy_assignable& operator=(copy_assignable&& other) noexcept + { + if (this != &other) { + if (obj_) { + obj_->~T(); + } + obj_ = new (buf)(T)(std::move(*other.obj_)); + } + return *this; + } + + ~copy_assignable() + { + if (obj_) { + obj_->~T(); + } + } + + template + decltype(auto) operator()(Args&&... args) const + { + return (*obj_)(std::forward(args)...); + } + + T const& get() const { return *obj_; } + + T& get() { return *obj_; } + +private: + //!< Store wrapped object on the stack, should use std::optional in c++17 + T* obj_{}; + alignas(T) unsigned char buf[sizeof(T)]; +}; + + +} // namespace detail +} // namespace gko + +#endif // GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_ diff --git a/core/base/dispatch_helper.hpp b/core/base/dispatch_helper.hpp index 155d5ef6c23..7ca04107575 100644 --- a/core/base/dispatch_helper.hpp +++ b/core/base/dispatch_helper.hpp @@ -54,16 +54,16 @@ namespace gko { * @note this is the end case */ template -void run(T, Func, Args...) +void run(T obj, Func, Args...) { - GKO_NOT_IMPLEMENTED; + GKO_NOT_SUPPORTED(obj); } /** * run uses template to go through the list and select the valid * template and run it. 
* - * @tparam K the current type tried in the convertion + * @tparam K the current type tried in the conversion * @tparam ...Types the other types will be tried in the conversion if K fails * @tparam T the type of input object * @tparam Func the function will run if the object can be converted to K @@ -97,9 +97,9 @@ void run(T obj, Func f, Args... args) */ template