apply script

ginkgo-project · May 19, 2024 · 5f4c652 · 5f4c652
1 parent 127c204
commit 5f4c652
Show file tree

Hide file tree

Showing 167 changed files with 4,029 additions and 8,735 deletions.
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(unified)
 set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE)
+set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
@@ -0,0 +1,58 @@
+set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
+    base/device_matrix_data_kernels.cpp
+    base/kernel_launch.hpp
+    base/kernel_launch_reduction.hpp
+    base/kernel_launch_solver.hpp
+    base/math.hpp
+    components/atomic.hpp
+    components/diagonal_block_manipulation.hpp
+    components/intrinsics.hpp
+    components/merging.hpp
+    components/prefix_sum.hpp
+    components/prefix_sum_kernels.cpp
+    components/reduction.hpp
+    components/searching.hpp
+    components/segment_scan.hpp
+    components/sorting.hpp
+    components/syncfree.hpp
+    components/thread_ids.hpp
+    components/uninitialized_array.hpp
+    components/warp_blas.hpp
+    distributed/index_map_kernels.cpp
+    distributed/matrix_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
+    distributed/partition_kernels.cpp
+    distributed/vector_kernels.cpp
+    factorization/cholesky_kernels.cpp
+    factorization/factorization_kernels.cpp
+    factorization/lu_kernels.cpp
+    factorization/par_ic_kernels.cpp
+    factorization/par_ict_kernels.cpp
+    factorization/par_ilu_kernels.cpp
+    factorization/par_ilut_filter_kernels.cpp
+    factorization/par_ilut_select_kernels.cpp
+    factorization/par_ilut_spgeam_kernels.cpp
+    factorization/par_ilut_sweep_kernels.cpp
+    log/batch_logger.hpp
+    matrix/batch_csr_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/dense_kernels.cpp
+    matrix/diagonal_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/fbcsr_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    multigrid/pgm_kernels.cpp
+    preconditioner/isai_kernels.cpp
+    preconditioner/jacobi_kernels.cpp
+    reorder/rcm_kernels.cpp
+    solver/cb_gmres_kernels.cpp
+    solver/idr_kernels.cpp
+    solver/multigrid_kernels.cpp
+    stop/batch_criteria.hpp
+    )
+list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
+set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
diff --git a/...p/base/batch_multi_vector_kernels.hpp.inc → ...a_hip/base/batch_multi_vector_kernels.cpp b/...p/base/batch_multi_vector_kernels.hpp.inc → ...a_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,50 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <common/cuda_hip/base/batch_struct.hpp>
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -299,3 +343,14 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
         copy(src_b, dst_b);
     }
 }
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/...p/base/device_matrix_data_kernels.hpp.inc → ...a_hip/base/device_matrix_data_kernels.cpp b/...p/base/device_matrix_data_kernels.hpp.inc → ...a_hip/base/device_matrix_data_kernels.cpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "core/base/device_matrix_data_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace components {
+
+
 template <typename ValueType, typename IndexType>
 void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
@@ -99,3 +119,9 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
+
+
+}  // namespace components
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc → common/cuda_hip/base/kernel_launch.hpp b/common/cuda_hip/base/kernel_launch.hpp.inc → common/cuda_hip/base/kernel_launch.hpp
@@ -2,6 +2,55 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
+#endif
+
+
+#include <accessor/device_helper.hpp>
+
+
+#include <thrust/tuple.h>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+template <typename AccessorType>
+struct to_device_type_impl<gko::acc::range<AccessorType>&> {
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
+        std::declval<gko::acc::range<AccessorType>>()))>;
+    static type map_to_device(gko::acc::range<AccessorType>& range)
+    {
+        return gko::acc::as_device_range(range);
+    }
+};
+
+template <typename AccessorType>
+struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
+        std::declval<gko::acc::range<AccessorType>>()))>;
+    static type map_to_device(const gko::acc::range<AccessorType>& range)
+    {
+        return gko::acc::as_device_range(range);
+    }
+};
+
+
+namespace device_std = thrust;
+
+
+constexpr int default_block_size = 512;
+
+
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
     int64 size, KernelFunction fn, KernelArgs... args)
@@ -52,3 +101,8 @@ void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
             map_to_device(args)...);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/..._hip/base/kernel_launch_reduction.hpp.inc → ...cuda_hip/base/kernel_launch_reduction.hpp b/..._hip/base/kernel_launch_reduction.hpp.inc → ...cuda_hip/base/kernel_launch_reduction.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
           typename FinalizeOp, typename... KernelArgs>
 __global__ __launch_bounds__(
@@ -505,3 +523,8 @@ void run_kernel_col_reduction_cached(
         }
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/...uda_hip/base/kernel_launch_solver.hpp.inc → ...on/cuda_hip/base/kernel_launch_solver.hpp b/...uda_hip/base/kernel_launch_solver.hpp.inc → ...on/cuda_hip/base/kernel_launch_solver.hpp
@@ -2,6 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
+#endif
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
     int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
@@ -32,3 +46,8 @@ void run_kernel_solver(std::shared_ptr<const DefaultExecutor> exec,
             static_cast<int64>(default_stride), fn, map_to_device(args)...);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/math.hpp.inc → common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp.inc → common/cuda_hip/base/math.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
+
+
+#include <thrust/complex.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+
+
 // We need this struct, because otherwise we would call a __host__ function in a
 // __device__ function (even though it is constexpr)
 template <typename T>
@@ -37,3 +50,9 @@ struct truncate_type_impl<thrust::complex<T>> {
 
 
 }  // namespace detail
+
+
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
diff --git a/common/cuda_hip/components/atomic.hpp.inc → common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp.inc → common/cuda_hip/components/atomic.hpp
@@ -2,6 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
+
+
+#include <type_traits>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -228,3 +244,11 @@ __forceinline__ __device__ thrust::complex<double> atomic_add(
     auto imag = atomic_add(addr + 1, val.imag());
     return {real, imag};
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
diff --git a/...nents/diagonal_block_manipulation.hpp.inc → ...omponents/diagonal_block_manipulation.hpp b/...nents/diagonal_block_manipulation.hpp.inc → ...omponents/diagonal_block_manipulation.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
+
+
+#include <type_traits>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace csr {
+
+
 /**
  * @internal
  *
@@ -63,3 +81,12 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
         }
     }
 }
+
+
+}  // namespace csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_