From ac149517f82c714d83a16d7ee584c52156cfb82e Mon Sep 17 00:00:00 2001 From: Pratik Nayak Date: Tue, 23 Apr 2024 18:13:51 +0200 Subject: [PATCH] Add workspace aliasing and use int in logger --- ABOUT-LICENSING.md | 16 ++ common/cuda_hip/log/batch_logger.hpp.inc | 2 +- core/log/batch_logger.cpp | 53 +++- dpcpp/log/batch_logger.hpp | 2 +- include/ginkgo/core/base/types.hpp | 16 ++ .../ginkgo/core/base/workspace_aliases.hpp | 253 ++++++++++++++++++ include/ginkgo/core/log/batch_logger.hpp | 36 +-- include/ginkgo/core/log/logger.hpp | 4 +- .../ginkgo/core/solver/batch_solver_base.hpp | 5 +- include/ginkgo/ginkgo.hpp | 1 + reference/log/batch_logger.hpp | 2 +- .../test/solver/batch_bicgstab_kernels.cpp | 2 +- 12 files changed, 365 insertions(+), 27 deletions(-) create mode 100644 include/ginkgo/core/base/workspace_aliases.hpp diff --git a/ABOUT-LICENSING.md b/ABOUT-LICENSING.md index b2fbb31f961..00a803b2ff4 100644 --- a/ABOUT-LICENSING.md +++ b/ABOUT-LICENSING.md @@ -238,3 +238,19 @@ When using testing with MPI switched on, the gtest-mpi-listener header only libr > THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The file `workspace_aliases.hpp` is a modified version of the code from CCCL (https://github.com/NVIDIA/cccl). The original code from CCCL is available through the Apache-2.0 and the BSD-3 licenses. We re-state the Apache-2.0 license here below: + +> Copyright 2021 NVIDIA Corporation +> +> Licensed under the Apache License, Version 2.0 (the "License"); +> you may not use this file except in compliance with the License. 
+> You may obtain a copy of the License at +> +> http://www.apache.org/licenses/LICENSE-2.0 +> +> Unless required by applicable law or agreed to in writing, software +> distributed under the License is distributed on an "AS IS" BASIS, +> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +> See the License for the specific language governing permissions and +> limitations under the License. diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp.inc index 26134d5002e..04b614b50f9 100644 --- a/common/cuda_hip/log/batch_logger.hpp.inc +++ b/common/cuda_hip/log/batch_logger.hpp.inc @@ -9,7 +9,7 @@ template class SimpleFinalLogger final { public: using real_type = RealType; - using idx_type = int64; + using idx_type = int; SimpleFinalLogger(real_type* const batch_residuals, idx_type* const batch_iters) diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp index 41f272b78c5..f87c607f77b 100644 --- a/core/log/batch_logger.cpp +++ b/core/log/batch_logger.cpp @@ -13,16 +13,63 @@ namespace gko { namespace batch { namespace log { +// namespace detail { + + +// template +// log_data::log_data(std::shared_ptr exec, +// size_type num_batch_items) +// : res_norms(exec), iter_counts(exec) +// { +// if (num_batch_items > 0) { +// iter_counts.resize_and_reset(num_batch_items); +// res_norms.resize_and_reset(num_batch_items); +// } else { +// GKO_INVALID_STATE("Invalid num batch items passed in"); +// } +// } + + +// template +// log_data::log_data(std::shared_ptr exec, +// size_type num_batch_items, +// array& workspace) +// : res_norms(exec), iter_counts(exec) +// { +// const size_type workspace_size = +// num_batch_items * (sizeof(real_type) + sizeof(idx_type)); + +// if (num_batch_items > 0 && !workspace.is_owning() && +// workspace.get_size() >= workspace_size) { +// gko::detail::layout<2> workspace_alias; +// auto slot_1 = workspace_alias.get_slot(0); +// auto slot_2 = workspace_alias.get_slot(1); + +// // 
Temporary storage mapping +// workspace_alias.map_to_buffer(workspace.get_data(), workspace_size); +// iter_counts = array::view( +// exec, num_batch_items, +// slot_1->create_alias(num_batch_items).get()); +// res_norms = array::view( +// exec, num_batch_items, +// slot_2->create_alias(num_batch_items).get()); +// } else { +// GKO_INVALID_STATE("invalid workspace or num batch items passed in"); +// } +// } + + +// } // namespace detail template void BatchConvergence::on_batch_solver_completed( - const array& iteration_count, + const array& iteration_count, const array>& residual_norm) const { if (this->iteration_count_.get_size() == 0) { - this->iteration_count_ = gko::array( - iteration_count.get_executor(), iteration_count.get_size()); + this->iteration_count_ = gko::array(iteration_count.get_executor(), + iteration_count.get_size()); } if (this->residual_norm_.get_size() == 0) { this->residual_norm_ = gko::array>( diff --git a/dpcpp/log/batch_logger.hpp b/dpcpp/log/batch_logger.hpp index 663d8b0a2a0..309c624d6fc 100644 --- a/dpcpp/log/batch_logger.hpp +++ b/dpcpp/log/batch_logger.hpp @@ -29,7 +29,7 @@ template class SimpleFinalLogger final { public: using real_type = remove_complex; - using idx_type = int64; + using idx_type = int; SimpleFinalLogger(real_type* const batch_residuals, idx_type* const batch_iters) diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 39c3c5ab06c..679f13cefb5 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -35,6 +35,22 @@ #endif // defined(__CUDACC__) || defined(__HIPCC__) +// Macros for handling different device error return types uniformly +#if defined(__CUDACC__) +#define GKO_DEVICE_ERROR_TYPE cudaError_t +#define GKO_DEVICE_ERROR_INVALID cudaErrorInvalidValue +#define GKO_DEVICE_NO_ERROR cudaSuccess +#elif defined(__HIPCC__) +#define GKO_DEVICE_ERROR_TYPE hipError_t +#define GKO_DEVICE_ERROR_INVALID hipErrorInvalidValue +#define GKO_DEVICE_NO_ERROR 
hipSuccess +#else +#define GKO_DEVICE_ERROR_TYPE int +#define GKO_DEVICE_ERROR_INVALID 1 +#define GKO_DEVICE_NO_ERROR 0 +#endif + + #if (defined(__CUDA_ARCH__) && defined(__APPLE__)) || \ defined(__HIP_DEVICE_COMPILE__) diff --git a/include/ginkgo/core/base/workspace_aliases.hpp b/include/ginkgo/core/base/workspace_aliases.hpp new file mode 100644 index 00000000000..d6c347de7bd --- /dev/null +++ b/include/ginkgo/core/base/workspace_aliases.hpp @@ -0,0 +1,253 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + + +#ifndef GKO_PUBLIC_CORE_BASE_WORKSPACE_ALIASES_HPP_ +#define GKO_PUBLIC_CORE_BASE_WORKSPACE_ALIASES_HPP_ + + +#include +#include + + +// This code is a modified version of the code from CCCL +// (https://github.com/NVIDIA/cccl) (cub/detail/temporary_storage.cuh and +// cub/temporary_storage.cuh), made available through the Apache-2.0 and BSD-3 +// licenses. See ABOUT-LICENSING.md for more details. + + +namespace gko { +namespace detail { + + +template +GKO_ATTRIBUTES GKO_INLINE GKO_DEVICE_ERROR_TYPE create_workspace_aliases( + void* workspace_ptr, size_t& num_bytes, void* (&allocations)[num_allocs], + size_t (&allocation_sizes)[num_allocs]) +{ + constexpr int align_bytes = 8; + constexpr int align_mask = ~(align_bytes - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[num_allocs]; + size_t bytes_needed = 0; + for (int i = 0; i < num_allocs; ++i) { + size_t allocation_bytes = + (allocation_sizes[i] + align_bytes - 1) & align_mask; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += align_bytes - 1; + + // Check if the caller is simply requesting the size of the storage + // allocation + if (!workspace_ptr) { + num_bytes = bytes_needed; + return GKO_DEVICE_NO_ERROR; + } + + // Check if enough storage provided + if (num_bytes < bytes_needed) { + return GKO_DEVICE_ERROR_INVALID; + } + + // Alias + workspace_ptr = + 
(void*)((size_t(workspace_ptr) + align_bytes - 1) & align_mask); + for (int i = 0; i < num_allocs; ++i) { + allocations[i] = + static_cast(workspace_ptr) + allocation_offsets[i]; + } + + return GKO_DEVICE_NO_ERROR; +} + + +class slot; + +template +class alias; + +template +class layout; + +class slot { + template + friend class alias; + + template + friend class layout; + +public: + slot() = default; + + /** + * @brief Returns an array of type @p T and length @p num_elems + */ + template + GKO_ATTRIBUTES alias create_alias(std::size_t num_elems = 0); + +private: + GKO_ATTRIBUTES void set_bytes_required(std::size_t new_size) + { + size_ = max(size_, new_size); + } + + GKO_ATTRIBUTES std::size_t get_bytes_required() const { return size_; } + + GKO_ATTRIBUTES void set_storage(void* ptr) { ptr_ = ptr; } + + GKO_ATTRIBUTES void* get_storage() const { return ptr_; } + + std::size_t size_{}; + + void* ptr_{}; +}; + +/** + * @brief Named memory region of a temporary storage slot + * + * @par Overview + * This class provides a typed wrapper of a temporary slot memory region. + * It can be considered as a field in the C++ union. It's only possible to + * increase the array size. + */ +template +class alias { + friend class slot; + +public: + alias() = delete; + + /** + * @brief Returns pointer to array + * + * If the @p num_elems number is equal to zero, or storage layout isn't + * mapped, + * @p nullptr is returned. 
+ */ + GKO_ATTRIBUTES T* get() const + { + if (num_elems_ == 0) { + return nullptr; + } + + return reinterpret_cast(slot_.get_storage()); + } + +private: + GKO_ATTRIBUTES explicit alias(slot& slot, std::size_t num_elems = 0) + : slot_(slot), num_elems_(num_elems) + { + this->update_slot(); + } + + GKO_ATTRIBUTES void update_slot() + { + slot_.set_bytes_required(num_elems_ * sizeof(T)); + } + slot& slot_; + std::size_t num_elems_{}; +}; + + +template +GKO_ATTRIBUTES alias slot::create_alias(std::size_t num_elems) +{ + return alias(*this, num_elems); +} + + +/** + * @brief Temporary storage layout represents a structure with + * @p num_slots union-like fields + * + * The layout can be mapped to a temporary buffer only once. + * + * @par A Simple Example + * @code + * gko::detail::layout<2> temp; + * + * auto slot_1 = temp.get_slot(0); + * auto slot_2 = temp.get_slot(1); + * + * // Add one field to each slot + * auto int_array = slot_1->create_alias<int>(1); + * auto double_array = slot_2->create_alias<double>(2); + * + * temp.map_to_buffer(workspace_ptr, num_bytes); + * + * // Use pointers + * int *int_ptr = int_array.get(); + * double *double_ptr = double_array.get(); + * @endcode + */ +template +class layout { +public: + layout() = default; + + GKO_ATTRIBUTES slot* get_slot(int slot_id) + { + if (slot_id < num_slots) { + return &slots_[slot_id]; + } + + return nullptr; + } + + /** + * @brief Maps the layout to the temporary storage buffer. 
+ */ + GKO_ATTRIBUTES GKO_DEVICE_ERROR_TYPE map_to_buffer(void* workspace_ptr, + std::size_t num_bytes) + { + if (is_layout_mapped_) { + return GKO_DEVICE_ERROR_INVALID; // TODO: maybe use something + // similar to + // cudaErrorAlreadyMapped + } + + this->initialize(); + + GKO_DEVICE_ERROR_TYPE error = GKO_DEVICE_NO_ERROR; + if ((error = create_workspace_aliases(workspace_ptr, num_bytes, + data_ptrs_, slot_sizes_))) { + return error; + } + + for (std::size_t slot_id = 0; slot_id < num_slots; slot_id++) { + slots_[slot_id].set_storage(data_ptrs_[slot_id]); + } + + is_layout_mapped_ = true; + return error; + } + +private: + GKO_ATTRIBUTES void initialize() + { + if (is_layout_mapped_) { + return; + } + + for (std::size_t slot_id = 0; slot_id < num_slots; slot_id++) { + const std::size_t slot_size = slots_[slot_id].get_bytes_required(); + + slot_sizes_[slot_id] = slot_size; + data_ptrs_[slot_id] = nullptr; + } + } + slot slots_[num_slots]; + std::size_t slot_sizes_[num_slots]; + void* data_ptrs_[num_slots]; + bool is_layout_mapped_{}; +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_WORKSPACE_ALIASES_HPP_ diff --git a/include/ginkgo/core/log/batch_logger.hpp b/include/ginkgo/core/log/batch_logger.hpp index 16e0e887320..1c07c19acc0 100644 --- a/include/ginkgo/core/log/batch_logger.hpp +++ b/include/ginkgo/core/log/batch_logger.hpp @@ -11,6 +11,7 @@ #include #include +#include #include @@ -33,13 +34,11 @@ namespace detail { template struct log_data final { using real_type = remove_complex; - using idx_type = int64; + using idx_type = int; log_data(std::shared_ptr exec, size_type num_batch_items) : res_norms(exec), iter_counts(exec) { - const size_type workspace_size = - num_batch_items * (sizeof(real_type) + sizeof(idx_type)); if (num_batch_items > 0) { iter_counts.resize_and_reset(num_batch_items); res_norms.resize_and_reset(num_batch_items); @@ -52,18 +51,22 @@ struct log_data final { array& workspace) : res_norms(exec), 
iter_counts(exec) { - const size_type workspace_size = - num_batch_items * (sizeof(real_type) + sizeof(idx_type)); + const size_type workspace_size = num_batch_items * 32; + if (num_batch_items > 0 && !workspace.is_owning() && workspace.get_size() >= workspace_size) { - iter_counts = array::view( - exec, num_batch_items, - reinterpret_cast(workspace.get_data())); - res_norms = array::view( - exec, num_batch_items, - reinterpret_cast( - workspace.get_data() + - (sizeof(idx_type) * num_batch_items))); + gko::detail::layout<2> workspace_alias; + auto slot_1 = workspace_alias.get_slot(0); + auto slot_2 = workspace_alias.get_slot(1); + auto iter_alias = slot_1->create_alias(num_batch_items); + auto res_alias = slot_2->create_alias(num_batch_items); + + // Temporary storage mapping + workspace_alias.map_to_buffer(workspace.get_data(), workspace_size); + iter_counts = + array::view(exec, num_batch_items, iter_alias.get()); + res_norms = + array::view(exec, num_batch_items, res_alias.get()); } else { GKO_INVALID_STATE("invalid workspace or num batch items passed in"); } @@ -100,10 +103,11 @@ template class BatchConvergence final : public gko::log::Logger { public: using real_type = remove_complex; + using index_type = int; using mask_type = gko::log::Logger::mask_type; void on_batch_solver_completed( - const array& iteration_count, + const array& iteration_count, const array& residual_norm) const override; /** @@ -129,7 +133,7 @@ class BatchConvergence final : public gko::log::Logger { /** * @return The number of iterations for entire batch */ - const array& get_num_iterations() const noexcept + const array& get_num_iterations() const noexcept { return iteration_count_; } @@ -149,7 +153,7 @@ class BatchConvergence final : public gko::log::Logger { {} private: - mutable array iteration_count_{}; + mutable array iteration_count_{}; mutable array residual_norm_{}; }; diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index 
e1c68125811..7f7351addf5 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -566,7 +566,7 @@ public: \ * @param residual_norms the array storing the residual norms. */ virtual void on_batch_solver_completed( - const array& iters, const array& residual_norms) const + const array& iters, const array& residual_norms) const {} /** @@ -577,7 +577,7 @@ public: \ * @param residual_norms the array storing the residual norms. */ virtual void on_batch_solver_completed( - const array& iters, const array& residual_norms) const + const array& iters, const array& residual_norms) const {} public: diff --git a/include/ginkgo/core/solver/batch_solver_base.hpp b/include/ginkgo/core/solver/batch_solver_base.hpp index 983d30e971e..52ed5ea5378 100644 --- a/include/ginkgo/core/solver/batch_solver_base.hpp +++ b/include/ginkgo/core/solver/batch_solver_base.hpp @@ -289,8 +289,9 @@ class EnableBatchSolver auto id = Identity::create(exec, system_matrix->get_size()); preconditioner_ = std::move(id); } - const size_type workspace_size = system_matrix->get_num_batch_items() * - (sizeof(real_type) + sizeof(int64)); + const size_type workspace_size = + system_matrix->get_num_batch_items() * 32; + // 2 * (sizeof(real_type) + sizeof(int)); workspace_.set_executor(exec); workspace_.resize_and_reset(workspace_size); } diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index dede45cf6e6..54d43826478 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -51,6 +51,7 @@ #include #include #include +#include #include diff --git a/reference/log/batch_logger.hpp b/reference/log/batch_logger.hpp index 6868511a71b..f8af78b875b 100644 --- a/reference/log/batch_logger.hpp +++ b/reference/log/batch_logger.hpp @@ -24,7 +24,7 @@ template class SimpleFinalLogger final { public: using real_type = RealType; - using idx_type = int64; + using idx_type = int; /** * Constructor diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp 
b/reference/test/solver/batch_bicgstab_kernels.cpp index 2051b1764b4..6dba8a0ede9 100644 --- a/reference/test/solver/batch_bicgstab_kernels.cpp +++ b/reference/test/solver/batch_bicgstab_kernels.cpp @@ -229,7 +229,7 @@ TYPED_TEST(BatchBicgstab, CanSolveEllSystem) .with_tolerance_type(gko::batch::stop::tolerance_type::relative) .on(this->exec); const int num_rows = 13; - const size_t num_batch_items = 2; + const size_t num_batch_items = 1; const int num_rhs = 1; auto stencil_mat = gko::share(gko::test::generate_3pt_stencil_batch_matrix(