From ac149517f82c714d83a16d7ee584c52156cfb82e Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 23 Apr 2024 18:13:51 +0200
Subject: [PATCH] Add workspace aliasing and use int in logger

---
 ABOUT-LICENSING.md                            |  16 ++
 common/cuda_hip/log/batch_logger.hpp.inc      |   2 +-
 core/log/batch_logger.cpp                     |  53 +++-
 dpcpp/log/batch_logger.hpp                    |   2 +-
 include/ginkgo/core/base/types.hpp            |  16 ++
 .../ginkgo/core/base/workspace_aliases.hpp    | 253 ++++++++++++++++++
 include/ginkgo/core/log/batch_logger.hpp      |  36 +--
 include/ginkgo/core/log/logger.hpp            |   4 +-
 .../ginkgo/core/solver/batch_solver_base.hpp  |   5 +-
 include/ginkgo/ginkgo.hpp                     |   1 +
 reference/log/batch_logger.hpp                |   2 +-
 .../test/solver/batch_bicgstab_kernels.cpp    |   2 +-
 12 files changed, 365 insertions(+), 27 deletions(-)
 create mode 100644 include/ginkgo/core/base/workspace_aliases.hpp
diff --git a/ABOUT-LICENSING.md b/ABOUT-LICENSING.md
index b2fbb31f961..00a803b2ff4 100644
--- a/ABOUT-LICENSING.md
+++ b/ABOUT-LICENSING.md
@@ -238,3 +238,19 @@ When using testing with MPI switched on, the gtest-mpi-listener header only libr
 > THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 > (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 > OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The file `include/ginkgo/core/base/workspace_aliases.hpp` is a modified version of code from CCCL (https://github.com/NVIDIA/cccl). The original code from CCCL is available under the Apache-2.0 and the BSD-3 licenses. We restate the Apache-2.0 license notice below:
+
+> Copyright 2021 NVIDIA Corporation
+>
+>  Licensed under the Apache License, Version 2.0 (the "License");
+>  you may not use this file except in compliance with the License.
+>  You may obtain a copy of the License at
+>
+>      http://www.apache.org/licenses/LICENSE-2.0
+>
+>  Unless required by applicable law or agreed to in writing, software
+>  distributed under the License is distributed on an "AS IS" BASIS,
+>  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+>  See the License for the specific language governing permissions and
+>  limitations under the License.
diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp.inc
index 26134d5002e..04b614b50f9 100644
--- a/common/cuda_hip/log/batch_logger.hpp.inc
+++ b/common/cuda_hip/log/batch_logger.hpp.inc
@@ -9,7 +9,7 @@ template <typename RealType>
 class SimpleFinalLogger final {
 public:
     using real_type = RealType;
-    using idx_type = int64;
+    using idx_type = int;
 
     SimpleFinalLogger(real_type* const batch_residuals,
                       idx_type* const batch_iters)
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
index 41f272b78c5..f87c607f77b 100644
--- a/core/log/batch_logger.cpp
+++ b/core/log/batch_logger.cpp
@@ -13,16 +13,63 @@
 namespace gko {
 namespace batch {
 namespace log {
+// namespace detail {
+
+
+// template <typename ValueType>
+// log_data<ValueType>::log_data(std::shared_ptr<const Executor> exec,
+//                               size_type num_batch_items)
+//     : res_norms(exec), iter_counts(exec)
+// {
+//     if (num_batch_items > 0) {
+//         iter_counts.resize_and_reset(num_batch_items);
+//         res_norms.resize_and_reset(num_batch_items);
+//     } else {
+//         GKO_INVALID_STATE("Invalid num batch items passed in");
+//     }
+// }
+
+
+// template <typename ValueType>
+// log_data<ValueType>::log_data(std::shared_ptr<const Executor> exec,
+//                               size_type num_batch_items,
+//                               array<unsigned char>& workspace)
+//     : res_norms(exec), iter_counts(exec)
+// {
+//     const size_type workspace_size =
+//         num_batch_items * (sizeof(real_type) + sizeof(idx_type));
+
+//     if (num_batch_items > 0 && !workspace.is_owning() &&
+//         workspace.get_size() >= workspace_size) {
+//         gko::detail::layout<2> workspace_alias;
+//         auto slot_1 = workspace_alias.get_slot(0);
+//         auto slot_2 = workspace_alias.get_slot(1);
+
+//         // Temporary storage mapping
+//         workspace_alias.map_to_buffer(workspace.get_data(), workspace_size);
+//         iter_counts = array<idx_type>::view(
+//             exec, num_batch_items,
+//             slot_1->create_alias<idx_type>(num_batch_items).get());
+//         res_norms = array<real_type>::view(
+//             exec, num_batch_items,
+//             slot_2->create_alias<real_type>(num_batch_items).get());
+//     } else {
+//         GKO_INVALID_STATE("invalid workspace or num batch items passed in");
+//     }
+// }
+
+
+// }  // namespace detail
 
 
 template <typename ValueType>
 void BatchConvergence<ValueType>::on_batch_solver_completed(
-    const array<int64>& iteration_count,
+    const array<int>& iteration_count,
     const array<remove_complex<ValueType>>& residual_norm) const
 {
     if (this->iteration_count_.get_size() == 0) {
-        this->iteration_count_ = gko::array<int64>(
-            iteration_count.get_executor(), iteration_count.get_size());
+        this->iteration_count_ = gko::array<int>(iteration_count.get_executor(),
+                                                 iteration_count.get_size());
     }
     if (this->residual_norm_.get_size() == 0) {
         this->residual_norm_ = gko::array<remove_complex<ValueType>>(
diff --git a/dpcpp/log/batch_logger.hpp b/dpcpp/log/batch_logger.hpp
index 663d8b0a2a0..309c624d6fc 100644
--- a/dpcpp/log/batch_logger.hpp
+++ b/dpcpp/log/batch_logger.hpp
@@ -29,7 +29,7 @@ template <typename RealType>
 class SimpleFinalLogger final {
 public:
     using real_type = remove_complex<RealType>;
-    using idx_type = int64;
+    using idx_type = int;
 
     SimpleFinalLogger(real_type* const batch_residuals,
                       idx_type* const batch_iters)
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 39c3c5ab06c..679f13cefb5 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -35,6 +35,22 @@
 #endif  // defined(__CUDACC__) || defined(__HIPCC__)
 
 
+// Uniform names for the device error type and its invalid / success values
+#if defined(__CUDACC__)
+#define GKO_DEVICE_ERROR_TYPE cudaError_t
+#define GKO_DEVICE_ERROR_INVALID cudaErrorInvalidValue
+#define GKO_DEVICE_NO_ERROR cudaSuccess
+#elif defined(__HIPCC__)
+#define GKO_DEVICE_ERROR_TYPE hipError_t
+#define GKO_DEVICE_ERROR_INVALID hipErrorInvalidValue
+#define GKO_DEVICE_NO_ERROR hipSuccess
+#else  // neither __CUDACC__ nor __HIPCC__: fall back to plain int codes
+#define GKO_DEVICE_ERROR_TYPE int
+#define GKO_DEVICE_ERROR_INVALID 1
+#define GKO_DEVICE_NO_ERROR 0
+#endif  // defined(__CUDACC__)
+
+
 #if (defined(__CUDA_ARCH__) && defined(__APPLE__)) || \
     defined(__HIP_DEVICE_COMPILE__)
 
diff --git a/include/ginkgo/core/base/workspace_aliases.hpp b/include/ginkgo/core/base/workspace_aliases.hpp
new file mode 100644
index 00000000000..d6c347de7bd
--- /dev/null
+++ b/include/ginkgo/core/base/workspace_aliases.hpp
@@ -0,0 +1,253 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+
+#ifndef GKO_PUBLIC_CORE_BASE_WORKSPACE_ALIASES_HPP_
+#define GKO_PUBLIC_CORE_BASE_WORKSPACE_ALIASES_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+// This code is a modified version of the code from CCCL
+// (https://github.com/NVIDIA/cccl) (cub/detail/temporary_storage.cuh and
+// cub/temporary_storage.cuh), made available through the Apache-2.0 and BSD-3
+// licenses. See ABOUT-LICENSING.md for more details.
+
+
+namespace gko {
+namespace detail {
+
+
+// Carves workspace_ptr into 8-byte aligned chunks, one per requested size; a
+// null workspace_ptr only stores the required byte count in num_bytes.
+template <int num_allocs>
+GKO_ATTRIBUTES GKO_INLINE GKO_DEVICE_ERROR_TYPE create_workspace_aliases(
+    void* workspace_ptr, std::size_t& num_bytes,
+    void* (&allocations)[num_allocs],
+    std::size_t (&allocation_sizes)[num_allocs])
+{
+    constexpr std::size_t align_bytes = 8;
+    constexpr std::size_t align_mask = ~(align_bytes - 1);
+
+    // Exclusive prefix sum over the sizes, each rounded up to align_bytes
+    std::size_t allocation_offsets[num_allocs];
+    std::size_t bytes_needed = 0;
+    for (int i = 0; i < num_allocs; ++i) {
+        allocation_offsets[i] = bytes_needed;
+        bytes_needed += (allocation_sizes[i] + align_bytes - 1) & align_mask;
+    }
+    bytes_needed += align_bytes - 1;  // slack to round workspace_ptr up
+
+    // A null workspace is a pure size query
+    if (!workspace_ptr) {
+        num_bytes = bytes_needed;
+        return GKO_DEVICE_NO_ERROR;
+    }
+
+    // Reject a buffer that is too small
+    if (num_bytes < bytes_needed) {
+        return GKO_DEVICE_ERROR_INVALID;
+    }
+
+    // Round the base address up to the alignment, then hand out the chunks
+    const auto address = reinterpret_cast<std::size_t>(workspace_ptr);
+    auto* const base =
+        reinterpret_cast<char*>((address + align_bytes - 1) & align_mask);
+    for (int i = 0; i < num_allocs; ++i) {
+        allocations[i] = base + allocation_offsets[i];
+    }
+
+    return GKO_DEVICE_NO_ERROR;
+}
+
+
+class slot;
+
+template <typename T>
+class alias;
+
+template <int num_slots>
+class layout;
+
+// Byte region of a layout: tracks the largest size requested through its
+// aliases and, once the layout is mapped, the pointer backing that region.
+class slot {
+    template <typename T>
+    friend class alias;
+
+    template <int>
+    friend class layout;
+
+public:
+    slot() = default;
+
+    /**
+     * @brief Returns a typed alias of this slot holding @p num_elems elements
+     */
+    template <typename T>
+    GKO_ATTRIBUTES alias<T> create_alias(std::size_t num_elems = 0);
+
+private:
+    GKO_ATTRIBUTES void set_bytes_required(std::size_t new_size)
+    {
+        // Sizes only grow; spelled out so no max() needs to be in scope.
+        size_ = size_ < new_size ? new_size : size_;
+    }
+
+    GKO_ATTRIBUTES std::size_t get_bytes_required() const { return size_; }
+    GKO_ATTRIBUTES void set_storage(void* ptr) { ptr_ = ptr; }
+    GKO_ATTRIBUTES void* get_storage() const { return ptr_; }
+
+    std::size_t size_{};
+    void* ptr_{};
+};
+
+/**
+ * @brief Typed view of the memory region owned by a storage slot
+ *
+ * @par Overview
+ * An alias interprets the bytes of its slot as an array of @p T, much like
+ * one field of a C++ union. Creating an alias can only grow the size that
+ * the slot requests, never shrink it.
+ */
+template <typename T>
+class alias {
+    friend class slot;
+
+public:
+    alias() = delete;
+
+    /**
+     * @brief Returns the pointer to the aliased array
+     *
+     * @return @p nullptr if the alias holds zero elements; otherwise the
+     *         storage pointer of the slot, which is null until the layout
+     *         has been mapped to a buffer.
+     */
+    GKO_ATTRIBUTES T* get() const
+    {
+        return num_elems_ == 0 ? nullptr
+                               : reinterpret_cast<T*>(slot_.get_storage());
+    }
+
+private:
+    // Binds the alias to its slot and registers the bytes it needs there.
+    GKO_ATTRIBUTES explicit alias(slot& slot, std::size_t num_elems = 0)
+        : slot_(slot), num_elems_(num_elems)
+    {
+        this->update_slot();
+    }
+
+    // Forwards the byte count of this alias to the slot.
+    GKO_ATTRIBUTES void update_slot()
+    {
+        const std::size_t bytes_needed = sizeof(T) * num_elems_;
+        slot_.set_bytes_required(bytes_needed);
+    }
+    slot& slot_;
+    std::size_t num_elems_{};
+};
+
+
+template <typename T>
+GKO_ATTRIBUTES alias<T> slot::create_alias(std::size_t num_elems)
+{
+    return alias<T>{*this, num_elems};
+}
+
+
+/**
+ * @brief Temporary storage layout represents a structure with
+ *        @p num_slots union-like fields
+ *
+ * The layout can be mapped to a temporary buffer only once.
+ *
+ * @par A Simple Example
+ * @code
+ * gko::detail::layout<2> temp;
+ *
+ * auto slot_1 = temp.get_slot(0);
+ * auto slot_2 = temp.get_slot(1);
+ *
+ * // Add one field to each slot (before mapping, so the sizes are known)
+ * auto int_array = slot_1->create_alias<int>(1);
+ * auto double_array = slot_2->create_alias<double>(2);
+ *
+ * temp.map_to_buffer(workspace_ptr, num_bytes);
+ *
+ * // Use pointers
+ * int *int_ptr = int_array.get();
+ * double *double_ptr = double_array.get();
+ * @endcode
+ */
+template <int num_slots>
+class layout {
+public:
+    layout() = default;
+
+    /** @brief Returns slot @p slot_id, or @p nullptr if it is out of range. */
+    GKO_ATTRIBUTES slot* get_slot(int slot_id)
+    {
+        // A negative id must be rejected too: it would index before slots_.
+        if (slot_id < 0 || slot_id >= num_slots) {
+            return nullptr;
+        }
+
+        return &slots_[slot_id];
+    }
+
+    /**
+     * @brief Maps the layout to the temporary storage buffer.
+     *
+     * @return GKO_DEVICE_NO_ERROR on success, or an error code if the layout
+     *         is already mapped or the aliasing of the buffer fails.
+     */
+    GKO_ATTRIBUTES GKO_DEVICE_ERROR_TYPE map_to_buffer(void* workspace_ptr,
+                                                       std::size_t num_bytes)
+    {
+        if (is_layout_mapped_) {
+            // TODO: maybe use something similar to cudaErrorAlreadyMapped
+            return GKO_DEVICE_ERROR_INVALID;
+        }
+
+        this->initialize();
+
+        const GKO_DEVICE_ERROR_TYPE error = create_workspace_aliases(
+            workspace_ptr, num_bytes, data_ptrs_, slot_sizes_);
+        if (error != GKO_DEVICE_NO_ERROR) {
+            return error;
+        }
+
+        for (int slot_id = 0; slot_id < num_slots; slot_id++) {
+            slots_[slot_id].set_storage(data_ptrs_[slot_id]);
+        }
+
+        is_layout_mapped_ = true;
+        return error;
+    }
+
+private:
+    // Records the bytes requested by every slot and clears the pointers.
+    GKO_ATTRIBUTES void initialize()
+    {
+        for (int slot_id = 0; slot_id < num_slots; slot_id++) {
+            slot_sizes_[slot_id] = slots_[slot_id].get_bytes_required();
+            data_ptrs_[slot_id] = nullptr;
+        }
+    }
+
+    slot slots_[num_slots];
+    std::size_t slot_sizes_[num_slots];
+    void* data_ptrs_[num_slots];
+    bool is_layout_mapped_{};
+};
+
+
+}  // namespace detail
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_WORKSPACE_ALIASES_HPP_
diff --git a/include/ginkgo/core/log/batch_logger.hpp b/include/ginkgo/core/log/batch_logger.hpp
index 16e0e887320..1c07c19acc0 100644
--- a/include/ginkgo/core/log/batch_logger.hpp
+++ b/include/ginkgo/core/log/batch_logger.hpp
@@ -11,6 +11,7 @@
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/workspace_aliases.hpp>
 #include <ginkgo/core/log/logger.hpp>
 
 
@@ -33,13 +34,11 @@ namespace detail {
 template <typename ValueType>
 struct log_data final {
     using real_type = remove_complex<ValueType>;
-    using idx_type = int64;
+    using idx_type = int;
 
     log_data(std::shared_ptr<const Executor> exec, size_type num_batch_items)
         : res_norms(exec), iter_counts(exec)
     {
-        const size_type workspace_size =
-            num_batch_items * (sizeof(real_type) + sizeof(idx_type));
         if (num_batch_items > 0) {
             iter_counts.resize_and_reset(num_batch_items);
             res_norms.resize_and_reset(num_batch_items);
@@ -52,18 +51,22 @@ struct log_data final {
              array<unsigned char>& workspace)
         : res_norms(exec), iter_counts(exec)
     {
-        const size_type workspace_size =
-            num_batch_items * (sizeof(real_type) + sizeof(idx_type));
+        const size_type workspace_size = num_batch_items * 32;
+
         if (num_batch_items > 0 && !workspace.is_owning() &&
             workspace.get_size() >= workspace_size) {
-            iter_counts = array<idx_type>::view(
-                exec, num_batch_items,
-                reinterpret_cast<idx_type*>(workspace.get_data()));
-            res_norms = array<real_type>::view(
-                exec, num_batch_items,
-                reinterpret_cast<real_type*>(
-                    workspace.get_data() +
-                    (sizeof(idx_type) * num_batch_items)));
+            gko::detail::layout<2> workspace_alias;
+            auto slot_1 = workspace_alias.get_slot(0);
+            auto slot_2 = workspace_alias.get_slot(1);
+            auto iter_alias = slot_1->create_alias<idx_type>(num_batch_items);
+            auto res_alias = slot_2->create_alias<real_type>(num_batch_items);
+
+            // Temporary storage mapping
+            workspace_alias.map_to_buffer(workspace.get_data(), workspace_size);
+            iter_counts =
+                array<idx_type>::view(exec, num_batch_items, iter_alias.get());
+            res_norms =
+                array<real_type>::view(exec, num_batch_items, res_alias.get());
         } else {
             GKO_INVALID_STATE("invalid workspace or num batch items passed in");
         }
@@ -100,10 +103,11 @@ template <typename ValueType = default_precision>
 class BatchConvergence final : public gko::log::Logger {
 public:
     using real_type = remove_complex<ValueType>;
+    using index_type = int;
     using mask_type = gko::log::Logger::mask_type;
 
     void on_batch_solver_completed(
-        const array<int64>& iteration_count,
+        const array<index_type>& iteration_count,
         const array<real_type>& residual_norm) const override;
 
     /**
@@ -129,7 +133,7 @@ class BatchConvergence final : public gko::log::Logger {
     /**
      * @return  The number of iterations for entire batch
      */
-    const array<int64>& get_num_iterations() const noexcept
+    const array<index_type>& get_num_iterations() const noexcept
     {
         return iteration_count_;
     }
@@ -149,7 +153,7 @@ class BatchConvergence final : public gko::log::Logger {
     {}
 
 private:
-    mutable array<int64> iteration_count_{};
+    mutable array<index_type> iteration_count_{};
     mutable array<real_type> residual_norm_{};
 };
 
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index e1c68125811..7f7351addf5 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -566,7 +566,7 @@ public:                                                              \
      * @param residual_norms  the array storing the residual norms.
      */
     virtual void on_batch_solver_completed(
-        const array<int64>& iters, const array<double>& residual_norms) const
+        const array<int>& iters, const array<double>& residual_norms) const
     {}
 
     /**
@@ -577,7 +577,7 @@ public:                                                              \
      * @param residual_norms  the array storing the residual norms.
      */
     virtual void on_batch_solver_completed(
-        const array<int64>& iters, const array<float>& residual_norms) const
+        const array<int>& iters, const array<float>& residual_norms) const
     {}
 
 public:
diff --git a/include/ginkgo/core/solver/batch_solver_base.hpp b/include/ginkgo/core/solver/batch_solver_base.hpp
index 983d30e971e..52ed5ea5378 100644
--- a/include/ginkgo/core/solver/batch_solver_base.hpp
+++ b/include/ginkgo/core/solver/batch_solver_base.hpp
@@ -289,8 +289,9 @@ class EnableBatchSolver
             auto id = Identity::create(exec, system_matrix->get_size());
             preconditioner_ = std::move(id);
         }
-        const size_type workspace_size = system_matrix->get_num_batch_items() *
-                                         (sizeof(real_type) + sizeof(int64));
+        const size_type workspace_size =
+            system_matrix->get_num_batch_items() * 32;
+        // 2 * (sizeof(real_type) + sizeof(int));
         workspace_.set_executor(exec);
         workspace_.resize_and_reset(workspace_size);
     }
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index dede45cf6e6..54d43826478 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -51,6 +51,7 @@
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
 #include <ginkgo/core/base/version.hpp>
+#include <ginkgo/core/base/workspace_aliases.hpp>
 
 #include <ginkgo/core/config/property_tree.hpp>
 
diff --git a/reference/log/batch_logger.hpp b/reference/log/batch_logger.hpp
index 6868511a71b..f8af78b875b 100644
--- a/reference/log/batch_logger.hpp
+++ b/reference/log/batch_logger.hpp
@@ -24,7 +24,7 @@ template <typename RealType>
 class SimpleFinalLogger final {
 public:
     using real_type = RealType;
-    using idx_type = int64;
+    using idx_type = int;
 
     /**
      * Constructor
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
index 2051b1764b4..6dba8a0ede9 100644
--- a/reference/test/solver/batch_bicgstab_kernels.cpp
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -229,7 +229,7 @@ TYPED_TEST(BatchBicgstab, CanSolveEllSystem)
             .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
             .on(this->exec);
     const int num_rows = 13;
-    const size_t num_batch_items = 2;
+    const size_t num_batch_items = 1;
     const int num_rhs = 1;
     auto stencil_mat =
         gko::share(gko::test::generate_3pt_stencil_batch_matrix<const Mtx>(