oneapi-src · abhilash1910 · Mar 25, 2024 · Mar 25, 2024 · May 10, 2024 · May 10, 2024
@@ -527,6 +527,108 @@ class workgroup_load {
 private:
   uint8_t *_local_memory;
 };
+
+/// Selects how workgroup_store writes the items held by each work-item into a
+/// linear segment of items: blocked (direct) or striped.
+enum store_algorithm {
+
+  BLOCK_STORE_DIRECT,  // workgroup_store::store dispatches to store_blocked
+  BLOCK_STORE_STRIPED, // workgroup_store::store dispatches to store_striped
+  // To-do: BLOCK_STORE_WARP_TRANSPOSE (not implemented yet)
+  // To-do: BLOCK_STORE_VECTORIZE (not implemented yet)
+
+};
+
+/// Stores a blocked arrangement of work items into a linear segment of items.
 /// Computes the product of a sparse matrix in CSR format and a dense matrix.
 /// C = alpha * op(A) * B + beta * C
 /// This overload forwards to the csrmm overload that takes one additional
 /// transpose argument, passing oneapi::mkl::transpose::nontrans for it.
 /// \param [in] queue The queue where the routine should be executed. It must
 /// have the in_order property when using the USM mode.
 /// \param [in] trans The operation applied to the matrix A.
 /// \param [in] sparse_rows Number of rows of the matrix A.
 /// \param [in] dense_cols Number of columns of the matrix op(B) or C.
 /// \param [in] sparse_cols Number of columns of the matrix A.
 /// \param [in] alpha Scaling factor for the matrix A.
 /// \param [in] info Matrix info of the matrix A.
 /// \param [in] val An array containing the non-zero elements of the matrix A.
 /// \param [in] row_ptr An array of length \p sparse_rows + 1.
 /// \param [in] col_ind An array containing the column indices in index-based
 /// numbering.
 /// \param [in] b Data of the matrix B.
 /// \param [in] ldb Leading dimension of the matrix B.
 /// \param [in] beta Scaling factor for the matrix C.
 /// \param [in, out] c Data of the matrix C.
 /// \param [in] ldc Leading dimension of the matrix C.
 template <typename T>
 void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans, int sparse_rows,
            int dense_cols, int sparse_cols, const T *alpha,
            const std::shared_ptr<matrix_info> info, const T *val,
            const int *row_ptr, const int *col_ind, const T *b, int ldb,
            const T *beta, T *c, int ldc) {
   // Second transpose argument of the forwarded-to overload (presumably the
   // operation applied to B -- confirm against that overload's declaration).
   const oneapi::mkl::transpose no_transpose = oneapi::mkl::transpose::nontrans;
   csrmm<T>(queue, trans, no_transpose, sparse_rows, dense_cols, sparse_cols,
            alpha, info, val, row_ptr, col_ind, b, ldb, beta, c, ldc);
 }
 /// Computes the product of a sparse matrix in CSR format and a dense matrix.
 /// C = alpha * op(A) * B + beta * C
 /// This overload forwards to the csrmm overload that takes one additional
 /// transpose argument, passing oneapi::mkl::transpose::nontrans for it.
 /// \param [in] queue The queue where the routine should be executed. It must
 /// have the in_order property when using the USM mode.
 /// \param [in] trans The operation applied to the matrix A.
 /// \param [in] sparse_rows Number of rows of the matrix A.
 /// \param [in] dense_cols Number of columns of the matrix op(B) or C.
 /// \param [in] sparse_cols Number of columns of the matrix A.
 /// \param [in] alpha Scaling factor for the matrix A.
 /// \param [in] info Matrix info of the matrix A.
 /// \param [in] val An array containing the non-zero elements of the matrix A.
 /// \param [in] row_ptr An array of length \p sparse_rows + 1.
 /// \param [in] col_ind An array containing the column indices in index-based
 /// numbering.
 /// \param [in] b Data of the matrix B.
 /// \param [in] ldb Leading dimension of the matrix B.
 /// \param [in] beta Scaling factor for the matrix C.
 /// \param [in, out] c Data of the matrix C.
 /// \param [in] ldc Leading dimension of the matrix C.
 template <typename T>
 void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans, int sparse_rows,
            int dense_cols, int sparse_cols, const T *alpha,
            const std::shared_ptr<matrix_info> info, const T *val,
            const int *row_ptr, const int *col_ind, const T *b, int ldb,
            const T *beta, T *c, int ldc) {
   // Second transpose argument of the forwarded-to overload (presumably the
   // operation applied to B -- confirm against that overload's declaration).
   const oneapi::mkl::transpose no_transpose = oneapi::mkl::transpose::nontrans;
   csrmm<T>(queue, trans, no_transpose, sparse_rows, dense_cols, sparse_cols,
            alpha, info, val, row_ptr, col_ind, b, ldb, beta, c, ldc);
 }
+/// Writes the items held by one work-item into the linear segment starting at
+/// \p block_itr, using a blocked layout: the work-item with local linear id t
+/// writes items[i] to block_itr[t * ITEMS_PER_WORK_ITEM + i].
+/// \tparam ITEMS_PER_WORK_ITEM Number of items held by each work-item.
+/// \tparam InputT Type of the items being stored.
+/// \tparam OutputIteratorT Output iterator type; must support `+` and `[]`.
+/// \tparam Item Work-item handle type providing get_local_linear_id().
+/// \param [in] item The calling work-item.
+/// \param [out] block_itr Start of the linear output segment.
+/// \param [in] items The items held by the calling work-item.
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT,
+          typename OutputIteratorT, typename Item>
+__dpct_inline__ void store_blocked(const Item &item, OutputIteratorT block_itr,
+                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // To-do: range (guarded) storage across work-group items is not handled
+  // here; decide whether it is required for group storage.
+  size_t first_slot = item.get_local_linear_id();
+  // Each work-item owns ITEMS_PER_WORK_ITEM consecutive output slots.
+  OutputIteratorT dst = block_itr + (first_slot * ITEMS_PER_WORK_ITEM);
+#pragma unroll
+  for (uint32_t i = 0; i < ITEMS_PER_WORK_ITEM; i++) {
+    dst[i] = items[i];
+  }
+}
+
+/// Stores a striped arrangement of work items into a linear segment of items.
+///
+/// The work-item with local linear id t writes items[i] to
+/// block_itr[t + i * W], where W is the number of work-items in the
+/// work-group (item.get_local_range().size()).
+/// \tparam ITEMS_PER_WORK_ITEM Number of items held by each work-item.
+/// \tparam InputT Type of the items being stored.
+/// \tparam OutputIteratorT Output iterator type; must support `+` and `[]`.
+/// \tparam Item Work-item handle type providing get_local_linear_id() and
+/// get_local_range().
+/// \param [in] item The calling work-item.
+/// \param [out] block_itr Start of the linear output segment.
+/// \param [in] items The items held by the calling work-item.
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT,
+          typename OutputIteratorT, typename Item>
+__dpct_inline__ void store_striped(const Item &item, OutputIteratorT block_itr,
+                                  InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // This implementation does not take in account range storage across
+  // workgroup items To-do: Decide whether range storage is required for group
+  // storage
+  size_t linear_tid = item.get_local_linear_id();
+  OutputIteratorT workitem_itr = block_itr + linear_tid;
+  // The stripe stride is the work-group size. The index above is local to the
+  // work-group, so using the global range here would scatter writes outside
+  // the group's segment whenever more than one work-group is launched.
+  size_t group_work_items = item.get_local_range().size();
+#pragma unroll
+  for (uint32_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
+    workitem_itr[(idx * group_work_items)] = items[idx];
+  }
+}
+
+/// Stores a sub-group-striped (warp-striped) arrangement of work items into a
+/// linear segment of items.
+///
+/// Within sub-group g of size S, the work-item with sub-group local id l
+/// writes items[i] to block_itr[g * ITEMS_PER_WORK_ITEM * S + l + i * S].
+// Kept as a free function until the exchange mechanism is implemented.
+// To-do: inline this function with BLOCK_STORE_WARP_TRANSPOSE mechanism
+template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename OutputIteratorT,
+          typename Item>
+__dpct_inline__ void
+store_subgroup_striped(const Item &item, OutputIteratorT block_itr,
+                                    InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+  // To-do: range (guarded) storage across work-group items is not handled
+  // here; decide whether it is required for group storage.
+  uint32_t lane = item.get_sub_group().get_local_linear_id();
+  uint32_t lanes_per_subgroup = item.get_sub_group().get_local_linear_range();
+  uint32_t subgroup_in_group = item.get_sub_group().get_group_linear_id();
+  // First output slot of this work-item: start of its sub-group's chunk plus
+  // its position inside the sub-group.
+  uint32_t first_slot =
+      (subgroup_in_group * ITEMS_PER_WORK_ITEM * lanes_per_subgroup) + lane;
+  OutputIteratorT dst = block_itr + first_slot;
+#pragma unroll
+  for (uint32_t i = 0; i < ITEMS_PER_WORK_ITEM; i++) {
+    // Consecutive items of one work-item are one sub-group width apart.
+    dst[(i * lanes_per_subgroup)] = items[i];
+  }
+}
+
+/// Stores the items held by the work-items of a work-group into a linear
+/// segment of items, using the arrangement selected by ALGORITHM.
+// template parameters :
+// ITEMS_PER_WORK_ITEM: size_t variable controlling the number of items per
+// thread/work_item
+// ALGORITHM: store_algorithm variable controlling the type of store operation.
+// InputT: type for input sequence.
+// OutputIteratorT:  output iterator type
+// Item : typename parameter resembling sycl::nd_item<3> .
+template <size_t ITEMS_PER_WORK_ITEM, store_algorithm ALGORITHM, typename InputT,
+          typename OutputIteratorT, typename Item>
+class workgroup_store {
+public:
+  /// Returns the local memory size needed for \p group_work_items work-items.
+  /// Always 0 here; the argument is unused.
+  static size_t get_local_memory_size(size_t group_work_items) { return 0; }
+
+  /// \param [in] local_memory Pointer kept by the object; store() does not
+  /// access it.
+  workgroup_store(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  /// Stores \p items of the calling work-item into the segment starting at
+  /// \p block_itr.
+  /// BLOCK_STORE_DIRECT dispatches to store_blocked, BLOCK_STORE_STRIPED to
+  /// store_striped; any other ALGORITHM value stores nothing.
+  /// \param [in] item The calling work-item.
+  /// \param [out] block_itr Start of the linear output segment.
+  /// \param [in] items The items held by the calling work-item.
+  __dpct_inline__ void store(const Item &item, OutputIteratorT block_itr,
+                            InputT (&items)[ITEMS_PER_WORK_ITEM]) {
+
+    // Pass the array itself. `(&items)[ITEMS_PER_WORK_ITEM]` is
+    // `*(&items + ITEMS_PER_WORK_ITEM)`, i.e. an array located
+    // ITEMS_PER_WORK_ITEM whole arrays past the caller's data, so the store
+    // would read out of bounds.
+    if constexpr (ALGORITHM == BLOCK_STORE_DIRECT) {
+      store_blocked<ITEMS_PER_WORK_ITEM>(item, block_itr, items);
+    } else if constexpr (ALGORITHM == BLOCK_STORE_STRIPED) {
+      store_striped<ITEMS_PER_WORK_ITEM>(item, block_itr, items);
+    }
+  }
+
+private:
+  uint8_t *_local_memory;
+};
+
 } // namespace group
 } // namespace dpct