Make a scaffold for opsax and sasax (#10)

lab-cosmo · Nov 22, 2023 · 2ea1745 · 2ea1745
1 parent 9303ba0
commit 2ea1745
Show file tree

Hide file tree

Showing 28 changed files with 928 additions and 29 deletions.
diff --git a/mops/CMakeLists.txt b/mops/CMakeLists.txt
@@ -59,6 +59,10 @@ add_library(mops
     "src/sap/capi.cpp"
     "src/hpe/hpe.cpp"
     "src/hpe/capi.cpp"
+    "src/opsax/opsax.cpp"
+    "src/opsax/capi.cpp"
+    "src/sasax/sasax.cpp"
+    "src/sasax/capi.cpp"
 
     "include/mops.hpp"
     "include/mops.h"
@@ -69,6 +73,10 @@ add_library(mops
     "include/mops/sap.h"
     "include/mops/hpe.hpp"
     "include/mops/hpe.h"
+    "include/mops/opsax.hpp"
+    "include/mops/opsax.h"
+    "include/mops/sasax.hpp"
+    "include/mops/sasax.h"
 )
 
 if(CMAKE_CUDA_COMPILER)

diff --git a/mops/include/mops.h b/mops/include/mops.h
@@ -2,7 +2,11 @@
 #define MOPS_H
 
 #include "mops/exports.h"     // IWYU pragma: export
+#include "mops/hpe.h"         // IWYU pragma: export
 #include "mops/opsa.h"        // IWYU pragma: export
+#include "mops/sap.h"         // IWYU pragma: export
+#include "mops/opsax.h"       // IWYU pragma: export
+#include "mops/sasax.h"       // IWYU pragma: export
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/mops/include/mops.hpp b/mops/include/mops.hpp
@@ -1,10 +1,14 @@
 #ifndef MOPS_HPP
 #define MOPS_HPP
 
-#include "mops/exports.h"    // IWYU pragma: export
-
-#include "mops/capi.hpp"     // IWYU pragma: export
-#include "mops/opsa.hpp"     // IWYU pragma: export
+#include "mops/exports.h"       // IWYU pragma: export
+
+#include "mops/capi.hpp"        // IWYU pragma: export
+#include "mops/hpe.hpp"         // IWYU pragma: export
+#include "mops/opsa.hpp"        // IWYU pragma: export
+#include "mops/sap.hpp"         // IWYU pragma: export
+#include "mops/opsax.hpp"       // IWYU pragma: export
+#include "mops/sasax.hpp"       // IWYU pragma: export
 
 
 #endif
diff --git a/mops/include/mops/hpe.h b/mops/include/mops/hpe.h
@@ -1,5 +1,5 @@
-#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_H
-#define MOPS_OUTER_PRODUCT_SCATTER_ADD_H
+#ifndef MOPS_HPE_H
+#define MOPS_HPE_H
 
 #include "mops/exports.h"
 #include "mops/tensor.h"

diff --git a/mops/include/mops/hpe.hpp b/mops/include/mops/hpe.hpp
@@ -1,5 +1,5 @@
-#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
-#define MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
+#ifndef MOPS_HPE_HPP
+#define MOPS_HPE_HPP
 
 #include <cstddef>
 #include <cstdint>

diff --git a/mops/include/mops/opsa.h b/mops/include/mops/opsa.h
@@ -1,5 +1,5 @@
-#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_H
-#define MOPS_OUTER_PRODUCT_SCATTER_ADD_H
+#ifndef MOPS_OPSA_H
+#define MOPS_OPSA_H
 
 #include "mops/exports.h"
 #include "mops/tensor.h"

diff --git a/mops/include/mops/opsa.hpp b/mops/include/mops/opsa.hpp
@@ -1,5 +1,5 @@
-#ifndef MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
-#define MOPS_OUTER_PRODUCT_SCATTER_ADD_HPP
+#ifndef MOPS_OPSA_HPP
+#define MOPS_OPSA_HPP
 
 #include <cstddef>
 #include <cstdint>

diff --git a/mops/include/mops/opsax.h b/mops/include/mops/opsax.h
@@ -0,0 +1,61 @@
+#ifndef MOPS_OPSAX_H
+#define MOPS_OPSAX_H
+
+#include "mops/exports.h"
+#include "mops/tensor.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// CPU version of mops::outer_product_scatter_add_with_weights for 32-bit floats
+int MOPS_EXPORT mops_outer_product_scatter_add_with_weights_f32(
+    mops_tensor_3d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_r,
+    mops_tensor_2d_f32_t tensor_x,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j
+);
+
+
+/// CPU version of mops::outer_product_scatter_add_with_weights for 64-bit floats
+int MOPS_EXPORT mops_outer_product_scatter_add_with_weights_f64(
+    mops_tensor_3d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_r,
+    mops_tensor_2d_f64_t tensor_x,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j
+);
+
+
+/// CUDA version of mops::outer_product_scatter_add_with_weights for 32-bit floats
+int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f32(
+    mops_tensor_3d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_r,
+    mops_tensor_2d_f32_t tensor_x,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j
+);
+
+
+/// CUDA version of mops::outer_product_scatter_add_with_weights for 64-bit floats
+int MOPS_EXPORT mops_cuda_outer_product_scatter_add_with_weights_f64(
+    mops_tensor_3d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_r,
+    mops_tensor_2d_f64_t tensor_x,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j
+);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/mops/include/mops/opsax.hpp b/mops/include/mops/opsax.hpp
@@ -0,0 +1,56 @@
+#ifndef MOPS_OPSAX_HPP
+#define MOPS_OPSAX_HPP
+
+#include <cstddef>
+#include <cstdint>
+
+#include "mops/exports.h"
+#include "mops/tensor.hpp"
+
+namespace mops {
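+    /// CPU version of outer_product_scatter_add_with_weights. TODO: document the operation and the expected tensor shapes.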
+    template<typename scalar_t>
+    void MOPS_EXPORT outer_product_scatter_add_with_weights(
+        Tensor<scalar_t, 3> output,
+        Tensor<scalar_t, 2> tensor_a,
+        Tensor<scalar_t, 2> tensor_r,
+        Tensor<scalar_t, 2> tensor_x,
+        Tensor<int32_t, 1> i,
+        Tensor<int32_t, 1> j
+    );
+
+    // these templates will be precompiled and provided in the mops library
+    extern template void outer_product_scatter_add_with_weights(
+        Tensor<float, 3> output,
+        Tensor<float, 2> tensor_a,
+        Tensor<float, 2> tensor_r,
+        Tensor<float, 2> tensor_x,
+        Tensor<int32_t, 1> i,
+        Tensor<int32_t, 1> j
+    );
+
+    extern template void outer_product_scatter_add_with_weights(
+        Tensor<double, 3> output,
+        Tensor<double, 2> tensor_a,
+        Tensor<double, 2> tensor_r,
+        Tensor<double, 2> tensor_x,
+        Tensor<int32_t, 1> i,
+        Tensor<int32_t, 1> j
+    );
+
+    namespace cuda {
+        /// CUDA version of mops::outer_product_scatter_add_with_weights
+        template<typename scalar_t>
+        void MOPS_EXPORT outer_product_scatter_add_with_weights(
+            Tensor<scalar_t, 3> output,
+            Tensor<scalar_t, 2> tensor_a,
+            Tensor<scalar_t, 2> tensor_r,
+            Tensor<scalar_t, 2> tensor_x,
+            Tensor<int32_t, 1> i,
+            Tensor<int32_t, 1> j
+        );
+    }
+}
+
+
+#endif
diff --git a/mops/include/mops/sap.h b/mops/include/mops/sap.h
@@ -1,5 +1,5 @@
-#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H
-#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_H
+#ifndef MOPS_SAP_H
+#define MOPS_SAP_H
 
 #include "mops/exports.h"
 #include "mops/tensor.h"

diff --git a/mops/include/mops/sap.hpp b/mops/include/mops/sap.hpp
@@ -1,5 +1,5 @@
-#ifndef MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP
-#define MOPS_SPARSE_ACCUMULATION_OF_PRODUCTS_HPP
+#ifndef MOPS_SAP_HPP
+#define MOPS_SAP_HPP
 
 #include <cstddef>
 #include <cstdint>

diff --git a/mops/include/mops/sasax.h b/mops/include/mops/sasax.h
@@ -0,0 +1,77 @@
+#ifndef MOPS_SASAX_H
+#define MOPS_SASAX_H
+
+#include "mops/exports.h"
+#include "mops/tensor.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// CPU version of mops::sparse_accumulation_scatter_add_with_weights for 32-bit floats
+int MOPS_EXPORT mops_sparse_accumulation_scatter_add_with_weights_f32(
+    mops_tensor_3d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_r,
+    mops_tensor_3d_f32_t tensor_x,
+    mops_tensor_1d_f32_t tensor_c,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j,
+    mops_tensor_1d_i32_t tensor_m_1,
+    mops_tensor_1d_i32_t tensor_m_2,
+    mops_tensor_1d_i32_t tensor_m_3
+);
+
+
+/// CPU version of mops::sparse_accumulation_scatter_add_with_weights for 64-bit floats
+int MOPS_EXPORT mops_sparse_accumulation_scatter_add_with_weights_f64(
+    mops_tensor_3d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_r,
+    mops_tensor_3d_f64_t tensor_x,
+    mops_tensor_1d_f64_t tensor_c,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j,
+    mops_tensor_1d_i32_t tensor_m_1,
+    mops_tensor_1d_i32_t tensor_m_2,
+    mops_tensor_1d_i32_t tensor_m_3
+);
+
+
+/// CUDA version of mops::sparse_accumulation_scatter_add_with_weights for 32-bit floats
+int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f32(
+    mops_tensor_3d_f32_t output,
+    mops_tensor_2d_f32_t tensor_a,
+    mops_tensor_2d_f32_t tensor_r,
+    mops_tensor_3d_f32_t tensor_x,
+    mops_tensor_1d_f32_t tensor_c,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j,
+    mops_tensor_1d_i32_t tensor_m_1,
+    mops_tensor_1d_i32_t tensor_m_2,
+    mops_tensor_1d_i32_t tensor_m_3
+);
+
+
+/// CUDA version of mops::sparse_accumulation_scatter_add_with_weights for 64-bit floats
+int MOPS_EXPORT mops_cuda_sparse_accumulation_scatter_add_with_weights_f64(
+    mops_tensor_3d_f64_t output,
+    mops_tensor_2d_f64_t tensor_a,
+    mops_tensor_2d_f64_t tensor_r,
+    mops_tensor_3d_f64_t tensor_x,
+    mops_tensor_1d_f64_t tensor_c,
+    mops_tensor_1d_i32_t tensor_i,
+    mops_tensor_1d_i32_t tensor_j,
+    mops_tensor_1d_i32_t tensor_m_1,
+    mops_tensor_1d_i32_t tensor_m_2,
+    mops_tensor_1d_i32_t tensor_m_3
+);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/mops/include/mops/sasax.hpp b/mops/include/mops/sasax.hpp
@@ -0,0 +1,72 @@
+#ifndef MOPS_SASAX_HPP
+#define MOPS_SASAX_HPP
+
+#include <cstddef>
+#include <cstdint>
+
+#include "mops/exports.h"
+#include "mops/tensor.hpp"
+
+namespace mops {
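+    /// CPU version of sparse_accumulation_scatter_add_with_weights. TODO: document the operation and the expected tensor shapes.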
+    template<typename scalar_t>
+    void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights(
+        Tensor<scalar_t, 3> output,
+        Tensor<scalar_t, 2> tensor_a,
+        Tensor<scalar_t, 2> tensor_r,
+        Tensor<scalar_t, 3> tensor_x,
+        Tensor<scalar_t, 1> tensor_c,
+        Tensor<int, 1> tensor_i,
+        Tensor<int, 1> tensor_j,
+        Tensor<int, 1> tensor_m_1,
+        Tensor<int, 1> tensor_m_2,
+        Tensor<int, 1> tensor_m_3
+    );
+
+    // these templates will be precompiled and provided in the mops library
+    extern template void sparse_accumulation_scatter_add_with_weights(
+        Tensor<float, 3> output,
+        Tensor<float, 2> tensor_a,
+        Tensor<float, 2> tensor_r,
+        Tensor<float, 3> tensor_x,
+        Tensor<float, 1> tensor_c,
+        Tensor<int, 1> tensor_i,
+        Tensor<int, 1> tensor_j,
+        Tensor<int, 1> tensor_m_1,
+        Tensor<int, 1> tensor_m_2,
+        Tensor<int, 1> tensor_m_3
+    );
+
+    extern template void sparse_accumulation_scatter_add_with_weights(
+        Tensor<double, 3> output,
+        Tensor<double, 2> tensor_a,
+        Tensor<double, 2> tensor_r,
+        Tensor<double, 3> tensor_x,
+        Tensor<double, 1> tensor_c,
+        Tensor<int, 1> tensor_i,
+        Tensor<int, 1> tensor_j,
+        Tensor<int, 1> tensor_m_1,
+        Tensor<int, 1> tensor_m_2,
+        Tensor<int, 1> tensor_m_3
+    );
+
+    namespace cuda {
+        /// CUDA version of mops::sparse_accumulation_scatter_add_with_weights
+        template<typename scalar_t>
+        void MOPS_EXPORT sparse_accumulation_scatter_add_with_weights(
+            Tensor<scalar_t, 3> output,
+            Tensor<scalar_t, 2> tensor_a,
+            Tensor<scalar_t, 2> tensor_r,
+            Tensor<scalar_t, 3> tensor_x,
+            Tensor<scalar_t, 1> tensor_c,
+            Tensor<int, 1> tensor_i,
+            Tensor<int, 1> tensor_j,
+            Tensor<int, 1> tensor_m_1,
+            Tensor<int, 1> tensor_m_2,
+            Tensor<int, 1> tensor_m_3
+        );
+    }
+}
+
+
+#endif
diff --git a/mops/include/mops/tensor.h b/mops/include/mops/tensor.h
@@ -8,6 +8,16 @@
 extern "C" {
 #endif
 
+struct mops_tensor_3d_f32_t {
+    float* __restrict__ data;
+    int64_t shape[3];
+};
+
+struct mops_tensor_3d_f64_t {
+    double* __restrict__ data;
+    int64_t shape[3];
+};
+
 struct mops_tensor_2d_f32_t {
     float* __restrict__ data;
     int64_t shape[2];