CUDA implementation for opsa #30

Merged — 67 commits, Mar 25, 2024

Commits
c46ed84
added initial opsa kernel.
nickjbrowning Dec 12, 2023
5c74f1a
comment
nickjbrowning Dec 12, 2023
21d436c
loop reordering.
nickjbrowning Dec 12, 2023
c4caa66
spelling.
nickjbrowning Dec 12, 2023
7371fa9
index mistakes.
nickjbrowning Dec 12, 2023
d597e94
abstracted common factors
nickjbrowning Dec 12, 2023
4bbf948
removed uint
nickjbrowning Dec 12, 2023
4a2f68c
OPSA evaluation
nickjbrowning Feb 15, 2024
d9eaf5c
renaming
nickjbrowning Feb 15, 2024
61d3210
added first_occurences kernel in cuda_utils.
nickjbrowning Feb 16, 2024
5843fd8
formatting.
nickjbrowning Feb 16, 2024
aecaa94
added cudaMalloc.
nickjbrowning Feb 16, 2024
0183d6d
formatting.
nickjbrowning Feb 16, 2024
90202d4
docs.
nickjbrowning Feb 16, 2024
e3467bf
opsa_jvp skeleton.
nickjbrowning Feb 19, 2024
c59e0bd
templates and C exports.
nickjbrowning Feb 19, 2024
059b370
impl
nickjbrowning Feb 19, 2024
55a5711
added opsa vjp impl
nickjbrowning Feb 21, 2024
b30cd8d
vjp impl update
nickjbrowning Feb 21, 2024
cb9c606
starting torch integration
nickjbrowning Feb 21, 2024
4b4b4a7
opsa torch impl
nickjbrowning Feb 23, 2024
03845ef
fix to build process.
nickjbrowning Feb 23, 2024
e10264d
capi implementation for CUDA
nickjbrowning Feb 23, 2024
b365dc8
added foward declarations.
nickjbrowning Feb 23, 2024
55abb8a
added forward decs
nickjbrowning Feb 23, 2024
c5b4cf9
issue still persists...
nickjbrowning Feb 23, 2024
ca08740
restructuring.
nickjbrowning Feb 26, 2024
8aa086b
remove unecessary statement.
nickjbrowning Feb 26, 2024
3322674
fixed ordering in opsa
nickjbrowning Feb 26, 2024
f48f9bb
removed file.
nickjbrowning Feb 26, 2024
1b87b8b
torch integration + first occ caching
nickjbrowning Feb 27, 2024
bcb9e2c
fixed opsa indexing.
nickjbrowning Feb 27, 2024
dc39676
backwards still not correct...
nickjbrowning Feb 28, 2024
de92b27
fixed backwards issue.
nickjbrowning Feb 28, 2024
1181d81
restructuring
nickjbrowning Feb 29, 2024
02294e1
formatting.
nickjbrowning Feb 29, 2024
e34c9b0
removed unecessary header.
nickjbrowning Feb 29, 2024
ea6ba08
docs
nickjbrowning Feb 29, 2024
047875d
added forward decs.
nickjbrowning Feb 29, 2024
80e31fe
Format
frostedoyster Feb 29, 2024
90755eb
minor changes.
nickjbrowning Feb 29, 2024
0fa343c
Merge branch 'master' into cuda
nickjbrowning Feb 29, 2024
d2ef56a
header fixes.
nickjbrowning Feb 29, 2024
eaf9649
torch fixes
nickjbrowning Feb 29, 2024
2ad19a9
unecessary define.
nickjbrowning Feb 29, 2024
c8dcd6c
fixing CPU build with CUDA disabled.
nickjbrowning Feb 29, 2024
0614b21
added cuda check
nickjbrowning Feb 29, 2024
27d50c4
added conditional computation of grads.
nickjbrowning Feb 29, 2024
7c7f678
added size checks
nickjbrowning Feb 29, 2024
7e460af
linting fixes
nickjbrowning Feb 29, 2024
e560e41
comments
nickjbrowning Feb 29, 2024
bdb30c2
whitespace
nickjbrowning Feb 29, 2024
732364a
format
nickjbrowning Feb 29, 2024
2dc6022
Add benchmark option to run on CUDA
frostedoyster Feb 29, 2024
355f7f5
Merge branch 'master' into cuda
frostedoyster Feb 29, 2024
9c6ac23
CUDA example
frostedoyster Mar 2, 2024
29b71bc
fixes to make guillaume a happy chappy
nickjbrowning Mar 18, 2024
2e3df39
changed comments and forward mode
nickjbrowning Mar 18, 2024
0d06455
simpliefied call structure for kernels with mops::Tensors
nickjbrowning Mar 18, 2024
c3ca9e5
formatting + comments
nickjbrowning Mar 18, 2024
f02ea80
removed first_occurences from public API
nickjbrowning Mar 18, 2024
5ff929e
removed edge reference
nickjbrowning Mar 18, 2024
0e289c2
comment
nickjbrowning Mar 18, 2024
a4957ae
Update mops/include/mops/cuda_first_occurences.hpp
nickjbrowning Mar 25, 2024
74ec8c4
Move internal headers to `src/internal/`
Luthaf Mar 25, 2024
8f1a569
Fix some compiler warnings
Luthaf Mar 25, 2024
c5f70cb
Format
frostedoyster Mar 25, 2024
12 changes: 11 additions & 1 deletion mops-torch/include/mops/torch/opsa.hpp
@@ -7,7 +7,17 @@

namespace mops_torch {

/// TODO
/*
* Outer-Product-Scatter-Add (OPSA)
* Computes the outer product between tensors A, B along the last dimension, and sums the result
* into a new tensor of shape [output_size, A.shape[1], B.shape[1]], where the summation index
* is given by the tensor indices_output.
*
 * For example, if A has shape (5, 32), B has shape (5, 16), and indices_output contains
 * [0, 0, 1, 1, 2], the output will have shape (3, 32, 16). Using numpy notation, the
 * value of output[0] in this case would be
* output[0, :, :] = A[0, :, None] * B[0, None, :] + A[1, :, None] * B[1, None, :]
*/
torch::Tensor outer_product_scatter_add(
torch::Tensor A, torch::Tensor B, torch::Tensor indices_output, int64_t output_size
);
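
The comment above gives the semantics in numpy notation; spelled out as a plain loop, the operation is just a scatter of per-row outer products. Below is a minimal reference sketch of those semantics — illustrative function and variable names, row-major buffers, and zero-initialized output are all assumptions, not repository code:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// output: [output_size, M, N], A: [E, M], B: [E, N], indices_output: [E];
// all buffers row-major, output assumed zero-initialized.
void opsa_reference(
    std::vector<double>& output, const std::vector<double>& A,
    const std::vector<double>& B, const std::vector<int32_t>& indices_output,
    std::size_t M, std::size_t N
) {
    for (std::size_t e = 0; e < indices_output.size(); e++) {
        auto o = static_cast<std::size_t>(indices_output[e]);
        for (std::size_t i = 0; i < M; i++) {
            for (std::size_t j = 0; j < N; j++) {
                // output[o, i, j] += A[e, i] * B[e, j]
                output[(o * M + i) * N + j] += A[e * M + i] * B[e * N + j];
            }
        }
    }
}
```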
47 changes: 47 additions & 0 deletions mops-torch/src/opsa.cpp
@@ -44,6 +44,25 @@ torch::Tensor OuterProductScatterAdd::forward(
details::torch_to_mops_1d<int32_t>(indices_output)
);
});
} else if (A.device().is_cuda()) {
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
output = torch::empty(
{output_size, A.size(1), B.size(1)},
torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add", [&]() {
mops::cuda::outer_product_scatter_add<scalar_t>(
details::torch_to_mops_3d<scalar_t>(output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output)
);
});

#endif
} else {
C10_THROW_ERROR(
ValueError, "outer_product_scatter_add is not implemented for device " + A.device().str()
@@ -93,6 +112,34 @@ std::vector<torch::Tensor> OuterProductScatterAdd::backward(
details::torch_to_mops_1d<int32_t>(indices_output)
);
});
} else if (A.device().is_cuda()) {
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};

if (A.requires_grad()) {
grad_A = torch::empty_like(A);
mops_grad_A = details::torch_to_mops_2d<scalar_t>(grad_A);
}

auto mops_grad_B = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (B.requires_grad()) {
grad_B = torch::empty_like(B);
mops_grad_B = details::torch_to_mops_2d<scalar_t>(grad_B);
}

mops::cuda::outer_product_scatter_add_vjp<scalar_t>(
mops_grad_A,
mops_grad_B,
details::torch_to_mops_3d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output)
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError, "outer_product_scatter_add is not implemented for device " + A.device().str()
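Taken together, the forward and backward branches above mean the CUDA path is selected purely by the device of the input tensors. A hypothetical end-to-end use of the binding might look like the sketch below; the function name and the int32 index dtype come from the code above, while the shapes, the sorted index layout (suggested by the first-occurrences caching this PR adds), and everything else are illustrative assumptions:

```cpp
// Sketch only: exercises the CUDA forward and vjp paths of the torch binding.
#include <torch/torch.h>
#include "mops/torch/opsa.hpp"

int main() {
    auto f64_cuda = torch::TensorOptions().device(torch::kCUDA).dtype(torch::kFloat64);
    auto i32_cuda = torch::TensorOptions().device(torch::kCUDA).dtype(torch::kInt32);

    auto A = torch::rand({100, 20}, f64_cuda).requires_grad_(true);
    auto B = torch::rand({100, 5}, f64_cuda).requires_grad_(true);

    // sorted scatter indices: rows 0-9 go to output 0, rows 10-19 to output 1, ...
    auto indices_output = torch::repeat_interleave(torch::arange(10, i32_cuda), 10);

    // output has shape [10, 20, 5]
    auto output = mops_torch::outer_product_scatter_add(A, B, indices_output, 10);

    // runs the vjp; grad buffers are only allocated for tensors that require grad
    output.sum().backward();
    return 0;
}
```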
24 changes: 18 additions & 6 deletions mops/CMakeLists.txt
@@ -82,13 +82,14 @@ add_library(mops
"src/opsaw/capi.cpp"
"src/sasaw/sasaw.cpp"
"src/sasaw/capi.cpp"
"src/utils.cpp"

"src/internal/checks.hpp"
"src/internal/utils.hpp"
"src/internal/utils.cpp"

"include/mops.hpp"
"include/mops.h"
"include/mops/capi.hpp"
"include/mops/checks.hpp"
"include/mops/utils.hpp"
"include/mops/opsa.hpp"
"include/mops/opsa.h"
"include/mops/sap.hpp"
@@ -99,15 +100,19 @@ add_library(mops
"include/mops/opsaw.h"
"include/mops/sasaw.hpp"
"include/mops/sasaw.h"
"include/mops/checks.hpp"
"include/mops/utils.hpp"
)

if(CMAKE_CUDA_COMPILER)
target_compile_definitions(mops PUBLIC MOPS_CUDA_ENABLED)
target_sources(mops
PRIVATE
src/opsa/opsa.cu
"src/internal/cuda_utils.cuh"
"src/internal/cuda_utils.cu"

"src/internal/cuda_first_occurences.cuh"
"src/internal/cuda_first_occurences.cu"

"src/opsa/opsa.cu"
)
endif()

@@ -119,6 +124,11 @@ set_target_properties(mops PROPERTIES
CXX_VISIBILITY_PRESET hidden
)

if (CMAKE_CUDA_COMPILER)
set_target_properties(mops PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(mops PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
endif()
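
A note on the two properties above: CUDA_SEPARABLE_COMPILATION allows device code to be spread across translation units (the opsa kernels and the internal cuda_utils / first-occurrences code live in separate .cu files), and CUDA_RESOLVE_DEVICE_SYMBOLS performs the device-link step when building the mops library itself, so purely C++ consumers can link it without a CUDA-aware link step. A downstream build might then look like this sketch (project, target, and path names are illustrative):

```cmake
# Illustrative consumer build: enabling the CUDA language is what sets
# CMAKE_CUDA_COMPILER, which in turn switches on the MOPS_CUDA_ENABLED paths above.
cmake_minimum_required(VERSION 3.16)
project(my_project LANGUAGES CXX CUDA)

add_subdirectory(mops)  # assumes mops is vendored in a 'mops/' subdirectory

add_executable(my_app main.cpp)
target_link_libraries(my_app mops)  # MOPS_CUDA_ENABLED propagates (PUBLIC definition)
```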

include(GenerateExportHeader)
generate_export_header(mops
BASE_NAME MOPS
@@ -132,6 +142,8 @@ target_include_directories(mops PUBLIC
$<INSTALL_INTERFACE:include>
)

target_include_directories(mops PRIVATE src)


# Handle optimization and OpenMP flags
include(CheckCXXCompilerFlag)
6 changes: 6 additions & 0 deletions mops/examples/CMakeLists.txt
@@ -3,3 +3,9 @@ cmake_minimum_required(VERSION 3.16)
add_executable(example_cpp example.cpp)
target_link_libraries(example_cpp mops)
add_test(NAME example_cpp COMMAND ./example_cpp)

if (CMAKE_CUDA_COMPILER)
add_executable(example_cuda example.cu)
target_link_libraries(example_cuda mops)
add_test(NAME example_cuda COMMAND ./example_cuda)
endif()
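
With these targets registered via add_test, the CUDA example runs through CTest like any other test — for instance, from the build directory (illustrative invocation): `ctest -R example_cuda`.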
75 changes: 75 additions & 0 deletions mops/examples/example.cu
@@ -0,0 +1,75 @@
#include "mops.hpp"
#include <cmath>
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

/*host macro that checks for errors in CUDA calls, and prints the file + line
* and error string if one occurs
*/
#define CUDA_CHECK(call) \
do { \
cudaError_t cudaStatus = (call); \
if (cudaStatus != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \
<< cudaGetErrorString(cudaStatus) << std::endl; \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} while (0)

int main() {
// To avoid calls with a very large number of arguments,
// mops uses a mops::Tensor<T, N_DIMS> struct which simply
// consists of a data pointer and a shape in the form of a std::array.
//
// All mops operations take mops::Tensor objects as their
// inputs, and these can be initialized in the following way:

auto A = std::vector<double>(100 * 20);
auto B = std::vector<double>(100 * 5);
auto indices_output = std::vector<int32_t>(100);
auto output = std::vector<double>(10 * 20 * 5);

double *A_cuda;
double *B_cuda;
int32_t *indices_output_cuda;
double *output_cuda;

CUDA_CHECK(cudaMalloc(&A_cuda, A.size() * sizeof(double)));
CUDA_CHECK(cudaMalloc(&B_cuda, B.size() * sizeof(double)));
CUDA_CHECK(cudaMalloc(&indices_output_cuda, indices_output.size() * sizeof(int32_t)));
CUDA_CHECK(cudaMalloc(&output_cuda, output.size() * sizeof(double)));

CUDA_CHECK(cudaMemcpy(A_cuda, A.data(), A.size() * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(B_cuda, B.data(), B.size() * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(
indices_output_cuda,
indices_output.data(),
indices_output.size() * sizeof(int32_t),
cudaMemcpyHostToDevice
));
CUDA_CHECK(cudaMemcpy(
output_cuda, output.data(), output.size() * sizeof(double), cudaMemcpyHostToDevice
));

mops::cuda::outer_product_scatter_add<double>(
{output_cuda, {100, 20, 5}},
{A_cuda, {100, 20}},
{B_cuda, {100, 5}},
{indices_output_cuda, {100}}
);

CUDA_CHECK(cudaMemcpy(
output.data(), output_cuda, output.size() * sizeof(double), cudaMemcpyDeviceToHost
));

CUDA_CHECK(cudaFree(A_cuda));
CUDA_CHECK(cudaFree(B_cuda));
CUDA_CHECK(cudaFree(indices_output_cuda));
CUDA_CHECK(cudaFree(output_cuda));

return 0;
}
1 change: 0 additions & 1 deletion mops/include/mops.hpp
@@ -9,6 +9,5 @@
#include "mops/opsaw.hpp" // IWYU pragma: export
#include "mops/sap.hpp" // IWYU pragma: export
#include "mops/sasaw.hpp" // IWYU pragma: export
#include "mops/utils.hpp" // IWYU pragma: export

#endif
20 changes: 20 additions & 0 deletions mops/include/mops/opsa.h
@@ -60,6 +60,26 @@ int MOPS_EXPORT mops_cuda_outer_product_scatter_add_f64(
mops_tensor_1d_i32_t indices_output
);

/// CUDA version of mops::outer_product_scatter_add_vjp for 32-bit floats
int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_f32(
mops_tensor_2d_f32_t grad_A,
mops_tensor_2d_f32_t grad_B,
mops_tensor_3d_f32_t grad_output,
mops_tensor_2d_f32_t A,
mops_tensor_2d_f32_t B,
mops_tensor_1d_i32_t indices_output
);

/// CUDA version of mops::outer_product_scatter_add_vjp for 64-bit floats
int MOPS_EXPORT mops_cuda_outer_product_scatter_add_vjp_f64(
mops_tensor_2d_f64_t grad_A,
mops_tensor_2d_f64_t grad_B,
mops_tensor_3d_f64_t grad_output,
mops_tensor_2d_f64_t A,
mops_tensor_2d_f64_t B,
mops_tensor_1d_i32_t indices_output
);

#ifdef __cplusplus
}
#endif
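
For completeness, a hypothetical call through this C API might look as follows. The vjp entry point is used because its full signature appears in this diff; the .data/.shape field layout of the mops_tensor_* structs is an assumption modeled on the C++ mops::Tensor (a raw pointer plus a shape array) — check the mops tensor header for the real definition:

```c
/* Hypothetical C-API usage sketch. Struct field layout and the 0-on-success
 * return convention are assumptions; all pointers are assumed to be device
 * pointers, and the feature sizes (20, 5) are illustrative. */
#include <stdint.h>
#include "mops.h"

int opsa_vjp_f64(
    double* grad_A, double* grad_B, double* grad_output,
    double* A, double* B, int32_t* indices, int64_t n_edges, int64_t n_out
) {
    mops_tensor_2d_f64_t grad_A_t = {grad_A, {n_edges, 20}};
    mops_tensor_2d_f64_t grad_B_t = {grad_B, {n_edges, 5}};
    mops_tensor_3d_f64_t grad_out_t = {grad_output, {n_out, 20, 5}};
    mops_tensor_2d_f64_t A_t = {A, {n_edges, 20}};
    mops_tensor_2d_f64_t B_t = {B, {n_edges, 5}};
    mops_tensor_1d_i32_t idx_t = {indices, {n_edges}};

    /* assumed convention: returns 0 on success, non-zero on error */
    return mops_cuda_outer_product_scatter_add_vjp_f64(
        grad_A_t, grad_B_t, grad_out_t, A_t, B_t, idx_t
    );
}
```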
51 changes: 49 additions & 2 deletions mops/include/mops/opsa.hpp
@@ -5,10 +5,19 @@

#include "mops/exports.h"
#include "mops/tensor.hpp"
#include "mops/utils.hpp"

namespace mops {
/// TODO
/*
* Outer-Product-Scatter-Add (OPSA)
* Computes the outer product between tensors A, B along the last dimension, and sums the result
* into a new tensor of shape [output_size, A.shape[1], B.shape[1]], where the summation index
* is given by the tensor indices_output.
*
 * For example, if A has shape (5, 32), B has shape (5, 16), and indices_output contains
* [0, 0, 1, 1, 2], the output will have shape (3, 32, 16). For clarity, the
* value of output[0] in this case would be equal to
* output[0, :, :] = A[0, :, None] * B[0, None, :] + A[1, :, None] * B[1, None, :]
*/
template <typename scalar_t>
void MOPS_EXPORT outer_product_scatter_add(
Tensor<scalar_t, 3> output,
@@ -73,6 +82,44 @@ void MOPS_EXPORT outer_product_scatter_add(
Tensor<scalar_t, 2> B,
Tensor<int32_t, 1> indices_output
);

extern template void outer_product_scatter_add(
Tensor<float, 3> output, Tensor<float, 2> A, Tensor<float, 2> B, Tensor<int32_t, 1> indices_output
);

extern template void outer_product_scatter_add(
Tensor<double, 3> output, Tensor<double, 2> A, Tensor<double, 2> B, Tensor<int32_t, 1> indices_output
);

template <typename scalar_t>
void MOPS_EXPORT outer_product_scatter_add_vjp(
Tensor<scalar_t, 2> grad_A,
Tensor<scalar_t, 2> grad_B,
Tensor<scalar_t, 3> grad_output,
Tensor<scalar_t, 2> A,
Tensor<scalar_t, 2> B,
Tensor<int32_t, 1> indices_output
);

// these templates will be precompiled and provided in the mops library
extern template void outer_product_scatter_add_vjp(
Tensor<float, 2> grad_A,
Tensor<float, 2> grad_B,
Tensor<float, 3> grad_output,
Tensor<float, 2> A,
Tensor<float, 2> B,
Tensor<int32_t, 1> indices_output
);

extern template void outer_product_scatter_add_vjp(
Tensor<double, 2> grad_A,
Tensor<double, 2> grad_B,
Tensor<double, 3> grad_output,
Tensor<double, 2> A,
Tensor<double, 2> B,
Tensor<int32_t, 1> indices_output
);

} // namespace cuda
} // namespace mops
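
The vjp declarations above leave their semantics implicit; they follow from the forward definition by standard reverse-mode differentiation. As a derivation sketch, writing \(\overline{O}\) for grad_output and \(\mathrm{idx}\) for indices_output:

```latex
% forward: each edge e accumulates an outer product into output row idx(e)
O_{oij} = \sum_{e \,:\, \mathrm{idx}(e) = o} A_{ei}\, B_{ej}
% reverse mode: each edge reads back from the same output row
\overline{A}_{ei} = \sum_{j} \overline{O}_{\mathrm{idx}(e)\,ij}\, B_{ej}
\qquad
\overline{B}_{ej} = \sum_{i} \overline{O}_{\mathrm{idx}(e)\,ij}\, A_{ei}
```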

1 change: 0 additions & 1 deletion mops/include/mops/opsaw.hpp
@@ -5,7 +5,6 @@

#include "mops/exports.h"
#include "mops/tensor.hpp"
#include "mops/utils.hpp"

namespace mops {
/// TODO
1 change: 0 additions & 1 deletion mops/include/mops/sasaw.hpp
@@ -5,7 +5,6 @@

#include "mops/exports.h"
#include "mops/tensor.hpp"
#include "mops/utils.hpp"

namespace mops {
/// TODO
9 changes: 4 additions & 5 deletions mops/src/hpe/cpu.tpp
@@ -1,13 +1,12 @@
#include <cassert>
#include <algorithm>
#include <stdexcept>
#include <string>
#include <vector>
#include <array>

#include "mops/hpe.hpp"
#include "mops/checks.hpp"
#include "mops/utils.hpp"

#include "internal/checks.hpp"
#include "internal/utils.hpp"


template<typename scalar_t, uint8_t polynomial_order>
@@ -147,7 +146,7 @@ void _homogeneous_polynomial_evaluation_vjp_templated_polynomial_order(
if (grad_A.data != nullptr) {

check_same_shape(grad_A, "grad_A", A, "A", "hpe_vjp");

scalar_t* grad_o_ptr = grad_output.data;
scalar_t* a_ptr = A.data;
scalar_t* c_ptr = C.data;