CUDA OPSA VJP VJP #61

Open
wants to merge 42 commits into base: main

Commits (42)
72ce285
Add CI config files
Apr 29, 2024
dd61fe5
x
nickjbrowning Apr 30, 2024
d463d21
added stream management to OPSA
nickjbrowning May 2, 2024
d81ec9c
x
nickjbrowning May 2, 2024
c8d8776
added support for other ops.
nickjbrowning May 2, 2024
0b06e79
formatting.
nickjbrowning May 2, 2024
de5a3da
Merge branch 'main' into multigpu_and_streams
nickjbrowning May 2, 2024
6a5c2de
Merge branch 'multigpu_and_streams' into cuda_vjpvjp_opsa
nickjbrowning May 2, 2024
9bfd8dd
WIP
nickjbrowning May 2, 2024
23d7784
preliminary OPSA VJP VJP implementation.
nickjbrowning May 2, 2024
d57411c
threadsync
nickjbrowning May 2, 2024
b2c170a
fixed build issues.
nickjbrowning May 3, 2024
ffaaabd
Merge branch 'multigpu_and_streams' into cuda_vjpvjp_opsa
nickjbrowning May 3, 2024
1c7424b
fixed issue with opsaw instantiation
nickjbrowning May 3, 2024
f78161c
fixed issue with opsaw instantiation
nickjbrowning May 3, 2024
e89350b
formatting.
nickjbrowning May 3, 2024
603b21c
fixed OSPA VJPVJP with conditional gradgrads
nickjbrowning May 3, 2024
ea54e9a
add comments on the test job script
May 3, 2024
1bb35aa
insignificant commit for retriggering pipeline
May 3, 2024
497d811
Merge branch 'main' into cscs_ci
frostedoyster May 4, 2024
b9be358
Install tox
frostedoyster May 4, 2024
d9fc283
attempt addressing CI compiler errors and warnings
May 6, 2024
5b04a18
Merge branch 'cscs_ci' of github.com:lab-cosmo/mops into cscs_ci
May 6, 2024
af2ec60
remove polynomial order zero case to avoid divide by zero issue
May 6, 2024
9fee2a1
correct address of the atomic add operations in sap
May 6, 2024
fc80766
specify addresses with indices for atomic add operations to address C…
May 6, 2024
2ed5771
dummy implementation for polynomial order zero
May 7, 2024
62b73d8
removed bug in CUDA HPE.
nickjbrowning May 10, 2024
b6d4795
header change
nickjbrowning May 10, 2024
68223e2
HPE divide by zero fix.
nickjbrowning May 10, 2024
3b4527f
added in code for pre-sm60 atomicAdd(doubles)
nickjbrowning May 10, 2024
e3c70c2
macro to switch out the atomicAdds depending on ARCH
nickjbrowning May 10, 2024
ade1cd3
documentation.
nickjbrowning May 10, 2024
263984d
changed macro to device function.
nickjbrowning May 10, 2024
e654289
Merge branch 'cscs_ci' into multigpu_and_streams
nickjbrowning May 10, 2024
899ea94
fixed sap cstream
nickjbrowning May 10, 2024
99cedf5
formatting.
nickjbrowning May 10, 2024
21b6235
missing device guard in SAP
nickjbrowning May 10, 2024
dd29ac0
missing stream.
nickjbrowning May 10, 2024
b033a29
Merge branch 'multigpu_and_streams' into cuda_vjpvjp_opsa
nickjbrowning May 10, 2024
aebcbe8
modified atomicAdd
nickjbrowning May 10, 2024
305e0cd
added __device__ to ATOMIC_ADD template forward declarations
nickjbrowning May 10, 2024
6 changes: 6 additions & 0 deletions ci/docker/Dockerfile.base
@@ -0,0 +1,6 @@
FROM nvcr.io/nvidia/pytorch:23.10-py3

RUN apt-get update

# install boost test framework
RUN apt-get install -y libboost-test-dev
36 changes: 36 additions & 0 deletions ci/pipeline.yml
@@ -0,0 +1,36 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- build
- test

build_base_image_job:
stage: build
extends: .container-builder-dynamic-name
timeout: 2h
variables:
DOCKERFILE: ci/docker/Dockerfile.base
WATCH_FILECHANGES: $DOCKERFILE
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/mops

test_job:
stage: test
extends: .container-runner-daint-gpu
image: $BASE_IMAGE
timeout: 2h
script:
- export CUDA_HOME="/usr/local/cuda"
- python3 -m pip install --upgrade pip
- echo "Install Tox"
- python3 -m pip install tox
- echo "Run the Tox Script"
- tox
- echo "Tox script completed"

variables:
SLURM_JOB_NUM_NODES: 1
SLURM_PARTITION: normal
SLURM_NTASKS: 1
SLURM_TIMELIMIT: '00:40:00'
GIT_STRATEGY: fetch
27 changes: 25 additions & 2 deletions mops-torch/src/hpe.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/hpe.hpp"
#include "mops/torch/utils.hpp"

@@ -38,15 +43,25 @@ torch::Tensor HomogeneousPolynomialEvaluation::forward(
});
} else if (A.device().is_cuda()) {

#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation", [&]() {
mops::cuda::homogeneous_polynomial_evaluation<scalar_t>(
details::torch_to_mops_1d<scalar_t>(output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_2d<int32_t>(indices_A)
details::torch_to_mops_2d<int32_t>(indices_A),
stream
);
});

#endif

} else {
C10_THROW_ERROR(
ValueError,
@@ -108,6 +123,12 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward(
);
});
} else if (A.device().is_cuda()) {
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
@@ -121,9 +142,11 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward(
details::torch_to_mops_1d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_2d<int32_t>(indices_A)
details::torch_to_mops_2d<int32_t>(indices_A),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError,
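Every CUDA branch added in this file follows the same pattern: guard the device of the input tensor, query the current PyTorch CUDA stream, and hand it to the mops::cuda launcher as an opaque void*. One practical consequence is that the kernels run on whatever stream the caller has made current. The snippet below is a hypothetical caller-side sketch (not part of the PR) showing how a side stream from PyTorch's pool would be picked up by that getCurrentCUDAStream() call; the actual operator invocation is left as a placeholder.

#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/torch.h>

void run_on_side_stream(const torch::Tensor& A) {
    // Grab a stream from PyTorch's pool on the tensor's device.
    c10::cuda::CUDAStream side_stream =
        c10::cuda::getStreamFromPool(/*isHighPriority=*/false, A.device().index());

    {
        // While this guard is alive, getCurrentCUDAStream() inside the mops
        // wrappers returns side_stream, so the kernels launch on it.
        c10::cuda::CUDAStreamGuard guard(side_stream);
        // ... call the mops-torch operator on A here ...
    }

    // Make sure the asynchronous work is finished before using the result.
    side_stream.synchronize();
}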
68 changes: 63 additions & 5 deletions mops-torch/src/opsa.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/opsa.hpp"
#include "mops/torch/utils.hpp"

@@ -48,6 +53,10 @@ torch::Tensor OuterProductScatterAdd::forward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

output = torch::empty(
{output_size, A.size(1), B.size(1)},
torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
@@ -58,7 +67,8 @@ torch::Tensor OuterProductScatterAdd::forward(
details::torch_to_mops_3d<scalar_t>(output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});

@@ -130,6 +140,10 @@ std::vector<torch::Tensor> OuterProductScatterAddBackward::forward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};

@@ -150,7 +164,8 @@ std::vector<torch::Tensor> OuterProductScatterAddBackward::forward(
details::torch_to_mops_3d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
@@ -228,9 +243,52 @@ std::vector<torch::Tensor> OuterProductScatterAddBackward::backward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
C10_THROW_ERROR(
ValueError, "outer_product_scatter_add_vjp_vjp is not implemented for CUDA yet"
);
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() {
auto mops_grad_grad_output = mops::Tensor<scalar_t, 3>{nullptr, {0, 0, 0}};
if (grad_output.requires_grad()) {
grad_grad_output = torch::empty_like(grad_output);
mops_grad_grad_output = details::torch_to_mops_3d<scalar_t>(grad_grad_output);
}

auto mops_grad_A_2 = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (A.requires_grad()) {
grad_A_2 = torch::empty_like(A);
mops_grad_A_2 = details::torch_to_mops_2d<scalar_t>(grad_A_2);
}

auto mops_grad_B_2 = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (B.requires_grad()) {
grad_B_2 = torch::empty_like(B);
mops_grad_B_2 = details::torch_to_mops_2d<scalar_t>(grad_B_2);
}

auto mops_grad_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (grad_grad_A.defined()) {
mops_grad_grad_A = details::torch_to_mops_2d<scalar_t>(grad_grad_A);
}

auto mops_grad_grad_B = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (grad_grad_B.defined()) {
mops_grad_grad_B = details::torch_to_mops_2d<scalar_t>(grad_grad_B);
}

mops::cuda::outer_product_scatter_add_vjp_vjp<scalar_t>(
mops_grad_grad_output,
mops_grad_A_2,
mops_grad_B_2,
mops_grad_grad_A,
mops_grad_grad_B,
details::torch_to_mops_3d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
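This hunk replaces the previous "not implemented for CUDA yet" error with an actual dispatch to mops::cuda::outer_product_scatter_add_vjp_vjp, allocating each output only when the corresponding input requires a gradient and passing empty mops::Tensor handles otherwise. For reference, the sketch below is a plain CPU illustration (not the PR's kernel) of what that double backward computes, derived from the OPSA definition output[indices_output[e], i, j] += A[e, i] * B[e, j]; the flat row-major indexing and argument layout are assumptions made for the sketch.

#include <cstddef>
#include <cstdint>
#include <vector>

// grad_grad_A / grad_grad_B are the incoming gradients with respect to the
// outputs of the first backward (grad_A, grad_B); the three outputs are the
// gradients with respect to grad_output, A and B. All buffers are row-major
// and the output buffers are assumed to be zero-initialized.
void opsa_vjp_vjp_reference(
    std::vector<double>& grad_grad_output,      // (P, N, M)
    std::vector<double>& grad_A_2,              // (E, N)
    std::vector<double>& grad_B_2,              // (E, M)
    const std::vector<double>& grad_grad_A,     // (E, N)
    const std::vector<double>& grad_grad_B,     // (E, M)
    const std::vector<double>& grad_output,     // (P, N, M)
    const std::vector<double>& A,               // (E, N)
    const std::vector<double>& B,               // (E, M)
    const std::vector<int32_t>& indices_output, // (E)
    std::size_t N, std::size_t M
) {
    const std::size_t E = indices_output.size();
    for (std::size_t e = 0; e < E; ++e) {
        const std::size_t p = static_cast<std::size_t>(indices_output[e]);
        for (std::size_t i = 0; i < N; ++i) {
            for (std::size_t j = 0; j < M; ++j) {
                const double go = grad_output[(p * N + i) * M + j];
                // First backward:
                //   grad_A[e,i] = sum_j grad_output[p,i,j] * B[e,j]
                //   grad_B[e,j] = sum_i grad_output[p,i,j] * A[e,i]
                // Differentiating those w.r.t. grad_output, A and B gives:
                grad_grad_output[(p * N + i) * M + j] +=
                    grad_grad_A[e * N + i] * B[e * M + j] +
                    grad_grad_B[e * M + j] * A[e * N + i];
                grad_A_2[e * N + i] += grad_grad_B[e * M + j] * go;
                grad_B_2[e * M + j] += grad_grad_A[e * N + i] * go;
            }
        }
    }
}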
5 changes: 5 additions & 0 deletions mops-torch/src/opsaw.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/opsaw.hpp"
#include "mops/torch/utils.hpp"

36 changes: 33 additions & 3 deletions mops-torch/src/sap.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/sap.hpp"
#include "mops/torch/utils.hpp"

@@ -59,6 +64,14 @@ torch::Tensor SparseAccumulationOfProducts::forward(
);
});
} else if (A.device().is_cuda()) {

#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

output = torch::empty(
{A.size(0), output_size},
torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
@@ -72,9 +85,11 @@ torch::Tensor SparseAccumulationOfProducts::forward(
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_1d<int32_t>(indices_A),
details::torch_to_mops_1d<int32_t>(indices_B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError,
@@ -170,6 +185,14 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::forward(
);
});
} else if (A.device().is_cuda()) {

#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (A.requires_grad()) {
@@ -192,9 +215,11 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::forward(
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_1d<int32_t>(indices_A),
details::torch_to_mops_1d<int32_t>(indices_B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError,
@@ -276,6 +301,10 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::backward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp_vjp", [&]() {
auto mops_grad_grad_output = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (grad_output.requires_grad()) {
@@ -317,7 +346,8 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::backward(
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_1d<int32_t>(indices_A),
details::torch_to_mops_1d<int32_t>(indices_B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
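The sparse_accumulation_of_products wrappers receive the same device-guard and stream plumbing as the operators above. As a reminder of what these kernels compute, the sketch below is a plain CPU illustration of the SAP forward pass, assuming the documented definition output[e, indices_output[k]] += C[k] * A[e, indices_A[k]] * B[e, indices_B[k]]; the shapes and flat indexing are assumptions made for the sketch, not the PR's code.

#include <cstddef>
#include <cstdint>
#include <vector>

// output is (E, output_size) and zero-initialized; A is (E, NA), B is (E, NB);
// C, indices_A, indices_B and indices_output all have length K.
void sap_reference(
    std::vector<double>& output,
    const std::vector<double>& A,
    const std::vector<double>& B,
    const std::vector<double>& C,
    const std::vector<int32_t>& indices_A,
    const std::vector<int32_t>& indices_B,
    const std::vector<int32_t>& indices_output,
    std::size_t E, std::size_t NA, std::size_t NB, std::size_t output_size
) {
    const std::size_t K = C.size();
    for (std::size_t e = 0; e < E; ++e) {
        for (std::size_t k = 0; k < K; ++k) {
            output[e * output_size + indices_output[k]] +=
                C[k] * A[e * NA + indices_A[k]] * B[e * NB + indices_B[k]];
        }
    }
}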
5 changes: 5 additions & 0 deletions mops-torch/src/sasaw.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/sasaw.hpp"
#include "mops/torch/utils.hpp"

Expand Down
2 changes: 2 additions & 0 deletions mops/CMakeLists.txt
@@ -124,6 +124,8 @@ if(CMAKE_CUDA_COMPILER AND MOPS_CUDA)
"src/opsa/opsa.cu"
"src/hpe/hpe.cu"
"src/sap/sap.cu"
"src/sasaw/sasaw.cu"
"src/opsaw/opsaw.cu"
)

endif()
18 changes: 12 additions & 6 deletions mops/include/mops/hpe.h
@@ -69,15 +69,17 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f32(
mops_tensor_1d_f32_t output,
mops_tensor_2d_f32_t A,
mops_tensor_1d_f32_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation for 64-bit floats
int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f64(
mops_tensor_1d_f64_t output,
mops_tensor_2d_f64_t A,
mops_tensor_1d_f64_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 32-bit floats
@@ -86,7 +88,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f32(
mops_tensor_1d_f32_t grad_output,
mops_tensor_2d_f32_t A,
mops_tensor_1d_f32_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 64-bit floats
@@ -95,7 +98,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f64(
mops_tensor_1d_f64_t grad_output,
mops_tensor_2d_f64_t A,
mops_tensor_1d_f64_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 32-bit floats
@@ -106,7 +110,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32(
mops_tensor_1d_f32_t grad_output,
mops_tensor_2d_f32_t A,
mops_tensor_1d_f32_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 64-bit floats
@@ -117,7 +122,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64(
mops_tensor_1d_f64_t grad_output,
mops_tensor_2d_f64_t A,
mops_tensor_1d_f64_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

#ifdef __cplusplus