CUDA OPSA VJP VJP #61

Open
wants to merge 42 commits into base: main

Commits (42)
72ce285
Add CI config files
Apr 29, 2024
dd61fe5
x
nickjbrowning Apr 30, 2024
d463d21
added stream management to OPSA
nickjbrowning May 2, 2024
d81ec9c
x
nickjbrowning May 2, 2024
c8d8776
added support for other ops.
nickjbrowning May 2, 2024
0b06e79
formatting.
nickjbrowning May 2, 2024
de5a3da
Merge branch 'main' into multigpu_and_streams
nickjbrowning May 2, 2024
6a5c2de
Merge branch 'multigpu_and_streams' into cuda_vjpvjp_opsa
nickjbrowning May 2, 2024
9bfd8dd
WIP
nickjbrowning May 2, 2024
23d7784
preliminary OPSA VJP VJP implementation.
nickjbrowning May 2, 2024
d57411c
threadsync
nickjbrowning May 2, 2024
b2c170a
fixed build issues.
nickjbrowning May 3, 2024
ffaaabd
Merge branch 'multigpu_and_streams' into cuda_vjpvjp_opsa
nickjbrowning May 3, 2024
1c7424b
fixed issue with opsaw instantiation
nickjbrowning May 3, 2024
f78161c
fixed issue with opsaw instantiation
nickjbrowning May 3, 2024
e89350b
formatting.
nickjbrowning May 3, 2024
603b21c
fixed OSPA VJPVJP with conditional gradgrads
nickjbrowning May 3, 2024
ea54e9a
add comments on the test job script
May 3, 2024
1bb35aa
insignificant commit for retriggering pipeline
May 3, 2024
497d811
Merge branch 'main' into cscs_ci
frostedoyster May 4, 2024
b9be358
Install tox
frostedoyster May 4, 2024
d9fc283
attempt addressing CI compiler errors and warnings
May 6, 2024
5b04a18
Merge branch 'cscs_ci' of github.com:lab-cosmo/mops into cscs_ci
May 6, 2024
af2ec60
remove polynomial order zero case to avoid divide by zero issue
May 6, 2024
9fee2a1
correct address of the atomic add operations in sap
May 6, 2024
fc80766
specify addresses with indices for atomic add operations to address C…
May 6, 2024
2ed5771
dummy implementation for polynomial order zero
May 7, 2024
62b73d8
removed bug in CUDA HPE.
nickjbrowning May 10, 2024
b6d4795
header change
nickjbrowning May 10, 2024
68223e2
HPE divide by zero fix.
nickjbrowning May 10, 2024
3b4527f
added in code for pre-sm60 atomicAdd(doubles)
nickjbrowning May 10, 2024
e3c70c2
macro to switch out the atomicAdds depending on ARCH
nickjbrowning May 10, 2024
ade1cd3
documentation.
nickjbrowning May 10, 2024
263984d
changed macro to device function.
nickjbrowning May 10, 2024
e654289
Merge branch 'cscs_ci' into multigpu_and_streams
nickjbrowning May 10, 2024
899ea94
fixed sap cstream
nickjbrowning May 10, 2024
99cedf5
formatting.
nickjbrowning May 10, 2024
21b6235
missing device guard in SAP
nickjbrowning May 10, 2024
dd29ac0
missing stream.
nickjbrowning May 10, 2024
b033a29
Merge branch 'multigpu_and_streams' into cuda_vjpvjp_opsa
nickjbrowning May 10, 2024
aebcbe8
modified atomicAdd
nickjbrowning May 10, 2024
305e0cd
added __device__ to ATOMIC_ADD template forward declarations
nickjbrowning May 10, 2024
6 changes: 6 additions & 0 deletions ci/docker/Dockerfile.base
@@ -0,0 +1,6 @@
FROM nvcr.io/nvidia/pytorch:23.10-py3

RUN apt-get update

# install boost test framework
RUN apt-get install -y libboost-test-dev
36 changes: 36 additions & 0 deletions ci/pipeline.yml
@@ -0,0 +1,36 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- build
- test

build_base_image_job:
stage: build
extends: .container-builder-dynamic-name
timeout: 2h
variables:
DOCKERFILE: ci/docker/Dockerfile.base
WATCH_FILECHANGES: $DOCKERFILE
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/mops

test_job:
stage: test
extends: .container-runner-daint-gpu
image: $BASE_IMAGE
timeout: 2h
script:
- export CUDA_HOME="/usr/local/cuda"
- python3 -m pip install --upgrade pip
- echo "Install Tox"
- python3 -m pip install tox
- echo "Run the Tox Script"
- tox
- echo "Tox script completed"

variables:
SLURM_JOB_NUM_NODES: 1
SLURM_PARTITION: normal
SLURM_NTASKS: 1
SLURM_TIMELIMIT: '00:40:00'
GIT_STRATEGY: fetch
27 changes: 25 additions & 2 deletions mops-torch/src/hpe.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/hpe.hpp"
#include "mops/torch/utils.hpp"

@@ -38,15 +43,25 @@ torch::Tensor HomogeneousPolynomialEvaluation::forward(
});
} else if (A.device().is_cuda()) {

#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation", [&]() {
mops::cuda::homogeneous_polynomial_evaluation<scalar_t>(
details::torch_to_mops_1d<scalar_t>(output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_2d<int32_t>(indices_A)
details::torch_to_mops_2d<int32_t>(indices_A),
stream
);
});

#endif

} else {
C10_THROW_ERROR(
ValueError,
@@ -108,6 +123,12 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward(
);
});
} else if (A.device().is_cuda()) {
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "homogeneous_polynomial_evaluation_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
@@ -121,9 +142,11 @@ torch::Tensor HomogeneousPolynomialEvaluationBackward::forward(
details::torch_to_mops_1d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_2d<int32_t>(indices_A)
details::torch_to_mops_2d<int32_t>(indices_A),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError,
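Every CUDA branch added in this file follows the same pattern: guard the device of the input tensor, query the current PyTorch CUDA stream, and hand it to the mops::cuda launcher as an opaque void*. One practical consequence is that the kernels run on whatever stream the caller has made current. The snippet below is a hypothetical caller-side sketch (not part of the PR) showing how a side stream from PyTorch's pool would be picked up by that getCurrentCUDAStream() call; the actual operator invocation is left as a placeholder.

#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/torch.h>

void run_on_side_stream(const torch::Tensor& A) {
    // Grab a stream from PyTorch's pool on the tensor's device.
    c10::cuda::CUDAStream side_stream =
        c10::cuda::getStreamFromPool(/*isHighPriority=*/false, A.device().index());

    {
        // While this guard is alive, getCurrentCUDAStream() inside the mops
        // wrappers returns side_stream, so the kernels launch on it.
        c10::cuda::CUDAStreamGuard guard(side_stream);
        // ... call the mops-torch operator on A here ...
    }

    // Make sure the asynchronous work is finished before using the result.
    side_stream.synchronize();
}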
68 changes: 63 additions & 5 deletions mops-torch/src/opsa.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/opsa.hpp"
#include "mops/torch/utils.hpp"

@@ -48,6 +53,10 @@ torch::Tensor OuterProductScatterAdd::forward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

output = torch::empty(
{output_size, A.size(1), B.size(1)},
torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
@@ -58,7 +67,8 @@ torch::Tensor OuterProductScatterAdd::forward(
details::torch_to_mops_3d<scalar_t>(output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});

@@ -130,6 +140,10 @@ std::vector<torch::Tensor> OuterProductScatterAddBackward::forward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};

@@ -150,7 +164,8 @@ std::vector<torch::Tensor> OuterProductScatterAddBackward::forward(
details::torch_to_mops_3d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
@@ -228,9 +243,52 @@ std::vector<torch::Tensor> OuterProductScatterAddBackward::backward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
C10_THROW_ERROR(
ValueError, "outer_product_scatter_add_vjp_vjp is not implemented for CUDA yet"
);
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "outer_product_scatter_add_vjp", [&]() {
auto mops_grad_grad_output = mops::Tensor<scalar_t, 3>{nullptr, {0, 0, 0}};
if (grad_output.requires_grad()) {
grad_grad_output = torch::empty_like(grad_output);
mops_grad_grad_output = details::torch_to_mops_3d<scalar_t>(grad_grad_output);
}

auto mops_grad_A_2 = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (A.requires_grad()) {
grad_A_2 = torch::empty_like(A);
mops_grad_A_2 = details::torch_to_mops_2d<scalar_t>(grad_A_2);
}

auto mops_grad_B_2 = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (B.requires_grad()) {
grad_B_2 = torch::empty_like(B);
mops_grad_B_2 = details::torch_to_mops_2d<scalar_t>(grad_B_2);
}

auto mops_grad_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (grad_grad_A.defined()) {
mops_grad_grad_A = details::torch_to_mops_2d<scalar_t>(grad_grad_A);
}

auto mops_grad_grad_B = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (grad_grad_B.defined()) {
mops_grad_grad_B = details::torch_to_mops_2d<scalar_t>(grad_grad_B);
}

mops::cuda::outer_product_scatter_add_vjp_vjp<scalar_t>(
mops_grad_grad_output,
mops_grad_A_2,
mops_grad_B_2,
mops_grad_grad_A,
mops_grad_grad_B,
details::torch_to_mops_3d<scalar_t>(grad_output),
details::torch_to_mops_2d<scalar_t>(A),
details::torch_to_mops_2d<scalar_t>(B),
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
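This hunk replaces the previous "not implemented for CUDA yet" error with an actual dispatch to mops::cuda::outer_product_scatter_add_vjp_vjp, allocating each output only when the corresponding input requires a gradient and passing empty mops::Tensor handles otherwise. For reference, the sketch below is a plain CPU illustration (not the PR's kernel) of what that double backward computes, derived from the OPSA definition output[indices_output[e], i, j] += A[e, i] * B[e, j]; the flat row-major indexing and argument layout are assumptions made for the sketch.

#include <cstddef>
#include <cstdint>
#include <vector>

// grad_grad_A / grad_grad_B are the incoming gradients with respect to the
// outputs of the first backward (grad_A, grad_B); the three outputs are the
// gradients with respect to grad_output, A and B. All buffers are row-major
// and the output buffers are assumed to be zero-initialized.
void opsa_vjp_vjp_reference(
    std::vector<double>& grad_grad_output,      // (P, N, M)
    std::vector<double>& grad_A_2,              // (E, N)
    std::vector<double>& grad_B_2,              // (E, M)
    const std::vector<double>& grad_grad_A,     // (E, N)
    const std::vector<double>& grad_grad_B,     // (E, M)
    const std::vector<double>& grad_output,     // (P, N, M)
    const std::vector<double>& A,               // (E, N)
    const std::vector<double>& B,               // (E, M)
    const std::vector<int32_t>& indices_output, // (E)
    std::size_t N, std::size_t M
) {
    const std::size_t E = indices_output.size();
    for (std::size_t e = 0; e < E; ++e) {
        const std::size_t p = static_cast<std::size_t>(indices_output[e]);
        for (std::size_t i = 0; i < N; ++i) {
            for (std::size_t j = 0; j < M; ++j) {
                const double go = grad_output[(p * N + i) * M + j];
                // First backward:
                //   grad_A[e,i] = sum_j grad_output[p,i,j] * B[e,j]
                //   grad_B[e,j] = sum_i grad_output[p,i,j] * A[e,i]
                // Differentiating those w.r.t. grad_output, A and B gives:
                grad_grad_output[(p * N + i) * M + j] +=
                    grad_grad_A[e * N + i] * B[e * M + j] +
                    grad_grad_B[e * M + j] * A[e * N + i];
                grad_A_2[e * N + i] += grad_grad_B[e * M + j] * go;
                grad_B_2[e * M + j] += grad_grad_A[e * N + i] * go;
            }
        }
    }
}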
5 changes: 5 additions & 0 deletions mops-torch/src/opsaw.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/opsaw.hpp"
#include "mops/torch/utils.hpp"

36 changes: 33 additions & 3 deletions mops-torch/src/sap.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/sap.hpp"
#include "mops/torch/utils.hpp"

@@ -59,6 +64,14 @@ torch::Tensor SparseAccumulationOfProducts::forward(
);
});
} else if (A.device().is_cuda()) {

#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

output = torch::empty(
{A.size(0), output_size},
torch::TensorOptions().dtype(A.scalar_type()).device(A.device())
@@ -72,9 +85,11 @@ torch::Tensor SparseAccumulationOfProducts::forward(
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_1d<int32_t>(indices_A),
details::torch_to_mops_1d<int32_t>(indices_B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError,
@@ -170,6 +185,14 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::forward(
);
});
} else if (A.device().is_cuda()) {

#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp", [&]() {
auto mops_grad_A = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (A.requires_grad()) {
@@ -192,9 +215,11 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::forward(
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_1d<int32_t>(indices_A),
details::torch_to_mops_1d<int32_t>(indices_B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
} else {
C10_THROW_ERROR(
ValueError,
@@ -276,6 +301,10 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::backward(
#ifndef MOPS_CUDA_ENABLED
C10_THROW_ERROR(ValueError, "MOPS was not compiled with CUDA support " + A.device().str());
#else
c10::cuda::CUDAGuard deviceGuard{A.device()};
cudaStream_t currstream = c10::cuda::getCurrentCUDAStream();
void* stream = reinterpret_cast<void*>(currstream);

AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "sparse_accumulation_of_products_vjp_vjp", [&]() {
auto mops_grad_grad_output = mops::Tensor<scalar_t, 2>{nullptr, {0, 0}};
if (grad_output.requires_grad()) {
@@ -317,7 +346,8 @@ std::vector<torch::Tensor> SparseAccumulationOfProductsBackward::backward(
details::torch_to_mops_1d<scalar_t>(C),
details::torch_to_mops_1d<int32_t>(indices_A),
details::torch_to_mops_1d<int32_t>(indices_B),
details::torch_to_mops_1d<int32_t>(indices_output)
details::torch_to_mops_1d<int32_t>(indices_output),
stream
);
});
#endif
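The sparse_accumulation_of_products wrappers receive the same device-guard and stream plumbing as the operators above. As a reminder of what these kernels compute, the sketch below is a plain CPU illustration of the SAP forward pass, assuming the documented definition output[e, indices_output[k]] += C[k] * A[e, indices_A[k]] * B[e, indices_B[k]]; the shapes and flat indexing are assumptions made for the sketch, not the PR's code.

#include <cstddef>
#include <cstdint>
#include <vector>

// output is (E, output_size) and zero-initialized; A is (E, NA), B is (E, NB);
// C, indices_A, indices_B and indices_output all have length K.
void sap_reference(
    std::vector<double>& output,
    const std::vector<double>& A,
    const std::vector<double>& B,
    const std::vector<double>& C,
    const std::vector<int32_t>& indices_A,
    const std::vector<int32_t>& indices_B,
    const std::vector<int32_t>& indices_output,
    std::size_t E, std::size_t NA, std::size_t NB, std::size_t output_size
) {
    const std::size_t K = C.size();
    for (std::size_t e = 0; e < E; ++e) {
        for (std::size_t k = 0; k < K; ++k) {
            output[e * output_size + indices_output[k]] +=
                C[k] * A[e * NA + indices_A[k]] * B[e * NB + indices_B[k]];
        }
    }
}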
5 changes: 5 additions & 0 deletions mops-torch/src/sasaw.cpp
@@ -1,3 +1,8 @@
#ifdef MOPS_CUDA_ENABLED
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#endif

#include "mops/torch/sasaw.hpp"
#include "mops/torch/utils.hpp"

Expand Down
2 changes: 2 additions & 0 deletions mops/CMakeLists.txt
@@ -124,6 +124,8 @@ if(CMAKE_CUDA_COMPILER AND MOPS_CUDA)
"src/opsa/opsa.cu"
"src/hpe/hpe.cu"
"src/sap/sap.cu"
"src/sasaw/sasaw.cu"
"src/opsaw/opsaw.cu"
)

endif()
18 changes: 12 additions & 6 deletions mops/include/mops/hpe.h
@@ -69,15 +69,17 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f32(
mops_tensor_1d_f32_t output,
mops_tensor_2d_f32_t A,
mops_tensor_1d_f32_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation for 64-bit floats
int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_f64(
mops_tensor_1d_f64_t output,
mops_tensor_2d_f64_t A,
mops_tensor_1d_f64_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 32-bit floats
@@ -86,7 +88,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f32(
mops_tensor_1d_f32_t grad_output,
mops_tensor_2d_f32_t A,
mops_tensor_1d_f32_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp for 64-bit floats
@@ -95,7 +98,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_f64(
mops_tensor_1d_f64_t grad_output,
mops_tensor_2d_f64_t A,
mops_tensor_1d_f64_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 32-bit floats
@@ -106,7 +110,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f32(
mops_tensor_1d_f32_t grad_output,
mops_tensor_2d_f32_t A,
mops_tensor_1d_f32_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

/// CUDA version of mops::homogeneous_polynomial_evaluation_vjp_vjp for 64-bit floats
@@ -117,7 +122,8 @@ int MOPS_EXPORT mops_cuda_homogeneous_polynomial_evaluation_vjp_vjp_f64(
mops_tensor_1d_f64_t grad_output,
mops_tensor_2d_f64_t A,
mops_tensor_1d_f64_t C,
mops_tensor_2d_i32_t indices_A
mops_tensor_2d_i32_t indices_A,
void* cuda_stream
);

#ifdef __cplusplus