Commit

Merge branch 'cscs_ci' into multigpu_and_streams

nickjbrowning committed May 10, 2024
2 parents e89350b + 263984d commit e654289
Showing 7 changed files with 178 additions and 77 deletions.
6 changes: 6 additions & 0 deletions ci/docker/Dockerfile.base
@@ -0,0 +1,6 @@
FROM nvcr.io/nvidia/pytorch:23.10-py3

RUN apt-get update

# install boost test framework
RUN apt-get install -y libboost-test-dev
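
The base image installs libboost-test-dev so the C++ test suite can be built against Boost.Test. As a rough orientation only (this is not taken from the mops tests; the module name and function under test are placeholders), a minimal Boost.Test case looks like the sketch below. The header-only variant needs no extra link flags.

// Minimal Boost.Test sketch (illustrative only, not part of the mops test suite).
// Build and run with: g++ -std=c++17 example_test.cpp -o example_test && ./example_test
#define BOOST_TEST_MODULE example_module
#include <boost/test/included/unit_test.hpp>

// Hypothetical function under test.
static int add(int a, int b) { return a + b; }

BOOST_AUTO_TEST_CASE(addition_works) {
    BOOST_CHECK_EQUAL(add(2, 2), 4);
    BOOST_CHECK_NE(add(2, 2), 5);
}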
36 changes: 36 additions & 0 deletions ci/pipeline.yml
@@ -0,0 +1,36 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- build
- test

build_base_image_job:
stage: build
extends: .container-builder-dynamic-name
timeout: 2h
variables:
DOCKERFILE: ci/docker/Dockerfile.base
WATCH_FILECHANGES: $DOCKERFILE
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/mops

test_job:
stage: test
extends: .container-runner-daint-gpu
image: $BASE_IMAGE
timeout: 2h
script:
- export CUDA_HOME="/usr/local/cuda"
- python3 -m pip install --upgrade pip
- echo "Install Tox"
- python3 -m pip install tox
- echo "Run the Tox Script"
- tox
- echo "Tox script completed"

variables:
SLURM_JOB_NUM_NODES: 1
SLURM_PARTITION: normal
SLURM_NTASKS: 1
SLURM_TIMELIMIT: '00:40:00'
GIT_STRATEGY: fetch
135 changes: 76 additions & 59 deletions mops/src/hpe/hpe.cu
@@ -53,14 +53,22 @@ __global__ void homogeneous_polynomial_evaluation_kernel(

__syncthreads();

int32_t i_monomial = threadIdx.x % polynomial_order;
int32_t x = threadIdx.x / polynomial_order;
int32_t nx = blockDim.x / polynomial_order;

for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) {
if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) {
buffer_indices_A[i_monomial * blockDim.x + lbasis] =
indices_A.data[(i + lbasis) * polynomial_order + i_monomial];
__syncthreads();

int32_t i_monomial;
int32_t x;
int32_t nx;

if (polynomial_order > 0) {
i_monomial = threadIdx.x % polynomial_order;
x = threadIdx.x / polynomial_order;
nx = blockDim.x / polynomial_order;

for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) {
if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) {
buffer_indices_A[i_monomial * blockDim.x + lbasis] =
indices_A.data[(i + lbasis) * polynomial_order + i_monomial];
}
}
}
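
The change above declares i_monomial, x and nx outside the staging loop and only computes them inside an if (polynomial_order > 0) guard: for the polynomial_order == 0 instantiation, the old code would have taken a modulo and a division by zero. The loop itself transposes indices_A from a [basis][monomial] layout in global memory into a [monomial][basis] shared-memory buffer. A standalone sketch of that guarded staging pattern (illustrative names, not the library kernel):

// Sketch only: same guarded staging pattern, with illustrative names.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

template <int POLY_ORDER>
__global__ void stage_indices(const int32_t* indices_A, int32_t* staged) {
    // POLY_ORDER * blockDim.x entries, provided by the launch configuration.
    extern __shared__ int32_t buffer_indices_A[];

    if (POLY_ORDER > 0) { // guard: POLY_ORDER == 0 would divide by zero below
        int32_t i_monomial = threadIdx.x % POLY_ORDER; // monomial slot this thread fills
        int32_t x = threadIdx.x / POLY_ORDER;          // first basis row for this thread
        int32_t nx = blockDim.x / POLY_ORDER;          // stride between basis rows

        for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) {
            if (i_monomial * blockDim.x + lbasis < POLY_ORDER * blockDim.x) {
                buffer_indices_A[i_monomial * blockDim.x + lbasis] =
                    indices_A[lbasis * POLY_ORDER + i_monomial];
            }
        }
    }
    __syncthreads();

    // Write the staged buffer back out so the host can inspect the transposed layout.
    for (int j = threadIdx.x; j < POLY_ORDER * (int)blockDim.x; j += blockDim.x) {
        staged[j] = buffer_indices_A[j];
    }
}

int main() {
    constexpr int poly_order = 2;
    constexpr int block = 4;
    int32_t h_idx[block * poly_order];
    for (int i = 0; i < block * poly_order; i++) {
        h_idx[i] = i; // row-major [basis][monomial] input
    }

    int32_t *d_idx = nullptr, *d_staged = nullptr;
    cudaMalloc(&d_idx, sizeof(h_idx));
    cudaMalloc(&d_staged, sizeof(h_idx));
    cudaMemcpy(d_idx, h_idx, sizeof(h_idx), cudaMemcpyHostToDevice);

    stage_indices<poly_order>
        <<<1, block, poly_order * block * sizeof(int32_t)>>>(d_idx, d_staged);

    int32_t h_staged[block * poly_order];
    cudaMemcpy(h_staged, d_staged, sizeof(h_staged), cudaMemcpyDeviceToHost);
    for (int m = 0; m < poly_order; m++) {
        for (int b = 0; b < block; b++) {
            printf("monomial %d, basis %d -> index %d\n", m, b, h_staged[m * block + b]);
        }
    }

    cudaFree(d_idx);
    cudaFree(d_staged);
    return 0;
}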

@@ -146,47 +154,47 @@ void mops::cuda::homogeneous_polynomial_evaluation(
switch (polynomial_order) {
case 0:
homogeneous_polynomial_evaluation_kernel<scalar_t, 0>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 1:
homogeneous_polynomial_evaluation_kernel<scalar_t, 1>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 2:
homogeneous_polynomial_evaluation_kernel<scalar_t, 2>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 3:
homogeneous_polynomial_evaluation_kernel<scalar_t, 3>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 4:
homogeneous_polynomial_evaluation_kernel<scalar_t, 4>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 5:
homogeneous_polynomial_evaluation_kernel<scalar_t, 5>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 6:
homogeneous_polynomial_evaluation_kernel<scalar_t, 6>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 7:
homogeneous_polynomial_evaluation_kernel<scalar_t, 7>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 8:
homogeneous_polynomial_evaluation_kernel<scalar_t, 8>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 9:
homogeneous_polynomial_evaluation_kernel<scalar_t, 9>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
case 10:
homogeneous_polynomial_evaluation_kernel<scalar_t, 10>
<<<block_dim, thread_block, space>>>(output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(output, A, C, indices_A);
break;
default:
break;
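
On both sides of this switch, the runtime polynomial_order is mapped onto a compile-time template argument; what the commit changes is the fourth launch parameter, so every kernel now runs on the caller-supplied stream (cstream) instead of the default stream. A compressed, self-contained sketch of that dispatch-plus-stream pattern (dispatch_on_order and my_kernel are made-up names, not the mops API):

// Illustrative sketch: dispatch a runtime `order` to a templated kernel and
// launch it on a caller-supplied stream.
#include <cuda_runtime.h>

template <int ORDER> __global__ void my_kernel(float* out) {
    // ORDER is a compile-time constant here, so loops over it can be unrolled.
    float acc = 0.0f;
    for (int k = 0; k < ORDER; k++) {
        acc += 1.0f;
    }
    out[threadIdx.x] = acc;
}

void dispatch_on_order(int order, float* out, dim3 grid, dim3 block,
                       size_t shmem, cudaStream_t stream) {
    switch (order) {
    case 0:
        my_kernel<0><<<grid, block, shmem, stream>>>(out);
        break;
    case 1:
        my_kernel<1><<<grid, block, shmem, stream>>>(out);
        break;
    case 2:
        my_kernel<2><<<grid, block, shmem, stream>>>(out);
        break;
    default:
        break; // unsupported order: skipped, mirroring the switch above
    }
}

int main() {
    float* d_out = nullptr;
    cudaMalloc(&d_out, 32 * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    dispatch_on_order(2, d_out, dim3(1), dim3(32), 0, stream);

    // Synchronize only the stream we launched on, not the whole device.
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(d_out);
    return 0;
}

Launching on an explicit stream is what lets work for different GPUs or different batches overlap instead of serializing on the default stream.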
@@ -256,59 +264,68 @@ __global__ void homogeneous_polynomial_evaluation_vjp_kernel(
__syncthreads();

scalar_t gout = grad_output.data[batch_id];
if (polynomial_order > 0) {
// indices_A : nbasis, polynomial_order
for (int32_t i = 0; i < nbasis; i += blockDim.x) {

// indices_A : nbasis, polynomial_order
for (int32_t i = 0; i < nbasis; i += blockDim.x) {
__syncthreads();

__syncthreads();
int32_t basis = i + threadIdx.x;

int32_t i_monomial = threadIdx.x % polynomial_order;
int32_t x = threadIdx.x / polynomial_order;
int32_t nx = blockDim.x / polynomial_order;
int32_t i_monomial;
int32_t x;
int32_t nx;

for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) {
if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) {
buffer_indices_A[i_monomial * blockDim.x + lbasis] =
indices_A.data[(i + lbasis) * polynomial_order + i_monomial];
i_monomial = threadIdx.x % polynomial_order;
x = threadIdx.x / polynomial_order;
nx = blockDim.x / polynomial_order;

for (int lbasis = x; lbasis < blockDim.x; lbasis += nx) {
if (i_monomial * blockDim.x + lbasis < polynomial_order * blockDim.x) {
buffer_indices_A[i_monomial * blockDim.x + lbasis] =
indices_A.data[(i + lbasis) * polynomial_order + i_monomial];
}
}
}

__syncthreads();
__syncthreads();

int32_t basis = i + threadIdx.x;
if (basis < nbasis) {

if (basis < nbasis) {
scalar_t c = C.data[basis] * gout;

scalar_t c = C.data[basis] * gout;
for (int32_t i_monomial = 0; i_monomial < polynomial_order; i_monomial++) {

for (int32_t i_monomial = 0; i_monomial < polynomial_order; i_monomial++) {
scalar_t tmp_i = c;

scalar_t tmp_i = c;
for (int32_t j_monomial = 0; j_monomial < polynomial_order; j_monomial++) {

for (int32_t j_monomial = 0; j_monomial < polynomial_order; j_monomial++) {
if (i_monomial == j_monomial) {
continue;
}

if (i_monomial == j_monomial) {
continue;
int32_t idx_j = buffer_indices_A
[j_monomial * blockDim.x + threadIdx.x]; // indices_A.data[j_monomial
// * indices_A.shape[0] + basis];

tmp_i *= buffer_nu1[idx_j];
}

int32_t idx_j =
buffer_indices_A[j_monomial * blockDim.x + threadIdx.x]; // indices_A.data[j_monomial
// * indices_A.shape[0] + basis];
int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x];

tmp_i *= buffer_nu1[idx_j];
ATOMIC_ADD(&buffer_gradA[idx_i], tmp_i);
}

int32_t idx_i = buffer_indices_A[i_monomial * blockDim.x + threadIdx.x];

atomicAdd(&buffer_gradA[idx_i], tmp_i);
}
}
}

__syncthreads();

for (int32_t i = threadIdx.x; i < nnu1; i += blockDim.x) {
grad_A.data[batch_id * nnu1 + i] = buffer_gradA[i];
if (polynomial_order > 0) {
grad_A.data[batch_id * nnu1 + i] = buffer_gradA[i];
} else {
grad_A.data[batch_id * nnu1 + i] = 0.0;
}
}
}
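
What the kernel accumulates is the product rule: for a basis term C[basis] * prod_k A[indices_A[basis, k]], the derivative with respect to A[indices_A[basis, i]] is C[basis] * grad_output * prod_{k != i} A[indices_A[basis, k]], and several basis terms can target the same entry of A, hence the atomic add. A plain-C++ CPU reference sketch of that accumulation for a single batch entry (illustrative signature, not the library API; the GPU kernel reads A through a shared-memory cache instead of directly):

// CPU reference sketch of the VJP accumulation (illustrative, single batch entry).
// grad_A must be zero-initialized (or hold prior accumulation) before the call.
#include <cstdint>
#include <vector>

void hpe_vjp_reference(const std::vector<double>& A,          // nnu1 values
                       const std::vector<double>& C,          // nbasis coefficients
                       const std::vector<int32_t>& indices_A, // nbasis x polynomial_order
                       int polynomial_order, double grad_output,
                       std::vector<double>& grad_A) {         // nnu1 accumulators
    const int nbasis = static_cast<int>(C.size());
    for (int basis = 0; basis < nbasis; basis++) {
        const double c = C[basis] * grad_output;
        for (int i_monomial = 0; i_monomial < polynomial_order; i_monomial++) {
            double tmp_i = c;
            for (int j_monomial = 0; j_monomial < polynomial_order; j_monomial++) {
                if (i_monomial == j_monomial) {
                    continue;
                }
                tmp_i *= A[indices_A[basis * polynomial_order + j_monomial]];
            }
            // On the GPU this is an ATOMIC_ADD, because many threads may
            // accumulate into the same grad_A entry concurrently.
            grad_A[indices_A[basis * polynomial_order + i_monomial]] += tmp_i;
        }
    }
}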

@@ -351,47 +368,47 @@ void mops::cuda::homogeneous_polynomial_evaluation_vjp(
switch (polynomial_order) {
case 0:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 0>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 1:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 1>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 2:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 2>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 3:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 3>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 4:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 4>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 5:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 5>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 6:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 6>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 7:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 7>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 8:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 8>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 9:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 9>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
case 10:
homogeneous_polynomial_evaluation_vjp_kernel<scalar_t, 10>
<<<block_dim, thread_block, space>>>(grad_A, grad_output, A, C, indices_A);
<<<block_dim, thread_block, space, cstream>>>(grad_A, grad_output, A, C, indices_A);
break;
default:
break;
31 changes: 31 additions & 0 deletions mops/src/internal/cuda_utils.cu
@@ -11,6 +11,37 @@ __host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim) {
return (x + bdim - 1) / bdim;
}

__device__ double atomicAdd_presm60(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;

do {
assumed = old;
old = atomicCAS(
address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))
);

// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);

return __longlong_as_double(old);
}

template <typename scalar_t> __device__ scalar_t ATOMIC_ADD(scalar_t* address, scalar_t val) {
#if __CUDA_ARCH__ < 600
if constexpr (sizeof(scalar_t) == 4) {
return atomicAdd(address, val);
} else if constexpr (sizeof(scalar_t) == 8) {
return atomicAdd_presm60(address, val);
}
#else
return atomicAdd(address, val);
#endif
}

template float ATOMIC_ADD<float>(float* address, float val);
template double ATOMIC_ADD<double>(double* address, double val);

template <typename T>
__host__ __device__ T* shared_array(std::size_t n_elements, void*& ptr, std::size_t* space) noexcept {
const std::uintptr_t inptr = reinterpret_cast<uintptr_t>(ptr);
11 changes: 11 additions & 0 deletions mops/src/internal/cuda_utils.cuh
@@ -23,6 +23,17 @@ using namespace std;
} \
} while (0)

/*
 * Pre-SM60 cards do not support atomicAdd(double*, double). This function uses an
 * atomicCAS loop to update the address instead.
*/
__device__ double atomicAdd_presm60(double* address, double val);

/*
 * Function to select the right version of atomicAdd for the compute architecture being compiled for.
*/
template <typename scalar_t> __device__ scalar_t ATOMIC_ADD(scalar_t* address, scalar_t val);
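
ATOMIC_ADD lets kernel code accumulate doubles without caring about the target architecture: on SM60 and newer it forwards to the hardware atomicAdd(double*, double), on older cards it falls back to the atomicCAS loop declared above. A hedged usage sketch (the kernel name is hypothetical, and it assumes the translation unit includes this header and is device-linked against cuda_utils.cu, as the hpe kernels already are):

// Illustrative only: every thread adds its element into one double accumulator.
// The call site is identical on pre- and post-SM60 hardware.
__global__ void reduce_sum(const double* values, int n, double* result) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        ATOMIC_ADD(result, values[i]);
    }
}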

__host__ __device__ int32_t find_integer_divisor(int32_t x, int32_t bdim);

/*
4 changes: 2 additions & 2 deletions mops/src/opsa/cpu.tpp
@@ -84,7 +84,7 @@ void mops::outer_product_scatter_add_vjp(
scalar_t *grad_output_ptr = grad_output.data;
scalar_t *a_ptr = A.data;
scalar_t *b_ptr = B.data;
int32_t *indices_output_ptr = indices_output.data;
[[maybe_unused]] int32_t *indices_output_ptr = indices_output.data;

#pragma omp parallel for
for (size_t i = 0; i < size_ab; i++) {
@@ -167,7 +167,7 @@ void mops::outer_product_scatter_add_vjp_vjp(
scalar_t *grad_output_ptr = grad_output.data;
scalar_t *a_ptr = A.data;
scalar_t *b_ptr = B.data;
int32_t *indices_output_ptr = indices_output.data;
[[maybe_unused]] int32_t *indices_output_ptr = indices_output.data;

scalar_t *grad_output_ptr_i = nullptr;
scalar_t *a_ptr_i = nullptr;
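
In both hunks, [[maybe_unused]] (C++17) tells the compiler not to emit an unused-variable warning for indices_output_ptr in build configurations that never read it, without #ifdef-ing the declaration away. A minimal illustration with made-up names:

// Illustrative only. In release builds (NDEBUG defined) debug_ptr is never read;
// the attribute keeps -Wunused-variable quiet without conditionally compiling the line.
#include <cassert>

void process(int* data) {
    [[maybe_unused]] int* debug_ptr = data;
#ifndef NDEBUG
    assert(debug_ptr != nullptr);
#endif
}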
