Skip to content

Commit

Permalink
test remove dependency on cuBLAS
Browse files Browse the repository at this point in the history
  • Loading branch information
burlen committed Aug 16, 2023
1 parent dbfc22a commit a5d7560
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 33 deletions.
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ if (HAMR_ENABLE_CUDA)
set_source_files_properties(test_hamr_stream_cuda.cpp PROPERTIES LANGUAGE CUDA)
endif()
add_executable(test_hamr_stream_cuda test_hamr_stream_cuda.cpp)
target_link_libraries(test_hamr_stream_cuda hamr CUDA::cublas)
target_link_libraries(test_hamr_stream_cuda hamr)
if (NOT HAMR_NVHPC_CUDA)
set_target_properties(test_hamr_stream_cuda PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}")
endif()
Expand Down
69 changes: 37 additions & 32 deletions test/test_hamr_stream_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,33 @@
#include <cstring>

#include <cuda_runtime.h>
#include <cublas_v2.h>

#include "hamr_buffer.h"
#include "hamr_cuda_launch.h"
#include "hamr_buffer_util.h"

using allocator = hamr::buffer_allocator;
using transfer = hamr::buffer_transfer;

template <typename T>
__global__
void gemm(int M, int N, int K, T alpha, const T *A,
const T *B, T beta, T *C)
{
int c = blockIdx.x * blockDim.x + threadIdx.x;
int r = blockIdx.y * blockDim.y + threadIdx.y;

if (( r >= N ) || ( c >= M ))
return;

T tmp = T();

for (int q = 0; q < K; ++q)
tmp += A[M*q + r] * B[K*c + q];

C[M*c + r] = tmp;
}

// matrix scalar multiply
template <typename T>
__global__
Expand Down Expand Up @@ -119,32 +138,35 @@ hamr::buffer<double> gen_B(int n)
}

// --------------------------------------------------------------------------
int gemm(cublasHandle_t h, cudaStream_t strm, transfer sync, int n,
int gemm(cudaStream_t strm, transfer sync, int n,
const hamr::buffer<double> &A, const hamr::buffer<double> &B,
hamr::buffer<double> &C)
{
// get A on the GPU (if it's not already there)
auto spa = A.get_cuda_accessible();
cudaError_t ierr = cudaSuccess;

// get B on the GPU (if it's not already there)
auto spb = B.get_cuda_accessible();
// get A and B on the GPU (if not already there)
auto [spa, pa] = hamr::get_cuda_accessible(A);
auto [spb, pb] = hamr::get_cuda_accessible(B);

// allocate space for the result on the GPU (will use memory allocated in C
// if it was allocated on the GPU)
hamr::buffer<double> tmp(allocator::cuda_async, strm, sync, std::move(C));
double *ptmp = tmp.data();

double one = 1.0;
double zero = 0.0;

// do the matrix multiply
cublasSetStream(h, strm);

cublasStatus_t ierr = cublasDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
n, n, n, &one, spa.get(), n, spb.get(), n, &zero, tmp.data(), n);

if (ierr != CUBLAS_STATUS_SUCCESS)
int nthr = 16;
int nblk = n / nthr + ( n % nthr ? 1 : 0 );
dim3 gridDim( nblk, nblk );
dim3 blockDim( nthr, nthr );
gemm<<<gridDim, blockDim, 0, strm>>>(n, n, n, one, pa, pb, zero, ptmp);
if ((ierr = cudaGetLastError()) != cudaSuccess)
{
std::cerr << "ERROR: gemm failed" << std::endl;
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
" Failed to launch the gemm kernel. "
<< cudaGetErrorString(ierr) << std::endl;
return -1;
}

Expand Down Expand Up @@ -205,20 +227,6 @@ int main(int argc, char **argv)
return -1;
}

// create the streams
if (!default_stream)
{
}

// initialize cublas
cublasHandle_t cbh;
cublasStatus_t ierr = cublasCreate(&cbh);
if (ierr != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "ERROR: failed to initialize cuBLAS" << std::endl;
return -1;
}

// allocate buffers
hamr::buffer<double> A1(allocator::cuda_host, cs1, sync);
hamr::buffer<double> B1(allocator::cuda_host, cs1, sync);
Expand All @@ -232,7 +240,7 @@ int main(int argc, char **argv)
A1 = gen_A(n);
B1 = gen_B(n);

if (gemm(cbh, cs1, sync, n, A1, B1, C1))
if (gemm(cs1, sync, n, A1, B1, C1))
return -1;

double scale = 1./n;
Expand All @@ -245,7 +253,7 @@ int main(int argc, char **argv)
A2 = gen_A(n);
B2 = gen_B(n);

if (gemm(cbh, cs2, sync, n, A2, B2, C2))
if (gemm(cs2, sync, n, A2, B2, C2))
return -1;

if (msm(C2.data(), scale, C2.data(), n, n, n, cs2))
Expand Down Expand Up @@ -276,9 +284,6 @@ int main(int argc, char **argv)
C1.free();
C2.free();

// finalize cuBLAS
cublasDestroy(cbh);

// release the streams
if (!default_stream)
{
Expand Down

0 comments on commit a5d7560

Please sign in to comment.