add test for cuda and sycl code in same cpp #481

Draft
wants to merge 1 commit into base: SYCLomatic
15 changes: 15 additions & 0 deletions features/feature_case/cuda_interop_sycl/command.sh
@@ -0,0 +1,15 @@

# Paths for a local CUDA 11.4 install and a DPC++/SYCL compiler build.
export CUDA_HOME=/usr/local/cuda-11.4
export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda-11.4/bin:$PATH
export CUDA_INCLUDE_PATH=/usr/local/cuda-11.4/include
export SYCL_ROOT_DIR=/nfs/shm/proj/icl/cmplrarch/deploy_syclos/llorgsyclngefi2linux/20230911_160000/build/linux_qa_release
export CUDA_ROOT_DIR=/usr/local/cuda-11.4

# Compile the tests as CUDA sources (-xcuda forces CUDA compilation of test.cpp),
# with the SYCL headers on the include path, linking both the SYCL runtime and cudart.
clang++ -std=c++17 --cuda-gpu-arch=sm_70 -I${SYCL_ROOT_DIR}/include/ -I${SYCL_ROOT_DIR}/include/sycl/ -Wno-linker-warnings -g test.cu -L${SYCL_ROOT_DIR}/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart -o usm_vec_add.o
clang++ -std=c++17 --cuda-gpu-arch=sm_70 -I${SYCL_ROOT_DIR}/include/ -I${SYCL_ROOT_DIR}/include/sycl/ -I${CUDA_ROOT_DIR}/include -Wno-linker-warnings -xcuda -g test.cpp -L${SYCL_ROOT_DIR}/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart -o usm_vec_add_cpp.o

# Compile the same source with -fsycl, targeting the CUDA backend
# (nvptx64-nvidia-cuda) and still linking against cudart/cuda.
clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 -L${CUDA_ROOT_DIR}/lib64 -xcuda test.cpp -lcudart -lcuda
clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 -L${CUDA_ROOT_DIR}/lib64 -lcudart -lcuda -xcuda test.cpp -o sycl.o

# SYCL build of test_sycl.cpp with the CUDA headers on the include path;
# -Xsycl-target-backend passes the trailing --cuda-gpu-arch to the SYCL CUDA backend.
clang++ -fsycl -I${CUDA_ROOT_DIR}/include --cuda-gpu-arch=sm_70 -L${CUDA_ROOT_DIR}/lib64 -xcuda test_sycl.cpp -lcudart -lcuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
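
Note, not part of this change: before running the binaries produced above, it can help to confirm that the DPC++ runtime actually exposes a CUDA backend. A minimal listing program such as the sketch below, built with the same -fsycl toolchain, prints every SYCL platform and device it can see.

// Hypothetical sanity check (not part of the PR): list visible SYCL platforms/devices.
#include <iostream>
#include <sycl/sycl.hpp>

int main() {
  for (const auto &plat : sycl::platform::get_platforms()) {
    std::cout << plat.get_info<sycl::info::platform::name>() << "\n";
    for (const auto &dev : plat.get_devices()) {
      std::cout << "  " << dev.get_info<sycl::info::device::name>() << "\n";
    }
  }
  return 0;
}
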
100 changes: 100 additions & 0 deletions features/feature_case/cuda_interop_sycl/test.cpp
@@ -0,0 +1,100 @@


#include <cmath>
#include <iostream>

#include <cuda_runtime.h>
#include <sycl/sycl.hpp>

// __global__ void vecAdd(double *a, double *b, double *c, int n) {
// int id = blockIdx.x * blockDim.x + threadIdx.x;
// if (id < n) {
// c[id] = a[id] + b[id];
// }
// }

void vecAdd(double *a, double *b, double *c, int n,
const sycl::nd_item<3> &item_ct1) {
// Get our global thread ID
int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);

// Make sure we do not go out of bounds
if (id < n) {
c[id] = a[id] + b[id];
}
}

int main(int argc, char *argv[]) {
using namespace sycl;
int n = 100;
size_t bytes = n * sizeof(double);

device dev{};
context myContext{dev};
queue myQueue{myContext, dev};

// Allocate memory for each vector on host
// auto d_A = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
// auto d_B = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
// auto d_C = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
double *d_A, *d_B, *d_C;
double *h_A, *h_B, *h_C;
cudaMalloc(&d_A, bytes);
cudaMalloc(&d_B, bytes);
cudaMalloc(&d_C, bytes);
  h_A = new double[n];
  h_B = new double[n];
  h_C = new double[n];
// Initialize vectors on host
for (int i = 0; i < n; i++) {
h_A[i] = 0.5;
h_B[i] = 0.5;
}
cudaStream_t stream_cuda;
cudaStreamCreate(&stream_cuda);
cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);
// myQueue.submit([&](handler& h) {
// int blockSize = 1024;
// int gridSize = static_cast<int>(ceil(static_cast<float>(n) /
// blockSize)); vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
// cudaDeviceSynchronize();
// });

// int blockSize = 1024;
// int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
// vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
// cudaDeviceSynchronize();

int blockSize = 1024;

  // Number of thread blocks in grid (round up so all n elements are covered)
  int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));

// Execute the kernel
/*
DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the
limit. To get the device limit, query info::device::max_work_group_size.
Adjust the work-group size if needed.
*/
{
myQueue.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, gridSize) *
sycl::range<3>(1, 1, blockSize),
sycl::range<3>(1, 1, blockSize)),
[=](sycl::nd_item<3> item_ct1) { vecAdd(d_A, d_B, d_C, n, item_ct1); });
}

myQueue.wait();
cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);

double sum = 0;
for (int i = 0; i < n; i++) {
sum += h_C[i];
}
std::cout << "Final result " << sum / n << std::endl;

  cudaStreamDestroy(stream_cuda);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;

return 0;
}
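
For comparison only, and not part of this test: once the remaining cudaMalloc/cudaMemcpy calls are also migrated, the same vector add reduces to plain SYCL with shared USM. A minimal sketch:

// Illustrative pure-SYCL version of the vector add (not part of the PR).
#include <iostream>
#include <sycl/sycl.hpp>

int main() {
  constexpr int n = 100;
  sycl::queue q;

  double *a = sycl::malloc_shared<double>(n, q);
  double *b = sycl::malloc_shared<double>(n, q);
  double *c = sycl::malloc_shared<double>(n, q);
  for (int i = 0; i < n; i++) {
    a[i] = 0.5;
    b[i] = 0.5;
  }

  const int blockSize = 256;
  const int gridSize = (n + blockSize - 1) / blockSize; // round up
  q.parallel_for(sycl::nd_range<1>(gridSize * blockSize, blockSize),
                 [=](sycl::nd_item<1> it) {
                   int id = it.get_global_id(0);
                   if (id < n) c[id] = a[id] + b[id];
                 })
      .wait();

  double sum = 0;
  for (int i = 0; i < n; i++) sum += c[i];
  std::cout << "Final result " << sum / n << std::endl;

  sycl::free(a, q);
  sycl::free(b, q);
  sycl::free(c, q);
  return 0;
}
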
63 changes: 63 additions & 0 deletions features/feature_case/cuda_interop_sycl/test1.cpp
@@ -0,0 +1,63 @@


#include <cmath>
#include <iostream>

#include <sycl/sycl.hpp>
#include <cuda_runtime.h>


__global__ void vecAdd(double *a, double *b, double *c, int n) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
c[id] = a[id] + b[id];
}
}

int main(int argc, char *argv[]) {
using namespace sycl;
int n = 100;
size_t bytes = n * sizeof(double);

device dev{};
context myContext{dev};
queue myQueue{myContext, dev};

// Allocate memory for each vector on host
auto d_A = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
auto d_B = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
auto d_C = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
  // Read the shared allocation from the host first (values are still uninitialized here).
  for (int i = 0; i < n; i++) {
    std::cout << d_C[i] << '\t';
  }
  std::cout << '\n';
// Initialize vectors on host
for (int i = 0; i < n; i++) {
d_A[i] = std::sin(i) * std::sin(i);
d_B[i] = std::cos(i) * std::cos(i);
}
cudaStream_t stream_cuda;
cudaStreamCreate(&stream_cuda);

  // The command-group function runs on the host at submit time, so this block
  // launches the CUDA kernel and synchronizes from inside a SYCL submit call.
  myQueue.submit([&](handler& h) {
    int blockSize = 1024;
    int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
    vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    cudaDeviceSynchronize();
  });

int blockSize = 1024;
int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
cudaDeviceSynchronize();

myQueue.wait();

double sum = 0;
for (int i = 0; i < n; i++) {
sum += d_C[i];
}
std::cout << "Final result " << sum / n << std::endl;

  cudaStreamDestroy(stream_cuda);
  free(d_A, myContext);
  free(d_B, myContext);
  free(d_C, myContext);

return 0;
}
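
A possible follow-up, not included in this PR: test1.cpp creates stream_cuda but never uses it. A standalone CUDA sketch like the one below shows how the same kernel could be launched on that explicit stream and synchronized on it, which would exercise the stream as well.

// Hypothetical extension (not part of the PR): launch vecAdd on an explicit CUDA stream.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void vecAdd(double *a, double *b, double *c, int n) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < n) c[id] = a[id] + b[id];
}

int main() {
  const int n = 100;
  double *a, *b, *c;
  cudaMallocManaged(&a, n * sizeof(double));
  cudaMallocManaged(&b, n * sizeof(double));
  cudaMallocManaged(&c, n * sizeof(double));
  for (int i = 0; i < n; i++) { a[i] = 0.5; b[i] = 0.5; }

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int blockSize = 256;
  int gridSize = (n + blockSize - 1) / blockSize;
  // Launch on the explicit stream and wait on that stream only.
  vecAdd<<<gridSize, blockSize, 0, stream>>>(a, b, c, n);
  cudaStreamSynchronize(stream);

  double sum = 0;
  for (int i = 0; i < n; i++) sum += c[i];
  std::printf("Final result %f\n", sum / n);

  cudaStreamDestroy(stream);
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  return 0;
}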