diff --git a/features/feature_case/cuda_interop_sycl/command.sh b/features/feature_case/cuda_interop_sycl/command.sh
new file mode 100644
index 000000000..1a75f5f7a
--- /dev/null
+++ b/features/feature_case/cuda_interop_sycl/command.sh
@@ -0,0 +1,15 @@
+
+export CUDA_HOME=/usr/local/cuda-11.4
+export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH
+export PATH=/usr/local/cuda-11.4/bin:$PATH
+export CUDA_INCLUDE_PATH=/usr/local/cuda-11.4/include
+export SYCL_ROOT_DIR=/nfs/shm/proj/icl/cmplrarch/deploy_syclos/llorgsyclngefi2linux/20230911_160000/build/linux_qa_release
+export CUDA_ROOT_DIR=/usr/local/cuda-11.4
+
+clang++ -std=c++17 --cuda-gpu-arch=sm_70 -I${SYCL_ROOT_DIR}/include/ -I${SYCL_ROOT_DIR}/include/sycl/ -Wno-linker-warnings -g test.cu -L${SYCL_ROOT_DIR}/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart -o usm_vec_add.o
+clang++ -std=c++17 --cuda-gpu-arch=sm_70 -I${SYCL_ROOT_DIR}/include/ -I${SYCL_ROOT_DIR}/include/sycl/ -I${CUDA_ROOT_DIR}/include -Wno-linker-warnings -xcuda -g test.cpp -L${SYCL_ROOT_DIR}/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart -o usm_vec_add_cpp.o
+
+clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 -L${CUDA_ROOT_DIR}/lib64 -xcuda test.cpp -lcudart -lcuda
+clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 -L${CUDA_ROOT_DIR}/lib64 -lcudart -lcuda -xcuda test.cpp -o sycl.o
+
+clang++ -fsycl -I${CUDA_ROOT_DIR}/include --cuda-gpu-arch=sm_70 -L${CUDA_ROOT_DIR}/lib64 -xcuda test_sycl.cpp -lcudart -lcuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
diff --git a/features/feature_case/cuda_interop_sycl/test.cpp b/features/feature_case/cuda_interop_sycl/test.cpp
new file mode 100644
index 000000000..3d3c58345
--- /dev/null
+++ b/features/feature_case/cuda_interop_sycl/test.cpp
@@ -0,0 +1,100 @@
+
+
+#include <cuda_runtime.h>
+#include <sycl/sycl.hpp>
+#include <cmath>
+#include <iostream>
+
+// __global__ void vecAdd(double *a, double *b, double *c, int n) {
+//   int id = blockIdx.x * blockDim.x + threadIdx.x;
+//   if (id < n) {
+//     c[id] = a[id] + b[id];
+//   }
+// }
+
+void vecAdd(double *a, double *b, double *c, int n,
+            const sycl::nd_item<3> &item_ct1) {
+  // Get our global thread ID
+  int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+           item_ct1.get_local_id(2);
+
+  // Make sure we do not go out of bounds
+  if (id < n) {
+    c[id] = a[id] + b[id];
+  }
+}
+
+int main(int argc, char *argv[]) {
+  using namespace sycl;
+  int n = 100;
+  size_t bytes = n * sizeof(double);
+
+  device dev{};
+  context myContext{dev};
+  queue myQueue{myContext, dev};
+
+  // Allocate device memory for each vector
+  // auto d_A = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
+  // auto d_B = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
+  // auto d_C = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
+  double *d_A, *d_B, *d_C;
+  double *h_A, *h_B, *h_C;
+  cudaMalloc(&d_A, bytes);
+  cudaMalloc(&d_B, bytes);
+  cudaMalloc(&d_C, bytes);
+  // Allocate host memory for each vector
+  h_A = new double[n];
+  h_B = new double[n];
+  h_C = new double[n];
+  // Initialize vectors on host
+  for (int i = 0; i < n; i++) {
+    h_A[i] = 0.5;
+    h_B[i] = 0.5;
+  }
+  cudaStream_t stream_cuda;
+  cudaStreamCreate(&stream_cuda);
+  cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);
+  // myQueue.submit([&](handler &h) {
+  //   int blockSize = 1024;
+  //   int gridSize = static_cast<int>(ceil(static_cast<double>(n) / blockSize));
+  //   vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
+  //   cudaDeviceSynchronize();
+  // });
+
+  // int blockSize = 1024;
+  // int gridSize = static_cast<int>(ceil(static_cast<double>(n) / blockSize));
+  // vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
+  // cudaDeviceSynchronize();
+
+  int blockSize = 1024;
+
+  // Number of thread blocks in grid
+  int gridSize = static_cast<int>(ceil(static_cast<double>(n) / blockSize));
+
+  // Execute the kernel
+  /*
+  DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the
+  limit. To get the device limit, query info::device::max_work_group_size.
+  Adjust the work-group size if needed.
+  */
+  {
+    myQueue.parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, gridSize) *
+                              sycl::range<3>(1, 1, blockSize),
+                          sycl::range<3>(1, 1, blockSize)),
+        [=](sycl::nd_item<3> item_ct1) {
+          vecAdd(d_A, d_B, d_C, n, item_ct1);
+        });
+  }
+
+  myQueue.wait();
+  cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);
+
+  double sum = 0;
+  for (int i = 0; i < n; i++) {
+    sum += h_C[i];
+  }
+  std::cout << "Final result " << sum / n << std::endl;
+
+  cudaStreamDestroy(stream_cuda);
+  cudaFree(d_A);
+  cudaFree(d_B);
+  cudaFree(d_C);
+  delete[] h_A;
+  delete[] h_B;
+  delete[] h_C;
+
+  return 0;
+}
diff --git a/features/feature_case/cuda_interop_sycl/test1.cpp b/features/feature_case/cuda_interop_sycl/test1.cpp
new file mode 100644
index 000000000..98979d26f
--- /dev/null
+++ b/features/feature_case/cuda_interop_sycl/test1.cpp
@@ -0,0 +1,63 @@
+
+
+#include <sycl/sycl.hpp>
+#include <cmath>
+#include <iostream>
+
+
+__global__ void vecAdd(double *a, double *b, double *c, int n) {
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id < n) {
+    c[id] = a[id] + b[id];
+  }
+}
+
+int main(int argc, char *argv[]) {
+  using namespace sycl;
+  int n = 100;
+  size_t bytes = n * sizeof(double);
+
+  device dev{};
+  context myContext{dev};
+  queue myQueue{myContext, dev};
+
+  // Allocate USM shared memory for each vector
+  auto d_A = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
+  auto d_B = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
+  auto d_C = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
+  // Initialize the input vectors
+  for (int i = 0; i < n; i++) {
+    d_A[i] = 0.5;
+    d_B[i] = 0.5;
+    std::cout << d_A[i] << " ";
+  }
+  std::cout << std::endl;
+
+  myQueue.submit([&](handler &h) {
+    int blockSize = 1024;
+    int gridSize = static_cast<int>(ceil(static_cast<double>(n) / blockSize));
+    vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
+    cudaDeviceSynchronize();
+  });
+
+  int blockSize = 1024;
+  int gridSize = static_cast<int>(ceil(static_cast<double>(n) / blockSize));
+  vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
+  cudaDeviceSynchronize();
+
+  myQueue.wait();
+
+  double sum = 0;
+  for (int i = 0; i < n; i++) {
+    sum += d_C[i];
+  }
+  std::cout << "Final result " << sum / n << std::endl;
+
+  free(d_A, myContext);
+  free(d_B, myContext);
+  free(d_C, myContext);
+
+  return 0;
+}
\ No newline at end of file
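
The DPCT1049 note left in test.cpp says the hard-coded work-group size of 1024
may exceed the device limit and suggests querying
info::device::max_work_group_size. A minimal standalone sketch of that query
(not part of the diff; the file layout and variable names here are
illustrative, and clamping is just one way to act on the warning):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <iostream>

int main() {
  sycl::queue q;
  // Query the device limit the DPCT1049 note refers to.
  size_t maxWg =
      q.get_device().get_info<sycl::info::device::max_work_group_size>();
  // Clamp the test's hard-coded block size of 1024 to that limit.
  int blockSize = static_cast<int>(std::min<size_t>(1024, maxWg));
  std::cout << "max_work_group_size = " << maxWg
            << ", using blockSize = " << blockSize << std::endl;
  return 0;
}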