diff --git a/CMakeLists.txt b/CMakeLists.txt index c98cab48..87e263f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,18 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG)) message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`") endif () +option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON) + # setup some defaults flags for everything set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer) -set(DEFAULT_RELEASE_FLAGS -O3 -march=native) +set(DEFAULT_RELEASE_FLAGS -O3) +if (BUILD_NATIVE) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) + set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -mcpu=native) + else() + set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native) + endif() +endif() macro(hint_flag FLAG DESCRIPTION) if (NOT DEFINED ${FLAG}) @@ -146,17 +155,15 @@ endif () include(cmake/register_models.cmake) # register out models +register_model(serial SERIAL SerialStream.cpp) register_model(omp OMP OMPStream.cpp) register_model(ocl OCL OCLStream.cpp) -register_model(std-data STD_DATA STDDataStream.cpp) -register_model(std-indices STD_INDICES STDIndicesStream.cpp) -register_model(std-ranges STD_RANGES STDRangesStream.cpp) +register_model(std STD STDStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) -register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) -register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) diff --git a/src/Stream.h b/src/Stream.h index 45c144c3..f233f54a 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,14 +7,13 @@ #pragma once +#include +#include #include #include +#include "benchmark.h" -// Array values -#define startA (0.1) -#define startB (0.2) -#define startC (0.0) -#define startScalar (0.4) +using std::intptr_t; template class Stream @@ -31,9 +30,8 @@ class Stream virtual void nstream() = 0; virtual T dot() = 0; - // Copy memory between host and device - virtual void init_arrays(T initA, T initB, T initC) = 0; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; + // Set pointers to read from arrays + virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0; }; // Implementation specific device functions diff --git a/src/StreamModels.h b/src/StreamModels.h index 556beb4d..820c08a4 100644 --- a/src/StreamModels.h +++ b/src/StreamModels.h @@ -3,12 +3,8 @@ #if defined(CUDA) #include "CUDAStream.h" -#elif defined(STD_DATA) -#include "STDDataStream.h" -#elif defined(STD_INDICES) -#include "STDIndicesStream.h" -#elif defined(STD_RANGES) -#include "STDRangesStream.hpp" +#elif defined(STD) +#include "STDStream.h" #elif defined(TBB) #include "TBBStream.hpp" #elif defined(THRUST) @@ -31,71 +27,69 @@ #include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" +#elif defined(SERIAL) +#include "SerialStream.h" #elif defined(FUTHARK) #include "FutharkStream.h" #endif -template -std::unique_ptr> make_stream(intptr_t array_size, int deviceIndex) { +template +std::unique_ptr> make_stream(Args... 
args) { #if defined(CUDA) // Use the CUDA implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(HIP) // Use the HIP implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(HC) // Use the HC implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(OCL) // Use the OpenCL implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(USE_RAJA) // Use the RAJA implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(KOKKOS) // Use the Kokkos implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); -#elif defined(STD_DATA) +#elif defined(STD) // Use the C++ STD data-oriented implementation - return std::make_unique>(array_size, deviceIndex); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - return std::make_unique>(array_size, deviceIndex); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(TBB) // Use the C++20 implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(THRUST) // Use the Thrust implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(ACC) // Use the OpenACC implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(SYCL) || defined(SYCL2020) // Use the SYCL implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(OMP) // Use the OpenMP implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); + +#elif defined(SERIAL) + // Use the Serial implementation + return std::make_unique>(args...); #elif defined(FUTHARK) // Use the Futhark implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #else diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp index a346a39c..034336a4 100644 --- a/src/acc/ACCStream.cpp +++ b/src/acc/ACCStream.cpp @@ -8,11 +8,12 @@ #include "ACCStream.h" template -ACCStream::ACCStream(const intptr_t ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} +ACCStream::ACCStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + : array_size{array_size} { acc_device_t device_type = acc_get_device_type(); - acc_set_device_num(device, device_type); + acc_set_device_num(device_id, device_type); // Set up data region on device this->a = new T[array_size]; @@ -25,6 +26,8 @@ ACCStream::ACCStream(const intptr_t ARRAY_SIZE, int device) #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) {} + + init_arrays(initA, initB, initC); } template @@ -62,7 +65,7 @@ void ACCStream::init_arrays(T initA, T initB, T initC) } template -void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void ACCStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { T *a = this->a; T *b = this->b; @@ -70,12 +73,9 @@ void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma acc update host(a[0:array_size], b[0:array_size], 
c[0:array_size]) {} - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/acc/ACCStream.h b/src/acc/ACCStream.h index 1b053cb4..8345b785 100644 --- a/src/acc/ACCStream.h +++ b/src/acc/ACCStream.h @@ -19,32 +19,25 @@ template class ACCStream : public Stream { - struct A{ - T *a; - T *b; - T *c; - }; - - protected: // Size of arrays intptr_t array_size; - A aa; // Device side pointers - T *a; - T *b; - T *c; + T* restrict a; + T* restrict b; + T* restrict c; public: - ACCStream(const intptr_t, int); + ACCStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~ACCStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/benchmark.h b/src/benchmark.h new file mode 100644 index 00000000..95d675f7 --- /dev/null +++ b/src/benchmark.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include +#include +#include + +// Array values +#define startA (0.1) +#define startB (0.2) +#define startC (0.0) +#define startScalar (0.4) + +// Benchmark Identifier: identifies individual & groups of benchmarks: +// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. +// - All: all kernels. +// - Individual kernels only. +enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; + +struct Benchmark { + BenchId id; + char const* label; + // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: + // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur + size_t weight; + // Is it one of: Copy, Mul, Add, Triad, Dot? 
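// Illustrative note (not part of the original patch): a worked example of the
// weight field above, assuming double precision and the default array_size of
// 2^25 elements. Triad (weight 3) moves 3 * sizeof(double) * 33554432 B ≈ 0.81 GB
// per iteration, so an iteration taking 4 ms reports roughly 0.81 GB / 0.004 s ≈ 200 GB/s.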
+ bool classic = false; +}; + +// Benchmarks in the order in which - if present - should be run for validation purposes: +constexpr size_t num_benchmarks = 6; +constexpr std::array bench = { + Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } +}; + +// Which buffers are needed by each benchmark +inline bool needs_buffer(BenchId id, char n) { + auto in = [n](std::initializer_list values) { + return std::find(values.begin(), values.end(), n) != values.end(); + }; + switch(id) { + case BenchId::All: return in({'a','b','c'}); + case BenchId::Classic: return in({'a','b','c'}); + case BenchId::Copy: return in({'a','c'}); + case BenchId::Mul: return in({'b','c'}); + case BenchId::Add: return in({'a','b','c'}); + case BenchId::Triad: return in({'a','b','c'}); + case BenchId::Dot: return in({'a','b'}); + case BenchId::Nstream: return in({'a','b','c'}); + default: + std::cerr << "Unknown benchmark" << std::endl; + abort(); + } +} + +// Returns true if the benchmark needs to be run: +inline bool run_benchmark(BenchId selection, Benchmark const& b) { + if (selection == BenchId::All) return true; + if (selection == BenchId::Classic && b.classic) return true; + return selection == b.id; +} diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 249bab4f..d10e8eef 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -138,6 +138,7 @@ build_gcc() { local name="gcc_build" local cxx="-DCMAKE_CXX_COMPILER=${GCC_CXX:?}" + run_build $name "${GCC_CXX:?}" serial "$cxx" run_build $name "${GCC_CXX:?}" omp "$cxx" if [ "$MODEL" = "all" ] || [ "$MODEL" = "OMP" ]; then # sanity check that it at least runs @@ -151,9 +152,10 @@ build_gcc() { *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + #run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -219,6 +221,7 @@ build_gcc() { build_clang() { local name="clang_build" local cxx="-DCMAKE_CXX_COMPILER=${CLANG_CXX:?}" + run_build $name "${CLANG_CXX:?}" serial "$cxx" run_build $name "${CLANG_CXX:?}" omp "$cxx" if [ "${CLANG_OMP_OFFLOAD_AMD:-false}" != "false" ]; then @@ -249,9 +252,11 @@ build_clang() { OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON 
-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac - run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported + run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + # run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + # TODO: clang is too old + #run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -268,14 +273,17 @@ build_clang() { build_nvhpc() { local name="nvhpc_build" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" - run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + # run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA23" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=INDICES" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } build_aocc() { + run_build aocc_build "${AOCC_CXX:?}" serial "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}" run_build aocc_build "${AOCC_CXX:?}" omp "-DCMAKE_CXX_COMPILER=${AOCC_CXX:?}" } @@ -309,6 +317,7 @@ build_icpc() { set -u local name="intel_build" local cxx="-DCMAKE_CXX_COMPILER=${ICPC_CXX:?}" + run_build $name "${ICPC_CXX:?}" serial "$cxx" run_build $name "${ICPC_CXX:?}" omp "$cxx" run_build $name "${ICPC_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" if check_cmake_ver "3.20.0"; then diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index 24d05794..4f5599a7 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -5,22 +5,80 @@ // source code #include "CUDAStream.h" +#include -[[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) { - std::fprintf(stderr, "Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr); +#if !defined(UNROLL_FACTOR) +#define UNROLL_FACTOR 4 +#endif + +[[noreturn]] inline void cuda_error(char const* file, int line, char const* expr, cudaError_t e) { + std::fprintf(stderr, "CUDA Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr); + exit(e); +} + +[[noreturn]] inline void nvml_error(char const* file, int line, char const* expr, nvmlReturn_t e) { + std::fprintf(stderr, "NVML Error at %s:%d: %s (%d)\n %s\n", file, line, nvmlErrorString(e), e, expr); exit(e); } // The do while is there to make sure you remember to put a semi-colon after calling CU -#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false) +#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) cuda_error(__FILE__, __LINE__, #EXPR, __e); } while(false) +#define NVML(EXPR) do { auto __e = (EXPR); if (__e != NVML_SUCCESS) 
nvml_error(__FILE__, __LINE__, #EXPR, __e); } while(false) // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; } cudaStream_t stream; +template +T* alloc_device(const intptr_t array_size) { + size_t array_bytes = sizeof(T) * array_size; + T* p = nullptr; +#if defined(MANAGED) + CU(cudaMallocManaged(&p, array_bytes)); +#elif defined(PAGEFAULT) + p = (T*)malloc(array_bytes); +#else + CU(cudaMalloc(&p, array_bytes)); +#endif + if (p == nullptr) throw std::runtime_error("Failed to allocate device array"); + return p; +} + +template +T* alloc_host(const intptr_t array_size) { + size_t array_bytes = sizeof(T) * array_size; + T* p = nullptr; +#if defined(PAGEFAULT) + p = (T*)malloc(array_bytes); +#else + CU(cudaHostAlloc(&p, array_bytes, cudaHostAllocDefault)); +#endif + if (p == nullptr) throw std::runtime_error("Failed to allocate host array"); + return p; +} + +template +void free_device(T* p) { +#if defined(PAGEFAULT) + free(p); +#else + CU(cudaFree(p)); +#endif +} + +template +void free_host(T* p) { +#if defined(PAGEFAULT) + free(p); +#else + CU(cudaFreeHost(p)); +#endif +} + template -CUDAStream::CUDAStream(const intptr_t array_size, const int device_index) +CUDAStream::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) : array_size(array_size) { // Set device @@ -33,201 +91,197 @@ CUDAStream::CUDAStream(const intptr_t array_size, const int device_index) CU(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // Print out device information - std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; -#if defined(MANAGED) - std::cout << "Memory: MANAGED" << std::endl; -#elif defined(PAGEFAULT) - std::cout << "Memory: PAGEFAULT" << std::endl; -#else - std::cout << "Memory: DEFAULT" << std::endl; -#endif - - // Query device for sensible dot kernel block count - cudaDeviceProp props; - CU(cudaGetDeviceProperties(&props, device_index)); - dot_num_blocks = props.multiProcessorCount * 4; + std::cout << "CUDA Driver: " << getDeviceDriver(device_index) << std::endl; + NVML(nvmlInit()); + cudaDeviceProp dprop; + CU(cudaGetDeviceProperties(&dprop, device_index)); + unsigned int memclock; + char mybus[16]; + sprintf(&mybus[0], "%04x:%02x:%02x.0", dprop.pciDomainID, dprop.pciBusID, dprop.pciDeviceID); + nvmlDevice_t nvmldev; + NVML(nvmlDeviceGetHandleByPciBusId(mybus, &nvmldev)); + NVML(nvmlDeviceGetClockInfo(nvmldev, NVML_CLOCK_MEM, &memclock)); + std::cout << "CUDA Device " << device_index << ": \"" + << getDeviceName(device_index) + << "\" " << dprop.multiProcessorCount << " SMs(" << dprop.major << "," << dprop.minor << ") " + << "Memory: " << memclock << " MHz x " << dprop.memoryBusWidth << "-bit = " + << 2.0*memclock*(dprop.memoryBusWidth/8)/1000.0 << " GB/s PEAK, ECC is " + << (dprop.ECCEnabled ? 
"ON" : "OFF") + << std::endl; + + // Print Memory allocation API used for buffers + std::cout << "Memory Allocation: "; + #if defined(MANAGED) + std::cout << "MANAGED"; + #elif defined(PAGEFAULT) + std::cout << "PAGEFAULT"; + #else + std::cout << "DEFAULT"; + #endif + std::cout << std::endl; + + std::cout << "Parallel for kernel config: thread blocks of size " << TBSIZE << std::endl; + + // Set sensible dot kernel block count + dot_num_blocks = dprop.multiProcessorCount * 4; // Size of partial sums for dot kernels size_t sums_bytes = sizeof(T) * dot_num_blocks; size_t array_bytes = sizeof(T) * array_size; size_t total_bytes = array_bytes * size_t(3) + sums_bytes; - std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; + std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl; // Check buffers fit on the device - if (props.totalGlobalMem < total_bytes) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + if (dprop.totalGlobalMem < total_bytes) { + std::cerr << "Requested array size of " << total_bytes * 1e-9 + << " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl; + throw std::runtime_error("Device does not have enough memory for all buffers"); + } - // Create device buffers -#if defined(MANAGED) - CU(cudaMallocManaged(&d_a, array_bytes)); - CU(cudaMallocManaged(&d_b, array_bytes)); - CU(cudaMallocManaged(&d_c, array_bytes)); - CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault)); -#elif defined(PAGEFAULT) - d_a = (T*)malloc(array_bytes); - d_b = (T*)malloc(array_bytes); - d_c = (T*)malloc(array_bytes); - sums = (T*)malloc(sums_bytes); -#else - CU(cudaMalloc(&d_a, array_bytes)); - CU(cudaMalloc(&d_b, array_bytes)); - CU(cudaMalloc(&d_c, array_bytes)); - CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault)); -#endif + // Allocate buffers: + d_a = alloc_device(array_size); + d_b = alloc_device(array_size); + d_c = alloc_device(array_size); + sums = alloc_host(dot_num_blocks); + + // Initialize buffers: + init_arrays(initA, initB, initC); } template CUDAStream::~CUDAStream() { CU(cudaStreamDestroy(stream)); + free_device(d_a); + free_device(d_b); + free_device(d_c); + free_host(sums); +} -#if defined(PAGEFAULT) - free(d_a); - free(d_b); - free(d_c); - free(sums); +template +__global__ void for_each_kernel(size_t array_size, size_t start, F f) { + constexpr int unroll_factor = UNROLL_FACTOR; +#if defined(GRID_STRIDE) + // Grid-stride loop + size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; + #pragma unroll(unroll_factor) + for (; i < array_size; i += (size_t)gridDim.x * blockDim.x) { + f(i); + } +#elif defined(BLOCK_STRIDE) + // Block-stride loop + size_t i = start * blockIdx.x + threadIdx.x; + const size_t e = min(array_size, start * (blockIdx.x + size_t(1)) + threadIdx.x); + #pragma unroll(unroll_factor) + for (; i < e; i += blockDim.x) { + f(i); + } #else - CU(cudaFree(d_a)); - CU(cudaFree(d_b)); - CU(cudaFree(d_c)); - CU(cudaFreeHost(sums)); + #error Must pick grid-stride or block-stride loop #endif } -template -__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size) -{ - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - a[i] = initA; - b[i] = initB; - c[i] = initC; +template +void for_each(size_t array_size, F f) { + static int threads_per_block = 0; + if 
(threads_per_block == 0) { + // Pick suitable thread block size for F: + int min_blocks_per_grid; + auto dyn_smem = [] __host__ __device__ (int){ return 0; }; + CU(cudaOccupancyMaxPotentialBlockSizeVariableSMem + (&min_blocks_per_grid, &threads_per_block, for_each_kernel, dyn_smem, 0)); + // Clamp to TBSIZE + threads_per_block = std::min(TBSIZE, threads_per_block); } + size_t blocks = ceil_div(array_size / UNROLL_FACTOR, threads_per_block); + size_t start = ceil_div(array_size, (size_t)blocks); + for_each_kernel<<>>(array_size, start, f); + CU(cudaPeekAtLastError()); + CU(cudaStreamSynchronize(stream)); } template void CUDAStream::init_arrays(T initA, T initB, T initC) { - size_t blocks = ceil_div(array_size, TBSIZE); - init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); + for_each(array_size, [=,a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + }); } template -void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void CUDAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - // Copy device memory to host -#if defined(PAGEFAULT) || defined(MANAGED) CU(cudaStreamSynchronize(stream)); - for (intptr_t i = 0; i < array_size; ++i) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } +#if defined(PAGEFAULT) || defined(MANAGED) + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost)); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost)); #endif } -template -__global__ void copy_kernel(const T * a, T * c, size_t array_size) -{ - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - c[i] = a[i]; - } -} - template void CUDAStream::copy() { - size_t blocks = ceil_div(array_size, TBSIZE); - copy_kernel<<>>(d_a, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void mul_kernel(T * b, const T * c, size_t array_size) -{ - const T scalar = startScalar; - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - b[i] = scalar * c[i]; - } + for_each(array_size, [a=d_a,c=d_c] __device__ (size_t i) { + c[i] = a[i]; + }); } template void CUDAStream::mul() { - size_t blocks = ceil_div(array_size, TBSIZE); - mul_kernel<<>>(d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size) -{ - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - c[i] = a[i] + b[i]; - } + for_each(array_size, [b=d_b,c=d_c] __device__ (size_t i) { + b[i] = startScalar * c[i]; + }); } template void CUDAStream::add() { - size_t blocks = ceil_div(array_size, TBSIZE); 
- add_kernel<<>>(d_a, d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size) -{ - const T scalar = startScalar; - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - a[i] = b[i] + scalar * c[i]; - } + for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + c[i] = a[i] + b[i]; + }); } template void CUDAStream::triad() { - size_t blocks = ceil_div(array_size, TBSIZE); - triad_kernel<<>>(d_a, d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); -} - -template -__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size) -{ - const T scalar = startScalar; - for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { - a[i] += b[i] + scalar * c[i]; - } + for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + a[i] = b[i] + startScalar * c[i]; + }); } template void CUDAStream::nstream() { - size_t blocks = ceil_div(array_size, TBSIZE); - nstream_kernel<<>>(d_a, d_b, d_c, array_size); - CU(cudaPeekAtLastError()); - CU(cudaStreamSynchronize(stream)); + for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) { + a[i] += b[i] + startScalar * c[i]; + }); } template __global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size) { - __shared__ T smem[TBSIZE]; + __shared__ T smem[TBSIZE_DOT]; T tmp = T(0.); const size_t tidx = threadIdx.x; - for (size_t i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) { + size_t i = tidx + (size_t)blockDim.x * blockIdx.x; + for (; i < array_size; i += (size_t)gridDim.x * blockDim.x) { tmp += a[i] * b[i]; } smem[tidx] = tmp; @@ -244,7 +298,7 @@ __global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size) template T CUDAStream::dot() { - dot_kernel<<>>(d_a, d_b, sums, array_size); + dot_kernel<<>>(d_a, d_b, sums, array_size); CU(cudaPeekAtLastError()); CU(cudaStreamSynchronize(stream)); diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 4b4a1a3a..50e099dc 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -15,7 +15,8 @@ #define IMPLEMENTATION_STRING "CUDA" -#define TBSIZE 1024 +#define TBSIZE 256 +#define TBSIZE_DOT 1024 template class CUDAStream : public Stream @@ -25,27 +26,31 @@ class CUDAStream : public Stream intptr_t array_size; // Host array for partial sums for dot kernel - T *sums; + T* sums; // Device side pointers to arrays - T *d_a; - T *d_b; - T *d_c; + T* d_a; + T* d_b; + T* d_c; + + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; // Number of blocks for dot kernel intptr_t dot_num_blocks; public: - CUDAStream(const intptr_t, const int); + CUDAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~CUDAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T 
const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/cuda/model.cmake b/src/cuda/model.cmake index 7c1b0d6e..8314e1f2 100644 --- a/src/cuda/model.cmake +++ b/src/cuda/model.cmake @@ -9,6 +9,11 @@ register_flag_optional(MEM "Device memory mode: PAGEFAULT - shared memory, only host pointers allocated." "DEFAULT") +register_flag_optional(STRIDE "Kernel stride: GRID_STRIDE or BLOCK_STRIDE" "GRID_STRIDE") + +register_flag_optional(UNROLL_FACTOR "Kernel unroll factor:" "4") + + register_flag_required(CMAKE_CUDA_COMPILER "Path to the CUDA nvcc compiler") @@ -30,11 +35,17 @@ macro(setup) enable_language(CUDA) register_definitions(${MEM}) + register_definitions(${STRIDE}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + # add --extended-lambda for device-lambdas + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" + "--extended-lambda" "-DUNROLL_FACTOR=${UNROLL_FACTOR}" ${CUDA_EXTRA_FLAGS}) string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + # Link against the NVIDIA Management Library for device information + register_link_library("nvidia-ml") + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 226693bd..9b8a7acc 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -29,12 +29,15 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } +#define WORKAROUND + #else // auto exe_policy = dpl::execution::seq; // auto exe_policy = dpl::execution::par; static constexpr auto exe_policy = dpl::execution::par_unseq; #define USE_STD_PTR_ALLOC_DEALLOC +#define WORKAROUND #endif diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp index ebd3633b..392ff898 100644 --- a/src/futhark/FutharkStream.cpp +++ b/src/futhark/FutharkStream.cpp @@ -11,9 +11,10 @@ #include "FutharkStream.h" template -FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) +FutharkStream::FutharkStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - this->array_size = ARRAY_SIZE; this->cfg = futhark_context_config_new(); this->device = "#" + std::to_string(device); #if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl) @@ -23,6 +24,7 @@ FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) this->a = NULL; this->b = NULL; this->c = NULL; + init_arrays(initA, initB, initC); } template <> @@ -98,19 +100,31 @@ void FutharkStream::init_arrays(double initA, double initB, double initC } template <> -void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(float const*& a_, float const*& b_, float const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> -void 
FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(double const*& a_, double const*& b_, double const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h index 6290e79a..eabdabbe 100644 --- a/src/futhark/FutharkStream.h +++ b/src/futhark/FutharkStream.h @@ -44,17 +44,21 @@ class FutharkStream : public Stream void* b; void* c; + // Host side arrays for verification + std::vector h_a, h_b, h_c; + public: - FutharkStream(const int, int); + FutharkStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~FutharkStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake index edd21fa6..d7b08795 100644 --- a/src/futhark/model.cmake +++ b/src/futhark/model.cmake @@ -44,6 +44,7 @@ macro(setup) elseif (${FUTHARK_BACKEND} STREQUAL "cuda") find_package(CUDA REQUIRED) register_link_library("nvrtc" "cuda" "cudart") + set(CMAKE_C_COMPILER "nvcc") else () message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}") endif() diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ec02425a..e3878afd 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -25,7 +25,9 @@ void check_error(void) __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; } template -HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) +HIPStream::HIPStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { // Set device int count; @@ -47,13 +49,12 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) std::cout << "Memory: DEFAULT" << std::endl; #endif - array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; + array_bytes *= array_size; + size_t total_bytes = array_bytes * std::size_t{3}; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. 
However, it requires @@ -65,7 +66,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers @@ -88,6 +89,8 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) hipMalloc(&d_c, array_bytes); check_error(); #endif + + init_arrays(initA, initB, initC); } @@ -127,24 +130,28 @@ void HIPStream::init_arrays(T initA, T initB, T initC) } template -void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void HIPStream::get_arrays(T const*& a, T const*& b, T const*& c) { - - // Copy device memory to host + hipDeviceSynchronize(); #if defined(PAGEFAULT) || defined(MANAGED) - hipDeviceSynchronize(); - for (intptr_t i = 0; i < array_size; i++) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + hipMemcpy(h_a.data(), d_a, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_b.data(), d_b, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_c.data(), d_c, nbytes, hipMemcpyDeviceToHost); check_error(); #endif } diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..a1c45802 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -48,20 +48,21 @@ class HIPStream : public Stream T *d_b; T *d_c; + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; public: - - HIPStream(const intptr_t, const int); + HIPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index e49d5bcc..fcbdb7a7 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -8,21 +8,23 @@ #include "KokkosStream.hpp" template -KokkosStream::KokkosStream( - const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE) +KokkosStream::KokkosStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { Kokkos::initialize(Kokkos::InitializationSettings().set_device_id(device_index)); - d_a = new 
Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); - d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); - d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), array_size); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), array_size); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), array_size); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); *hm_a = create_mirror_view(*d_a); *hm_b = create_mirror_view(*d_b); *hm_c = create_mirror_view(*d_c); + + init_arrays(initA, initB, initC); } template @@ -47,18 +49,14 @@ void KokkosStream::init_arrays(T initA, T initB, T initC) } template -void KokkosStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void KokkosStream::get_arrays(T const*& a, T const*& b, T const*& c) { deep_copy(*hm_a, *d_a); deep_copy(*hm_b, *d_b); deep_copy(*hm_c, *d_c); - for(intptr_t ii = 0; ii < array_size; ++ii) - { - a[ii] = (*hm_a)(ii); - b[ii] = (*hm_b)(ii); - c[ii] = (*hm_c)(ii); - } + a = hm_a->data(); + b = hm_b->data(); + c = hm_c->data(); } template diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 8e40119c..bc3ac3ee 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -22,27 +22,27 @@ class KokkosStream : public Stream intptr_t array_size; // Device side pointers to arrays - typename Kokkos::View* d_a; - typename Kokkos::View* d_b; - typename Kokkos::View* d_c; - typename Kokkos::View::HostMirror* hm_a; - typename Kokkos::View::HostMirror* hm_b; - typename Kokkos::View::HostMirror* hm_c; + typename Kokkos::View* d_a; + typename Kokkos::View* d_b; + typename Kokkos::View* d_c; + typename Kokkos::View::HostMirror* hm_a; + typename Kokkos::View::HostMirror* hm_b; + typename Kokkos::View::HostMirror* hm_c; public: - KokkosStream(const intptr_t, const int); + KokkosStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~KokkosStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 7457eebd..2223c753 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,5 +1,5 @@ register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and RAJA. + "Any CXX compiler that is supported by CMake detection and Kokkos. 
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") @@ -21,7 +21,7 @@ macro(setup) set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - + message("KOKKOS_IN_PACKAGE=${KOKKOS_IN_PACKAGE}") if (EXISTS "${KOKKOS_IN_TREE}") message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) diff --git a/src/main.cpp b/src/main.cpp index ee091259..af9fac6c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,11 +19,12 @@ #define VERSION_STRING "5.0" #include "Stream.h" + #include "StreamModels.h" #include "Unit.h" // Default size of 2^25 -intptr_t ARRAY_SIZE = 33554432; +intptr_t array_size = 33554432; size_t num_times = 100; size_t deviceIndex = 0; bool use_float = false; @@ -33,42 +34,11 @@ Unit unit{Unit::Kind::MegaByte}; bool silence_errors = false; std::string csv_separator = ","; -// Benchmark Identifier: identifies individual & groups of benchmarks: -// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. -// - All: all kernels. -// - Individual kernels only. -enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; - -struct Benchmark { - BenchId id; - char const* label; - // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: - // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur - size_t weight; - // Is it one of: Copy, Mul, Add, Triad, Dot? - bool classic = false; -}; - -// Benchmarks in the order in which - if present - should be run for validation purposes: -constexpr size_t num_benchmarks = 6; -std::array bench = { - Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, - Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, - Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } -}; - // Selected benchmarks to run: default is all 5 classic benchmarks. BenchId selection = BenchId::Classic; // Returns true if the benchmark needs to be run: -bool run_benchmark(Benchmark const& b) { - if (selection == BenchId::All) return true; - if (selection == BenchId::Classic && b.classic) return true; - return selection == b.id; -} +bool run_benchmark(Benchmark const& b) { return run_benchmark(selection, b); } // Benchmark run order // - Classic: runs each bench once in the order above, and repeats n times. @@ -174,8 +144,7 @@ std::vector> run_all(std::unique_ptr>& stream, T& } template -void check_solution(const size_t ntimes, std::vector& a, std::vector& b, std::vector& c, - T& sum); +void check_solution(const size_t ntimes, T const* a, T const* b, T const* c, T sum); // Generic run routine // Runs the kernel(s) and prints output. 
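// Illustrative note (not part of the original patch): every Stream constructor now
// receives the selected BenchId, and benchmark.h adds a needs_buffer() helper, so a
// backend could allocate only the arrays the chosen benchmark touches. No
// implementation in this excerpt does so yet; a hypothetical sketch for a CUDA-style
// backend constructor would be:
//
//   if (needs_buffer(bs, 'a')) d_a = alloc_device<T>(array_size);
//   if (needs_buffer(bs, 'b')) d_b = alloc_device<T>(array_size);
//   if (needs_buffer(bs, 'c')) d_c = alloc_device<T>(array_size);
//
// Copy (weight 2), for instance, only reads a and writes c, so b could be skipped.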
@@ -186,7 +155,7 @@ void run() // Formatting utilities: auto fmt_bw = [&](size_t weight, double dt) { - return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt); + return unit.fmt((weight * sizeof(T) * array_size)/dt); }; auto fmt_csv_header = [] { std::cout @@ -244,47 +213,44 @@ void run() } std::cout << " "; } - std::cout << num_times << " times" << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; + std::cout << num_times << " times in "; + switch (order) { + case BenchOrder::Classic: std::cout << " Classic"; break; + case BenchOrder::Isolated: std::cout << " Isolated"; break; + default: std::cerr << "Error: Unknown order" << std::endl; abort(); + }; + std::cout << " order " << std::endl; + std::cout << "Number of elements: " << array_size << std::endl; std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl; - size_t nbytes = ARRAY_SIZE * sizeof(T); + size_t nbytes = array_size * sizeof(T); std::cout << std::setprecision(1) << std::fixed << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl; std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl; std::cout.precision(ss); } - std::unique_ptr> stream = make_stream(ARRAY_SIZE, deviceIndex); - auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); }); + std::unique_ptr> stream + = make_stream(selection, array_size, deviceIndex, startA, startB, startC); // Result of the Dot kernel, if used. T sum{}; std::vector> timings = run_all(stream, sum); // Create & read host vectors: - std::vector a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE); - auto readElapsedS = time([&] { stream->read_arrays(a, b, c); }); + T const* a; + T const* b; + T const* c; + stream->get_arrays(a, b, c); check_solution(num_times, a, b, c, sum); - auto initBWps = fmt_bw(3, initElapsedS); - auto readBWps = fmt_bw(3, readElapsedS); if (output_as_csv) { fmt_csv_header(); - fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS); - fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS); } else { - std::cout << "Init: " - << std::setw(7) - << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl; - std::cout << "Read: " - << std::setw(7) - << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl; - std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12) << (std::string(unit.str()) + "/s") @@ -307,15 +273,13 @@ void run() / (double)(num_times - 1); // Display results - fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T), + fmt_result(bench[i].label, num_times, array_size, sizeof(T), fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average); } } template -void check_solution(const size_t num_times, - std::vector& a, std::vector& b, std::vector& c, T& sum) -{ +void check_solution(const size_t num_times, T const* a, T const* b, T const* c, T sum) { // Generate correct solution T goldA = startA; T goldB = startB; @@ -332,7 +296,7 @@ void check_solution(const size_t num_times, case BenchId::Add: goldC = goldA + goldB; break; case BenchId::Triad: goldA = goldB + scalar * goldC; break; case BenchId::Nstream: goldA += goldB + scalar * goldC; break; - case BenchId::Dot: goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly + case BenchId::Dot: goldS = goldA * goldB * T(array_size); break; // This calculates the answer exactly default: 
std::cerr << "Unimplemented Check: " << bench[b].label << std::endl; abort(); @@ -364,39 +328,42 @@ void check_solution(const size_t num_times, abort(); } - // Error relative tolerance check + // Error relative tolerance check - a higher tolerance is used for reductions. size_t failed = 0; - T epsi = std::numeric_limits::epsilon() * T(100000.0); - auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) { - if (e > epsi || std::isnan(e) || std::isnan(is)) { + T max_rel = std::numeric_limits::epsilon() * T(100.0); + T max_rel_dot = std::numeric_limits::epsilon() * T(10000000.0); + auto check = [&](const char* name, T is, T should, T mrel, size_t i = size_t(-1)) { + // Relative difference: + T diff = std::abs(is - should); + T abs_is = std::abs(is); + T abs_sh = std::abs(should); + T largest = std::max(abs_is, abs_sh); + T same = diff <= largest * mrel; + if (!same || std::isnan(is)) { ++failed; if (failed > 10) return; std::cerr << "FAILED validation of " << name; if (i != size_t(-1)) std::cerr << "[" << i << "]"; - std::cerr << ": " << is << " != " << should - << ", relative error=" << e << " > " << epsi << std::endl; + std::cerr << ": " << is << " (is) != " << should + << " (should)" << ", diff=" << diff << " > " + << largest * mrel << " (largest=" << largest + << ", max_rel=" << mrel << ")" << std::endl; } }; // Sum - T eS = std::fabs(sum - goldS) / std::fabs(goldS); for (size_t i = 0; i < num_benchmarks; ++i) { if (bench[i].id != BenchId::Dot) continue; if (run_benchmark(bench[i])) - check("sum", sum, goldS, eS); + check("sum", sum, goldS, max_rel_dot); break; } // Calculate the L^infty-norm relative error - for (size_t i = 0; i < a.size(); ++i) { - T vA = a[i], vB = b[i], vC = c[i]; - T eA = std::fabs(vA - goldA) / std::fabs(goldA); - T eB = std::fabs(vB - goldB) / std::fabs(goldB); - T eC = std::fabs(vC - goldC) / std::fabs(goldC); - - check("a", a[i], goldA, eA, i); - check("b", b[i], goldB, eB, i); - check("c", c[i], goldC, eC, i); + for (size_t i = 0; i < array_size; ++i) { + check("a", a[i], goldA, max_rel, i); + check("b", b[i], goldB, max_rel, i); + check("c", c[i], goldC, max_rel, i); } if (failed > 0 && !silence_errors) @@ -442,13 +409,11 @@ void parseArguments(int argc, char *argv[]) else if (!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i])) { - intptr_t array_size; if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0) { std::cerr << "Invalid array size." << std::endl; std::exit(EXIT_FAILURE); } - ARRAY_SIZE = array_size; } else if (!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i])) @@ -510,12 +475,12 @@ void parseArguments(int argc, char *argv[]) { if (++i >= argc) { - std::cerr << "Expected benchmark order after --order. Options: \"classic\" (default), \"isolated\"." + std::cerr << "Expected benchmark order after --order. Options: \"Classic\" (default), \"Isolated\"." << std::endl; exit(EXIT_FAILURE); } auto key = std::string(argv[i]); - if (key == "isolated") + if (key == "Isolated") { order = BenchOrder::Isolated; } @@ -566,7 +531,7 @@ void parseArguments(int argc, char *argv[]) std::cout << " --float Use floats (rather than doubles)" << std::endl; std::cout << " -o --only NAME Only run one benchmark (see --print-names)" << std::endl; std::cout << " --print-names Prints all available benchmark names" << std::endl; - std::cout << " --order Benchmark run order: \"classic\" (default) or \"isolated\"." 
<< std::endl; + std::cout << " --order Benchmark run order: \"Classic\" (default) or \"Isolated\"." << std::endl; std::cout << " --csv Output as csv table" << std::endl; std::cout << " --megabytes Use MB=10^6 for bandwidth calculation (default)" << std::endl; std::cout << " --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl; diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp index c70a701d..fc1ae30c 100644 --- a/src/ocl/OCLStream.cpp +++ b/src/ocl/OCLStream.cpp @@ -100,8 +100,9 @@ std::string kernels{R"CLC( template -OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE} +OCLStream::OCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size{array_size} { if (!cached) getDeviceList(); @@ -172,18 +173,20 @@ OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device cl_ulong totalmem = device.getInfo(); cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < sizeof(T)*ARRAY_SIZE) + if (maxbuffer < sizeof(T)*array_size) throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + if (totalmem < 3*sizeof(T)*array_size) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create buffers - d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); sums = std::vector(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -277,11 +280,17 @@ void OCLStream::init_arrays(T initA, T initB, T initC) } template -void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void OCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { - cl::copy(queue, d_a, a.begin(), a.end()); - cl::copy(queue, d_b, b.begin(), b.end()); - cl::copy(queue, d_c, c.begin(), c.end()); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + cl::copy(queue, d_a, h_a.begin(), h_a.end()); + cl::copy(queue, d_b, h_b.begin(), h_b.end()); + cl::copy(queue, d_c, h_c.begin(), h_c.end()); } void getDeviceList(void) diff --git a/src/ocl/OCLStream.h b/src/ocl/OCLStream.h index e2366dad..e5405dde 100644 --- a/src/ocl/OCLStream.h +++ b/src/ocl/OCLStream.h @@ -42,6 +42,9 @@ class OCLStream : public Stream cl::Buffer d_c; cl::Buffer d_sum; + // Host-side arrays for verification + std::vector h_a, h_b, h_c; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; @@ -56,19 +59,19 @@ class OCLStream : public Stream public: - OCLStream(const intptr_t, const int); + OCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& 
b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 09b749fd..f0389373 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -13,10 +13,10 @@ #endif template -OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) +OMPStream::OMPStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - array_size = ARRAY_SIZE; - // Allocate on the host this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); @@ -32,6 +32,7 @@ OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) {} #endif + init_arrays(initA, initB, initC); } template @@ -77,7 +78,7 @@ void OMPStream::init_arrays(T initA, T initB, T initC) } template -void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void OMPStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { #ifdef OMP_TARGET_GPU @@ -87,15 +88,9 @@ void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) {} #endif - - #pragma omp parallel for - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } - + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/omp/OMPStream.h b/src/omp/OMPStream.h index 40770005..fca4906c 100644 --- a/src/omp/OMPStream.h +++ b/src/omp/OMPStream.h @@ -29,16 +29,17 @@ class OMPStream : public Stream T *c; public: - OMPStream(const intptr_t, int); + OMPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OMPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp index 6d6e8342..35fe6e8d 100644 --- a/src/raja/RAJAStream.cpp +++ b/src/raja/RAJAStream.cpp @@ -16,8 +16,9 @@ using RAJA::forall; #endif template -RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE) +RAJAStream::RAJAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size), range(0, array_size) { #ifdef RAJA_TARGET_CPU @@ -25,11 +26,13 @@ RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #else - cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, 
cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_a, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*array_size, cudaMemAttachGlobal); cudaDeviceSynchronize(); #endif + + init_arrays(initA, initB, initC); } template @@ -61,12 +64,11 @@ void RAJAStream::init_arrays(T initA, T initB, T initC) } template -void RAJAStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - std::copy(d_a, d_a + array_size, a.data()); - std::copy(d_b, d_b + array_size, b.data()); - std::copy(d_c, d_c + array_size, c.data()); + a = d_a; + b = d_b; + c = d_c; } template diff --git a/src/raja/RAJAStream.hpp b/src/raja/RAJAStream.hpp index e98b0778..a2565ccc 100644 --- a/src/raja/RAJAStream.hpp +++ b/src/raja/RAJAStream.hpp @@ -50,19 +50,18 @@ class RAJAStream : public Stream T* d_c; public: - - RAJAStream(const intptr_t, const int); + RAJAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~RAJAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/serial/SerialStream.cpp b/src/serial/SerialStream.cpp new file mode 100644 index 00000000..394a657c --- /dev/null +++ b/src/serial/SerialStream.cpp @@ -0,0 +1,132 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include // For aligned_alloc +#include "SerialStream.h" + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +template +SerialStream::SerialStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + : array_size{array_size} +{ + // Allocate on the host + this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + + init_arrays(initA, initB, initC); +} + +template +SerialStream::~SerialStream() +{ + free(a); + free(b); + free(c); +} + +template +void SerialStream::init_arrays(T initA, T initB, T initC) +{ + intptr_t array_size = this->array_size; + for (intptr_t i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } +} + +template +void SerialStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void SerialStream::copy() +{ + for (intptr_t i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void SerialStream::mul() +{ + const T scalar = startScalar; + for (intptr_t i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void SerialStream::add() +{ + for (intptr_t i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void SerialStream::triad() +{ + const T scalar = startScalar; 
+ for (intptr_t i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +template +void SerialStream::nstream() +{ + const T scalar = startScalar; + for (intptr_t i = 0; i < array_size; i++) + { + a[i] += b[i] + scalar * c[i]; + } +} + +template +T SerialStream::dot() +{ + T sum{}; + for (intptr_t i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + return sum; +} + + + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class SerialStream; +template class SerialStream; diff --git a/src/serial/SerialStream.h b/src/serial/SerialStream.h new file mode 100644 index 00000000..9f5653d4 --- /dev/null +++ b/src/serial/SerialStream.h @@ -0,0 +1,42 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, Tom Lin +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + + +#define IMPLEMENTATION_STRING "Serial" + +template +class SerialStream : public Stream +{ + protected: + // Size of arrays + intptr_t array_size; + + // Device side pointers + T *a, *b, *c; + + public: + SerialStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); + ~SerialStream(); + + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); +}; diff --git a/src/serial/model.cmake b/src/serial/model.cmake new file mode 100644 index 00000000..d1818466 --- /dev/null +++ b/src/serial/model.cmake @@ -0,0 +1,4 @@ +macro(setup) + # Nothing to do +endmacro() + diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp deleted file mode 100644 index 3efeb1b3..00000000 --- a/src/std-data/STDDataStream.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
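With the interface change in this patch, host-resident models such as Serial, OMP and RAJA (CPU) publish their own buffers through get_arrays() instead of copying into std::vector as read_arrays() did. A minimal sketch of a caller follows, assuming the Stream<T> base class as modified by this patch; dot_of_results is an illustrative helper, not part of the benchmark:

```cpp
// Illustrative consumer of the new get_arrays() interface (not part of the patch).
// For host-resident models the returned pointers alias the implementation's own
// arrays, so verification can walk them without a copy; device models (e.g. OCL)
// stage into host-side vectors first and hand back pointers to those.
#include <cstdint>
#include "Stream.h"   // Stream<T> as modified by this patch

template <class T>
T dot_of_results(Stream<T>& stream, std::intptr_t n)
{
  T const *a = nullptr, *b = nullptr, *c = nullptr;
  stream.get_arrays(a, b, c);          // read-only views of a, b and c

  T sum{};
  for (std::intptr_t i = 0; i < n; i++)
    sum += a[i] * b[i] + c[i];         // any read-only reduction over the results
  return sum;
}
```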
-// Updated 2021 by University of Bristol -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDDataStream.h" - -template -STDDataStream::STDDataStream(const intptr_t ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif -} - -template -STDDataStream::~STDDataStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDDataStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); -} - -template -void STDDataStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDDataStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); -} - -template -void STDDataStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, a, a + array_size, b, c, std::plus()); -} - -template -void STDDataStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); -} - -template -void STDDataStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. - // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); -} - - -template -T STDDataStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDDataStream; -template class STDDataStream; diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h deleted file mode 100644 index d92864be..00000000 --- a/src/std-data/STDDataStream.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
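The data-oriented nstream deleted above needs two std::transform passes because the standard binary transform combines at most two input ranges; the unified STDStream later in this patch keeps that two-pass shape for DATA17 and fuses it via std::views::zip for DATA23. A standalone sketch of the two-pass form, assuming C++17 parallel algorithms are available (libstdc++ additionally needs TBB at link time):

```cpp
// Standalone sketch of the C++17 two-pass nstream: a[i] += b[i] + scalar * c[i]
// must be split, since std::transform accepts at most two input ranges.
#include <algorithm>
#include <execution>
#include <vector>
#include <cstdio>

int main()
{
  const std::size_t n = 1 << 20;
  const double scalar = 0.4;
  std::vector<double> a(n, 0.1), b(n, 0.2), c(n, 0.0);

  // pass 1: a[i] += b[i]
  std::transform(std::execution::par_unseq, a.begin(), a.end(), b.begin(), a.begin(),
                 [](double ai, double bi) { return ai + bi; });
  // pass 2: a[i] += scalar * c[i]
  std::transform(std::execution::par_unseq, a.begin(), a.end(), c.begin(), a.begin(),
                 [scalar](double ai, double ci) { return ai + scalar * ci; });

  std::printf("a[0] = %f\n", a[0]);
  return 0;
}
```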
-// Updated 2021 by University of Bristol -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD (data-oriented)" - - -template -class STDDataStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // Device side pointers - T *a, *b, *c; - - public: - STDDataStream(const intptr_t, int) noexcept; - ~STDDataStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; -}; - diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake deleted file mode 100644 index 837d26bf..00000000 --- a/src/std-data/model.cmake +++ /dev/null @@ -1,53 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
- "OFF") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp deleted file mode 100644 index 473d93d0..00000000 --- a/src/std-indices/STDIndicesStream.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDIndicesStream.h" - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDIndicesStream::STDIndicesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, range(0, array_size), - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif -} - -template -STDIndicesStream::~STDIndicesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDIndicesStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); -} - -template -void STDIndicesStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDIndicesStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](intptr_t i) { - return scalar * c[i]; - }); -} - -template -void STDIndicesStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](intptr_t i) { - return a[i] + b[i]; - }); -} - -template -void STDIndicesStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return b[i] + scalar * c[i]; - }); -} - -template -void STDIndicesStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. 
- // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return a[i] + b[i] + scalar * c[i]; - }); -} - - -template -T STDIndicesStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDIndicesStream; -template class STDIndicesStream; diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h deleted file mode 100644 index 8a8f5de8..00000000 --- a/src/std-indices/STDIndicesStream.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD (index-oriented)" - -// A lightweight counting iterator which will be used by the STL algorithms -// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this -// implementation doesn't target -template -class ranged { -public: - class iterator { - friend class ranged; - public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = N; - using iterator_category = std::random_access_iterator_tag; - - // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. - // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. - // This type is unused for any nother STL impl. 
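The hand-rolled `ranged` counting iterator being deleted here exists only because C++17 lacks a standard one; the replacement src/std implementation later in this patch uses std::views::iota (or thrust::counting_iterator under NVHPC) instead. A minimal sketch of the same index-oriented pattern over std::views::iota, assuming a C++20 toolchain whose parallel algorithms accept iota iterators, as this patch itself relies on:

```cpp
// Minimal sketch: index-oriented copy kernel driven by std::views::iota,
// the C++20 replacement for the hand-rolled `ranged` iterator deleted here.
#include <algorithm>
#include <cstdint>
#include <execution>
#include <ranges>
#include <vector>

int main()
{
  const std::intptr_t n = 1024;
  std::vector<double> a(n, 0.1), c(n);
  double* pa = a.data();
  double* pc = c.data();

  // c[i] = a[i], expressed over indices rather than over the data itself
  std::for_each_n(std::execution::par_unseq,
                  std::views::iota(std::intptr_t{0}).begin(), n,
                  [pa, pc](std::intptr_t i) { pc[i] = pa[i]; });
  return 0;
}
```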
- using is_passed_directly = std::true_type; - - reference operator *() const { return i_; } - iterator &operator ++() { ++i_; return *this; } - iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - - iterator &operator --() { --i_; return *this; } - iterator operator --(int) { iterator copy(*this); --i_; return copy; } - - iterator &operator +=(N by) { i_+=by; return *this; } - - value_type operator[](const difference_type &i) const { return i_ + i; } - - difference_type operator-(const iterator &it) const { return i_ - it.i_; } - iterator operator+(const value_type v) const { return iterator(i_ + v); } - - bool operator ==(const iterator &other) const { return i_ == other.i_; } - bool operator !=(const iterator &other) const { return i_ != other.i_; } - bool operator < (const iterator &other) const { return i_ < other.i_; } - - protected: - explicit iterator(N start) : i_ (start) {} - - private: - N i_; - }; - - [[nodiscard]] iterator begin() const { return begin_; } - [[nodiscard]] iterator end() const { return end_; } - ranged(N begin, N end) : begin_(begin), end_(end) {} -private: - iterator begin_; - iterator end_; -}; - -template -class STDIndicesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // induction range - ranged range; - - // Device side pointers - T *a, *b, *c; - - public: - STDIndicesStream(const intptr_t, int) noexcept; - ~STDIndicesStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; -}; - diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake deleted file mode 100644 index 60ef575f..00000000 --- a/src/std-indices/model.cmake +++ /dev/null @@ -1,53 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
- "OFF") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp deleted file mode 100644 index 8b7ada4b..00000000 --- a/src/std-ranges/STDRangesStream.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDRangesStream.hpp" -#include - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDRangesStream::STDRangesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif -} - -template -STDRangesStream::~STDRangesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDRangesStream::init_arrays(T initA, T initB, T initC) -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, // loop range - [=, this] (intptr_t i) { - a[i] = initA; - b[i] = initB; - c[i] = initC; - } - ); -} - -template -void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - // Element-wise copy. 
- std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); -} - -template -void STDRangesStream::copy() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i]; - } - ); -} - -template -void STDRangesStream::mul() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - b[i] = scalar * c[i]; - } - ); -} - -template -void STDRangesStream::add() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i] + b[i]; - } - ); -} - -template -void STDRangesStream::triad() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] = b[i] + scalar * c[i]; - } - ); -} - -template -void STDRangesStream::nstream() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] += b[i] + scalar * c[i]; - } - ); -} - -template -T STDRangesStream::dot() -{ - // sum += a[i] * b[i]; - return - std::transform_reduce( - exe_policy, - a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "C++20 does not expose devices" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} - -template class STDRangesStream; -template class STDRangesStream; diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp deleted file mode 100644 index 51680c62..00000000 --- a/src/std-ranges/STDRangesStream.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD C++ ranges" - -template -class STDRangesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // Device side pointers - T *a, *b, *c; - - public: - STDRangesStream(const intptr_t, int) noexcept; - ~STDRangesStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; - diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake deleted file mode 100644 index d7fd6a8b..00000000 --- a/src/std-ranges/model.cmake +++ /dev/null @@ -1,68 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. 
- The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." - "OFF") - -macro(setup) - - # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here - - # C++ 2a is too new, disable CMake's std flags completely: - set(CMAKE_CXX_EXTENSIONS OFF) - set(CMAKE_CXX_STANDARD_REQUIRED OFF) - unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default - # and append our own: - register_append_cxx_flags(ANY -std=c++20) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() - -macro(setup_target NAME) - if (USE_ONEDPL) - target_compile_features(${NAME} INTERFACE cxx_std_20) - target_compile_features(oneDPL INTERFACE cxx_std_20) - endif () -endmacro() diff --git a/src/std/STDStream.cpp b/src/std/STDStream.cpp new file mode 100644 index 00000000..ae3cd016 --- /dev/null +++ b/src/std/STDStream.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
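The new STDStream.cpp that begins here guards its index-oriented (INDICES) path with a workaround: under NVHPC's `-stdpar=gpu` it swaps std::views::iota for thrust::counting_iterator, and the file itself prints that this is a non-portable accommodation. A compact sketch of the substituted iterator in the same for_each_n pattern, assuming Thrust headers (shipped with the NVHPC SDK / CUDA Toolkit) are on the include path:

```cpp
// Sketch of the counting-iterator substitution used by the INDICES path:
// thrust::counting_iterator stands in for std::views::iota under -stdpar=gpu.
#include <thrust/iterator/counting_iterator.h>
#include <algorithm>
#include <cstdint>
#include <execution>
#include <vector>

int main()
{
  const std::intptr_t n = 1024;
  const double scalar = 0.4;
  std::vector<double> a(n), b(n, 0.2), c(n, 0.0);
  double *pa = a.data(), *pb = b.data(), *pc = c.data();

  // a[i] = b[i] + scalar * c[i]
  std::for_each_n(std::execution::par_unseq,
                  thrust::counting_iterator<std::intptr_t>(0), n,
                  [=](std::intptr_t i) { pa[i] = pb[i] + scalar * pc[i]; });
  return 0;
}
```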
+// Updated 2021 by University of Bristol +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "STDStream.h" +#include +#include + +#if defined(DATA23) || defined(INDICES) +#include +#endif + + // OneDPL workaround; TODO: remove this eventually +#include "dpl_shim.h" + +#ifdef INDICES +// NVHPC workaround: TODO: remove this eventually +#if defined(__NVCOMPILER) && defined(_NVHPC_STDPAR_GPU) +#define WORKAROUND +#include +auto counting_iter(intptr_t i) { return thrust::counting_iterator(i); } +auto counting_range(intptr_t b, intptr_t e) { + struct R { + thrust::counting_iterator b, e; + thrust::counting_iterator begin() { return b; } + thrust::counting_iterator end() { return e; } + }; + return R { .b = counting_iter(b), .e = counting_iter(e) }; +} +#else // NVHPC Workaround +auto counting_iter(intptr_t i) { return std::views::iota(i).begin(); } +auto counting_range(intptr_t b, intptr_t e) { return std::views::iota(b, e); } +#endif // NVHPC Workaround +#endif // INDICES + +template +STDStream::STDStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif + +#ifdef WORKAROUND + std::cout << "Non-conforming implementation: requires non-portable workarounds to run STREAM" << std::endl; +#endif + init_arrays(initA, initB, initC); +} + +template +STDStream::~STDStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +} + +template +void STDStream::init_arrays(T initA, T initB, T initC) +{ + std::fill_n(exe_policy, a, array_size, initA); + std::fill_n(exe_policy, b, array_size, initB); + std::fill_n(exe_policy, c, array_size, initC); +} + +template +void STDStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void STDStream::copy() +{ + // c[i] = a[i] +#if defined(DATA17) || defined(DATA23) + std::copy(exe_policy, a, a + array_size, c); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,c=c](intptr_t i) { + c[i] = a[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::mul() +{ + // b[i] = scalar * c[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, c, c + array_size, b, [](T ci){ return startScalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [b=b, c=c](intptr_t i) { + b[i] = startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::add() +{ + // c[i] = a[i] + b[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + c[i] = a[i] + b[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::triad() +{ + // a[i] = b[i] + scalar * c[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, b, b + array_size, c, a, [scalar 
= startScalar](T bi, T ci){ return bi+scalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + a[i] = b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::nstream() +{ + // a[i] += b[i] + scalar * c[i]; +#if defined(DATA17) + // Need to do in two round-trips with C++17 STL. + // 1: a[i] += b[i] + // 2: a[i] += scalar * c[i]; + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; }); +#elif DATA23 + // Requires GCC 14.1 (Ubuntu 24.04): + auto as = std::ranges::subrange(a, a + array_size); + auto bs = std::ranges::subrange(b, b + array_size); + auto cs = std::ranges::subrange(c, c + array_size); + auto r = std::views::zip(as, bs, cs); + std::transform(exe_policy, r.begin(), r.end(), a, [](auto vs) { + auto [a, b, c] = vs; + return a + b + startScalar * c; + }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,b=b,c=c](intptr_t i) { + a[i] += b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + + +template +T STDStream::dot() +{ +#if defined(DATA17) || defined(DATA23) + // sum = 0; sum += a[i] * b[i]; return sum; + return std::transform_reduce(exe_policy, a, a + array_size, b, T{0}); +#elif INDICES + auto r = counting_range(intptr_t(0), array_size); + return std::transform_reduce(exe_policy, r.begin(), r.end(), T{0}, std::plus{}, [a=a, b=b](intptr_t i) { + return a[i] * b[i]; + }); +#else + #error unimplemented +#endif +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class STDStream; +template class STDStream; diff --git a/src/std/STDStream.h b/src/std/STDStream.h new file mode 100644 index 00000000..254d68d7 --- /dev/null +++ b/src/std/STDStream.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
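Both dot() variants just above lean on std::transform_reduce; in the two-range form used by the DATA paths the element-wise operation defaults to multiplies and the reduction to plus, which is exactly the dot product. A small self-contained check of that default, assuming a C++17 standard library:

```cpp
// Self-contained check of the transform_reduce form used by dot():
// the two-range overload defaults to (plus, multiplies), i.e. sum += a[i] * b[i].
#include <cassert>
#include <execution>
#include <numeric>
#include <vector>

int main()
{
  std::vector<double> a(1000, 0.1), b(1000, 0.2);
  const double sum = std::transform_reduce(std::execution::par_unseq,
                                           a.begin(), a.end(), b.begin(), 0.0);
  assert(sum > 19.99 && sum < 20.01);   // 1000 * (0.1 * 0.2) == 20, up to rounding
  return 0;
}
```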
+// Updated 2021 by University of Bristol +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "Stream.h" + +#ifdef DATA17 +#define STDIMPL "DATA17" +#elif DATA23 +#define STDIMPL "DATA23" +#elif INDICES +#define STDIMPL "INDICES" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "STD (" STDIMPL ")" + + +template +class STDStream : public Stream +{ + protected: + // Size of arrays + intptr_t array_size; + + // Device side pointers + T *a, *b, *c; + + public: + STDStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; + ~STDStream(); + + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); +}; + diff --git a/src/std/model.cmake b/src/std/model.cmake new file mode 100644 index 00000000..d66e6c87 --- /dev/null +++ b/src/std/model.cmake @@ -0,0 +1,115 @@ +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection" + "c++") + +register_flag_optional(NVHPC_OFFLOAD + "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. + The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) + + Possible values are: + cc35 - Compile for compute capability 3.5 + cc50 - Compile for compute capability 5.0 + cc60 - Compile for compute capability 6.0 + cc62 - Compile for compute capability 6.2 + cc70 - Compile for compute capability 7.0 + cc72 - Compile for compute capability 7.2 + cc75 - Compile for compute capability 7.5 + cc80 - Compile for compute capability 8.0 + cc90 - Compile for compute capability 8.0 + ccall - Compile for all supported compute capabilities + ccnative - Compiles for compute capability of current device" + "") + +register_flag_optional(NVHPC_MULTICORE + "Enable multicore parallelization with the NVHPC SDK." + "") + +register_flag_optional(ACPP_OFFLOAD + "Enable offloading support (via the non-standard `-acpp-stdpar`) for AdaptiveCpp." + "OFF") + +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + DPCPP - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + +register_flag_optional(STDIMPL + "Implementation strategy (default = DATA20): + DATA17 - Parallel algorithms over data (requires C++17). + DATA23 - (default) Parallel algorithms over data (requires C++20). + INDICES - Parallel algorithms over indices (requires C++20)." + "DATA20" +) + +register_flag_optional(AMDGPU_TARGET_OFFLOAD + "Enable offloading support (via the non-standard `-stdpar`) for + Clang/LLVM. 
The values are AMDGPU processors (https://www.llvm.org/docs/AMDGPUUsage.html#processors) + which will be passed in via `--offload-arch=` argument. + + Example values are: + gfx906 - Compile for Vega20 GPUs + gfx908 - Compile for CDNA1 GPUs + gfx90a - Compile for CDNA2 GPUs + gfx942 - Compile for CDNA3 GPUs + gfx1030 - Compile for RDNA2 NV21 GPUs + gfx1100 - Compile for RDNA3 NV31 GPUs" + "") + +macro(setup) + register_definitions(${STDIMPL}) + if (${STDIMPL} STREQUAL "DATA17") + set(CMAKE_CXX_STANDARD 17) + elseif (${STDIMPL} STREQUAL "INDICES") + set(CMAKE_CXX_STANDARD 20) + elseif (${STDIMPL} STREQUAL "DATA23") + set(CMAKE_CXX_STANDARD 23) + endif () + if (NVHPC_OFFLOAD) + set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY ${NVHPC_FLAGS}) + register_append_link_flags(${NVHPC_FLAGS}) + endif () + if (NVHPC_MULTICORE) + set(NVHPC_FLAGS -stdpar=multicore) + # propagate flags to linker so that it links with the gpu stuff as well + register_append_cxx_flags(ANY ${NVHPC_FLAGS}) + register_append_link_flags(${NVHPC_FLAGS}) + endif () + + if (ACPP_OFFLOAD) + set(ACPP_FLAGS --acpp-stdpar) + register_append_cxx_flags(ANY ${ACPP_FLAGS}) + register_append_link_flags(${ACPP_FLAGS}) + endif () + if (USE_TBB) + if (FETCH_TBB) + register_link_library(TBB::tbb) + else () + register_link_library(tbb) + endif () + endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () + if (AMDGPU_TARGET_OFFLOAD) + set(AMDGPU_TARGET_OFFLOAD_FLAGS --hipstdpar --offload-arch=${AMDGPU_TARGET_OFFLOAD}) + if (NOT AMDGPU_TARGET_OFFLOAD MATCHES "^gfx9" OR AMDGPU_INTERPOSE_ALLOC) + list(APPEND AMDGPU_TARGET_OFFLOAD_FLAGS --hipstdpar-interpose-alloc) + endif () + register_append_cxx_flags(ANY ${AMDGPU_TARGET_OFFLOAD_FLAGS}) + register_append_link_flags(--hipstdpar) + endif () +endmacro() diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index e99454e6..5c00211e 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -17,13 +17,13 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; @@ -79,6 +79,8 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = new buffer(array_size); d_c = new buffer(array_size); d_sum = new buffer(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -238,17 +240,14 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { auto _a = d_a->template get_access(); auto _b = d_b->template get_access(); auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } + a = &_a[0]; + b = &_b[0]; + c = &_c[0]; } void getDeviceList(void) diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 1a40242d..94c3c4e9 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -54,19 +54,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, 
const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~SYCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake index 3826c3c7..72aa7c40 100644 --- a/src/sycl/model.cmake +++ b/src/sycl/model.cmake @@ -9,22 +9,34 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") macro(setup) set(CMAKE_CXX_STANDARD 17) + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") + endif () + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(AdaptiveCpp CONFIG REQUIRED) + message(STATUS "ok") + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) if (NOT EXISTS "${hipSYCL_DIR}") @@ -38,7 +50,6 @@ macro(setup) # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) find_package(hipSYCL CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) @@ -62,7 +73,14 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") # so hipSYCL has this weird (and 
bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here diff --git a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp deleted file mode 100644 index 742be95b..00000000 --- a/src/sycl2020-acc/SYCLStream2020.cpp +++ /dev/null @@ -1,289 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "SYCLStream2020.h" - -#include - -// Cache list of devices -bool cached = false; -std::vector devices; -void getDeviceList(void); - -template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE}, - d_a {ARRAY_SIZE}, - d_b {ARRAY_SIZE}, - d_c {ARRAY_SIZE}, - d_sum {1} -{ - if (!cached) - getDeviceList(); - - if (device_index >= devices.size()) - throw std::runtime_error("Invalid device index"); - - sycl::device dev = devices[device_index]; - - // Print out device information - std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - - // Check device can support FP64 if needed - if (sizeof(T) == sizeof(double)) - { - if (!dev.has(sycl::aspect::fp64)) - { - throw std::runtime_error("Device does not support double precision, please use --float"); - } - } - - queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) - { - bool error = false; - for(auto e: l) - { - try - { - std::rethrow_exception(e); - } - catch (sycl::exception e) - { - std::cout << e.what(); - error = true; - } - } - if(error) - { - throw std::runtime_error("SYCL errors detected"); - } - }}); - - // No longer need list of devices - devices.clear(); - cached = true; - - -} - - -template -void SYCLStream::copy() -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kc[idx] = ka[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::mul() -{ - const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor kb {d_b, cgh, sycl::write_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kb[idx] = scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::add() -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kc[idx] = ka[idx] + kb[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::triad() -{ - const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::write_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] = kb[idx] + scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::nstream() -{ - const T scalar = startScalar; - - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, 
sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] += kb[idx] + scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -T SYCLStream::dot() -{ - - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - - cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) - sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), -#else - sycl::reduction(d_sum, cgh, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), -#endif - [=](sycl::id<1> idx, auto& sum) - { - sum += ka[idx] * kb[idx]; - }); - - }); - - // Get access on the host, and return a copy of the data (single number) - // This will block until the result is available, so no need to wait on the queue. - sycl::host_accessor result {d_sum, sycl::read_only}; - return result[0]; - -} - -template -void SYCLStream::init_arrays(T initA, T initB, T initC) -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; - - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] = initA; - kb[idx] = initB; - kc[idx] = initC; - }); - }); - - queue->wait(); -} - -template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) -{ - sycl::host_accessor _a {d_a, sycl::read_only}; - sycl::host_accessor _b {d_b, sycl::read_only}; - sycl::host_accessor _c {d_c, sycl::read_only}; - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } -} - -void getDeviceList(void) -{ - // Ask SYCL runtime for all devices in system - devices = sycl::device::get_devices(); - cached = true; -} - -void listDevices(void) -{ - getDeviceList(); - - // Print device names - if (devices.size() == 0) - { - std::cerr << "No devices found." 
<< std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - } - std::cout << std::endl; - } -} - -std::string getDeviceName(const int device) -{ - if (!cached) - getDeviceList(); - - std::string name; - - if (device < devices.size()) - { - name = devices[device].get_info(); - } - else - { - throw std::runtime_error("Error asking for name for non-existant device"); - } - - return name; -} - -std::string getDeviceDriver(const int device) -{ - if (!cached) - getDeviceList(); - - std::string driver; - - if (device < devices.size()) - { - driver = devices[device].get_info(); - } - else - { - throw std::runtime_error("Error asking for driver for non-existant device"); - } - - return driver; -} - -template class SYCLStream; -template class SYCLStream; diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h deleted file mode 100644 index cd515f87..00000000 --- a/src/sycl2020-acc/SYCLStream2020.h +++ /dev/null @@ -1,54 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include -#include - -#include "Stream.h" - -#include - -#define IMPLEMENTATION_STRING "SYCL2020 accessors" - -template -class SYCLStream : public Stream -{ - protected: - // Size of arrays - size_t array_size; - - // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; - - // Buffers - sycl::buffer d_a; - sycl::buffer d_b; - sycl::buffer d_c; - sycl::buffer d_sum; - - public: - - SYCLStream(const intptr_t, const int); - ~SYCLStream() = default; - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; - -// Populate the devices list -void getDeviceList(void); diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake deleted file mode 100644 index 3826c3c7..00000000 --- a/src/sycl2020-acc/model.cmake +++ /dev/null @@ -1,73 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" - "c++") - -register_flag_required(SYCL_COMPILER - "Compile using the specified SYCL compiler implementation - Supported values are - ONEAPI-ICPX - icpx as a standalone compiler - ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) - DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") - -register_flag_optional(SYCL_COMPILER_DIR - "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) - ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 
- HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" - "") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - - - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - - - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) - - if (NOT EXISTS "${hipSYCL_DIR}") - message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) - endif () - if (NOT EXISTS "${hipSYCL_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") - endif () - - # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(hipSYCL CONFIG REQUIRED) - message(STATUS "ok") - - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") - set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) - include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") - set(CMAKE_CXX_COMPILER icpx) - set(CMAKE_C_COMPILER icx) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_C_COMPILER clang) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - else () - message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") - endif () - -endmacro() - - -macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - # so hipSYCL has this weird (and bad) CMake usage where they append their - # own custom integration header flags AFTER the target has been specified - # hence this macro here - add_sycl_to_target( - TARGET ${NAME} - SOURCES ${IMPL_SOURCES}) - endif () -endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h deleted file mode 100644 index 811c26ef..00000000 --- a/src/sycl2020-usm/SYCLStream2020.h +++ /dev/null @@ -1,54 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include -#include - -#include "Stream.h" - -#include - -#define IMPLEMENTATION_STRING "SYCL2020 USM" - -template -class SYCLStream : public Stream -{ - protected: - // Size of arrays - size_t array_size; - - // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; - - // Buffers - T *a{}; - T *b{}; - T *c{}; - T *sum{}; - - public: - - SYCLStream(const intptr_t, const int); - ~SYCLStream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; - -// Populate the devices list -void getDeviceList(void); diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020/SYCLStream2020.cpp similarity index 62% rename from src/sycl2020-usm/SYCLStream2020.cpp rename to src/sycl2020/SYCLStream2020.cpp index e4c6ec27..56292046 100644 --- a/src/sycl2020-usm/SYCLStream2020.cpp +++ b/src/sycl2020/SYCLStream2020.cpp @@ -1,5 +1,5 @@ -// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, and Tom Lin +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file 
distributed with this @@ -15,8 +15,9 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE} +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); @@ -60,24 +61,36 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) } }}); + // Allocate memory + #ifdef SYCL2020ACC + d_a = sycl::buffer{array_size}; + d_b = sycl::buffer{array_size}; + d_c = sycl::buffer{array_size}; + d_sum = sycl::buffer{1}; + #elif SYCL2020USM a = sycl::malloc_shared(array_size, *queue); b = sycl::malloc_shared(array_size, *queue); c = sycl::malloc_shared(array_size, *queue); sum = sycl::malloc_shared(1, *queue); - + #else + #error unimplemented + #endif + // No longer need list of devices devices.clear(); cached = true; - + init_arrays(initA, initB, initC); } template SYCLStream::~SYCLStream() { - sycl::free(a, *queue); - sycl::free(b, *queue); - sycl::free(c, *queue); - sycl::free(sum, *queue); +#ifdef SYCL2020USM + sycl::free(a, *queue); + sycl::free(b, *queue); + sycl::free(c, *queue); + sycl::free(sum, *queue); +#endif } template @@ -85,7 +98,11 @@ void SYCLStream::copy() { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::write_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [c=c,a=a](sycl::id<1> idx) { c[idx] = a[idx]; }); @@ -99,7 +116,11 @@ void SYCLStream::mul() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, b = this->b, c = this->c](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor b {d_b, cgh, sycl::write_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,b=b,c=c](sycl::id<1> idx) { b[idx] = scalar * c[idx]; }); @@ -112,7 +133,12 @@ void SYCLStream::add() { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a, b = this->b](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::write_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [c=c,a=a,b=b](sycl::id<1> idx) { c[idx] = a[idx] + b[idx]; }); @@ -126,7 +152,12 @@ void SYCLStream::triad() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::write_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] = b[idx] + scalar * c[idx]; }); @@ -138,10 +169,14 @@ template void SYCLStream::nstream() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#if SYCL2020ACC + sycl::accessor a {d_a, cgh}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, 
[=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] += b[idx] + scalar * c[idx]; }); @@ -154,19 +189,22 @@ T SYCLStream::dot() { queue->submit([&](sycl::handler &cgh) { +#if SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; +#endif cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + // AdaptiveCpp doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__) sycl::reduction(sum, sycl::plus()), #else sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), #endif - [a = this->a, b = this->b](sycl::id<1> idx, auto& sum) + [a=a,b=b](sycl::id<1> idx, auto& sum) { sum += a[idx] * b[idx]; }); - }); queue->wait(); return *sum; @@ -177,26 +215,32 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#if SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor b {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor c {d_c, cgh, sycl::write_only, sycl::no_init}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] = initA; b[idx] = initB; c[idx] = initC; }); }); - queue->wait(); } template -void SYCLStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void SYCLStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - for (int i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } +#if SYCL2020ACC + sycl::host_accessor a {d_a, sycl::read_only}; + sycl::host_accessor b {d_b, sycl::read_only}; + sycl::host_accessor c {d_c, sycl::read_only}; +#endif + h_a = &a[0]; + h_b = &b[0]; + h_c = &c[0]; } void getDeviceList(void) diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020/SYCLStream2020.h new file mode 100644 index 00000000..3b4e1ef0 --- /dev/null +++ b/src/sycl2020/SYCLStream2020.h @@ -0,0 +1,60 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#ifdef SYCL2020ACC +#define SYCLIMPL "Accessors" +#elif SYCL2020USM +#define SYCLIMPL "USM" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "SYCL2020 " SYCLIMPL + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + T *a, *b, *c, *sum{}; + sycl::buffer d_a, d_b, d_c, d_sum; + + public: + + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); + ~SYCLStream(); + + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020/model.cmake similarity index 65% rename 
from src/sycl2020-usm/model.cmake rename to src/sycl2020/model.cmake index 950daefd..d6452534 100644 --- a/src/sycl2020-usm/model.cmake +++ b/src/sycl2020/model.cmake @@ -9,23 +9,40 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") +register_flag_optional(SYCL_ACCESS + "Data access method: + - ACCESSOR + - USM" + "ACCESSOR") macro(setup) set(CMAKE_CXX_STANDARD 17) + register_definitions(${SYCL_ACCESS}) + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") + endif () - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - - + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(AdaptiveCpp CONFIG REQUIRED) + message(STATUS "ok") + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) if (NOT EXISTS "${hipSYCL_DIR}") @@ -39,7 +56,6 @@ macro(setup) # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) find_package(hipSYCL CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) @@ -63,7 +79,14 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") # so hipSYCL has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 75af6141..01508022 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -20,15 +20,16 @@ #endif template -TBBStream::TBBStream(const intptr_t ARRAY_SIZE, int device) - : partitioner(), range(0, (size_t)ARRAY_SIZE), +TBBStream::TBBStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : partitioner(), range(0, (size_t)array_size), #ifdef 
USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(array_size), b(array_size), c(array_size) #else - array_size(ARRAY_SIZE), - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + array_size(array_size), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)) #endif { if(device != 0){ @@ -36,6 +37,8 @@ TBBStream::TBBStream(const intptr_t ARRAY_SIZE, int device) } std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; + + init_arrays(initA, initB, initC); } @@ -54,12 +57,17 @@ void TBBStream::init_arrays(T initA, T initB, T initC) } template -void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void TBBStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - // Element-wise copy. - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); +#ifdef USE_VECTOR + h_a = a.data(); + h_b = b.data(); + h_c = c.data(); +#else + h_a = a; + h_b = b; + h_c = c; +#endif } template diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 80f11c17..0a73e892 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -31,7 +31,6 @@ using tbb_partitioner = tbb::auto_partitioner; #define PARTITIONER_NAME "auto_partitioner" #endif - template class TBBStream : public Stream { @@ -48,17 +47,17 @@ class TBBStream : public Stream #endif public: - TBBStream(const intptr_t, int); + TBBStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~TBBStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; - diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu index 84b27b8e..321470b8 100644 --- a/src/thrust/ThrustStream.cu +++ b/src/thrust/ThrustStream.cu @@ -19,7 +19,8 @@ static inline void synchronise() } template -ThrustStream::ThrustStream(const intptr_t array_size, int device) +ThrustStream::ThrustStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) : array_size{array_size}, a(array_size), b(array_size), c(array_size) { std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; std::cout << "Driver: " << getDeviceDriver(device) << std::endl; @@ -36,8 +37,6 @@ ThrustStream::ThrustStream(const intptr_t array_size, int device) std::cout << "Thrust backend: TBB" << std::endl; #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP std::cout << "Thrust backend: CPP" << std::endl; -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB - std::cout << "Thrust backend: TBB" << std::endl; #else #if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP @@ -48,6 
+47,7 @@ ThrustStream::ThrustStream(const intptr_t array_size, int device) #endif + init_arrays(initA, initB, initC); } template @@ -60,11 +60,23 @@ void ThrustStream::init_arrays(T initA, T initB, T initC) } template -void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ +void ThrustStream::get_arrays(T const*& a_, T const*& b_, T const*& c_) +{ + #if defined(MANAGED) + a_ = &*a.data(); + b_ = &*b.data(); + c_ = &*c.data(); + #else + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); thrust::copy(a.begin(), a.end(), h_a.begin()); thrust::copy(b.begin(), b.end(), h_b.begin()); thrust::copy(c.begin(), c.end(), h_c.begin()); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); + #endif } template diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h index b0acd80f..676ecaeb 100644 --- a/src/thrust/ThrustStream.h +++ b/src/thrust/ThrustStream.h @@ -26,28 +26,25 @@ class ThrustStream : public Stream intptr_t array_size; #if defined(MANAGED) - thrust::universtal_vector a; - thrust::universtal_vector b; - thrust::universtal_vector c; + thrust::universal_vector a, b, c; #else - thrust::device_vector a; - thrust::device_vector b; - thrust::device_vector c; + thrust::device_vector a, b, c; + std::vector h_a, h_b, h_c; #endif public: - ThrustStream(const intptr_t, int); + ThrustStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~ThrustStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 6b82ef59..23627c11 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -18,8 +18,7 @@ register_flag_optional(BACKEND " "CUDA") - register_flag_optional(MANAGED "Enabled managed memory mode." - "OFF") +register_flag_optional(MANAGED "Enabled managed memory mode." "OFF") register_flag_optional(CMAKE_CUDA_COMPILER "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
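
The device-listing helpers kept in the SYCL2020 sources above (getDeviceList, listDevices, getDeviceName, getDeviceDriver) are thin wrappers over the SYCL 2020 device-query API. A minimal standalone sketch of the same queries follows; list_devices is an illustrative name, not a symbol from this patch:

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    void list_devices()
    {
      // Ask the SYCL runtime for every visible device, then print index, name, and driver.
      std::vector<sycl::device> devices = sycl::device::get_devices();
      for (std::size_t i = 0; i < devices.size(); i++)
      {
        std::cout << i << ": " << devices[i].get_info<sycl::info::device::name>()
                  << " (driver " << devices[i].get_info<sycl::info::device::driver_version>()
                  << ")" << std::endl;
      }
    }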
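The merged sycl2020 model above folds the former accessor and USM variants into one source file, selecting storage with the SYCL2020ACC / SYCL2020USM macros while sharing every kernel body. A condensed sketch of that pattern for a single fill kernel, assuming a SYCL 2020 compiler; MyStream, d_c, c, and fill are illustrative stand-ins for the members in the patch:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    template <typename T>
    struct MyStream
    {
      sycl::queue q;
      std::size_t n;
    #ifdef SYCL2020ACC
      sycl::buffer<T> d_c;
      explicit MyStream(std::size_t size) : q{}, n{size}, d_c{sycl::range<1>{size}} {}
    #else // SYCL2020USM
      T *c;
      explicit MyStream(std::size_t size) : q{}, n{size}, c{sycl::malloc_shared<T>(size, q)} {}
      ~MyStream() { sycl::free(c, q); }
    #endif

      void fill(T value)
      {
        q.submit([&](sycl::handler &cgh)
        {
    #ifdef SYCL2020ACC
          // Accessor build: create the accessor inside the command group under the
          // same name c, so the kernel body below is identical for both builds.
          sycl::accessor c {d_c, cgh, sycl::write_only, sycl::no_init};
    #endif
          cgh.parallel_for(sycl::range<1>{n}, [=, c = c](sycl::id<1> idx) { c[idx] = value; });
        });
        q.wait();
      }
    };

Compiled with SYCL2020ACC defined, fill() runs through buffers and accessors; otherwise the same kernel runs through shared USM, which is the trade-off the SYCL_ACCESS build option exposes.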
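The dot() kernel above relies on a SYCL 2020 reduction, with a fallback branch because AdaptiveCpp/hipSYCL do not yet support the initialize_to_identity property; on that branch the reduction combines into whatever value the result variable already holds. A minimal USM sketch of the same idea, zero-initialising the result explicitly; q, x, y, n, and partial are illustrative names, not symbols from the patch:

    #include <sycl/sycl.hpp>
    #include <cstddef>

    template <typename T>
    T dot_usm(sycl::queue &q, const T *x, const T *y, std::size_t n)
    {
      T *partial = sycl::malloc_shared<T>(1, q);
      *partial = T{0}; // without initialize_to_identity the reduction adds to the existing value
      q.submit([&](sycl::handler &cgh)
      {
        cgh.parallel_for(sycl::range<1>{n},
                         sycl::reduction(partial, sycl::plus<T>()),
                         [=](sycl::id<1> idx, auto &sum) { sum += x[idx] * y[idx]; });
      }).wait();
      T result = *partial;
      sycl::free(partial, q);
      return result;
    }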
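Across all the models touched above, read_arrays(), which copied device data into caller-owned std::vectors, is replaced by get_arrays(), which hands back read-only pointers into storage the Stream still owns (buffer or USM memory for SYCL2020, the raw or vector backing store for TBB, host-side mirror vectors for the non-managed Thrust build). A caller-side sketch of the new contract; verify and the expected values are illustrative only:

    #include <cstdint>
    #include "Stream.h"

    // Assumes the Stream<T> interface from this patch: get_arrays(T const*&, T const*&, T const*&).
    template <typename T>
    bool verify(Stream<T> &stream, std::intptr_t array_size, T expectA, T expectB, T expectC)
    {
      T const *a = nullptr, *b = nullptr, *c = nullptr;
      stream.get_arrays(a, b, c); // pointers stay owned by the Stream; never free them
      for (std::intptr_t i = 0; i < array_size; i++)
        if (a[i] != expectA || b[i] != expectB || c[i] != expectC)
          return false;
      return true;
    }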