From 6f50313cdbf98994243887c0d608224ecc93bba7 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 16 Aug 2024 16:20:33 -0600 Subject: [PATCH] reformatted device_api and host_api benchmarks to include engine and distribution columns --- benchmark/benchmark_rocrand_device_api.cpp | 1600 +++++++++----------- benchmark/benchmark_rocrand_host_api.cpp | 733 ++++----- benchmark/custom_csv_formater.hpp | 182 +++ 3 files changed, 1211 insertions(+), 1304 deletions(-) create mode 100644 benchmark/custom_csv_formater.hpp diff --git a/benchmark/benchmark_rocrand_device_api.cpp b/benchmark/benchmark_rocrand_device_api.cpp index 44ddec8fb..59f087a95 100644 --- a/benchmark/benchmark_rocrand_device_api.cpp +++ b/benchmark/benchmark_rocrand_device_api.cpp @@ -34,1020 +34,812 @@ #include #include #include +#include +#include "custom_csv_formater.hpp" #ifndef DEFAULT_RAND_N - #define DEFAULT_RAND_N (1024 * 1024 * 128) +#define DEFAULT_RAND_N (1024 * 1024 * 128) #endif -template +template __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_kernel( - EngineState* states, const unsigned long long seed, const unsigned long long offset) -{ - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(seed, state_id, offset, &state); - states[state_id] = state; + EngineState *states, const unsigned long long seed, + const unsigned long long offset) { + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(seed, state_id, offset, &state); + states[state_id] = state; } -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( - EngineState* states, T* data, const size_t size, Generator generator) -{ - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int stride = gridDim.x * blockDim.x; - - EngineState state = states[state_id]; - unsigned int index = state_id; - while(index < size) - { - data[index] = 
generator(&state); - index += stride; - } - states[state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( + EngineState *states, T *data, const size_t size, Generator generator) { + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = gridDim.x * blockDim.x; + + EngineState state = states[state_id]; + unsigned int index = state_id; + while (index < size) { + data[index] = generator(&state); + index += stride; + } + states[state_id] = state; } -template -struct runner -{ - EngineState* states; - - runner(const size_t /* dimensions */, - const size_t blocks, - const size_t threads, - const unsigned long long seed, - const unsigned long long offset) - { - const size_t states_size = blocks * threads; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState))); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), - dim3(blocks), - dim3(threads), - 0, - 0, - states, - seed, - offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } +template struct runner { + EngineState *states; - ~runner() - { - HIP_CHECK(hipFree(states)); - } + runner(const size_t /* dimensions */, const size_t blocks, + const size_t threads, const unsigned long long seed, + const unsigned long long offset) { + const size_t states_size = blocks * threads; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(EngineState))); - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - dim3(blocks), - dim3(threads), - 0, - stream, - states, - data, - size, - generator); - } -}; + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(blocks), + dim3(threads), 0, 0, states, seed, offset); -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( - rocrand_state_mtgp32* states, T* 
data, const size_t size, Generator generator) -{ - const unsigned int state_id = blockIdx.x; - unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int stride = gridDim.x * blockDim.x; - - __shared__ rocrand_state_mtgp32 state; - rocrand_mtgp32_block_copy(&states[state_id], &state); - - const size_t r = size % blockDim.x; - const size_t size_rounded_down = size - r; - const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x; - while(index < size_rounded_down) - { - data[index] = generator(&state); - index += stride; - } - while(index < size_rounded_up) - { - auto value = generator(&state); - if(index < size) - data[index] = value; - index += stride; - } + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } - rocrand_mtgp32_block_copy(&state, &states[state_id]); -} + ~runner() { HIP_CHECK(hipFree(states)); } -template<> -struct runner -{ - rocrand_state_mtgp32* states; - - runner(const size_t /* dimensions */, - const size_t blocks, - const size_t /* threads */, - const unsigned long long seed, - const unsigned long long /* offset */) - { - const size_t states_size = std::min((size_t)200, blocks); - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_mtgp32))); - - ROCRAND_CHECK( - rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, states_size, seed)); - } + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), + dim3(threads), 0, stream, states, data, size, generator); + } +}; - ~runner() - { - HIP_CHECK(hipFree(states)); - } +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_kernel( + rocrand_state_mtgp32 *states, T *data, const size_t size, + Generator generator) { + const unsigned int state_id = blockIdx.x; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int 
stride = gridDim.x * blockDim.x; + + __shared__ rocrand_state_mtgp32 state; + rocrand_mtgp32_block_copy(&states[state_id], &state); + + const size_t r = size % blockDim.x; + const size_t size_rounded_down = size - r; + const size_t size_rounded_up = r == 0 ? size : size_rounded_down + blockDim.x; + while (index < size_rounded_down) { + data[index] = generator(&state); + index += stride; + } + while (index < size_rounded_up) { + auto value = generator(&state); + if (index < size) + data[index] = value; + index += stride; + } + + rocrand_mtgp32_block_copy(&state, &states[state_id]); +} - template - void generate(const size_t blocks, - const size_t /* threads */, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - dim3(std::min((size_t)200, blocks)), - dim3(256), - 0, - stream, - states, - data, - size, - generator); - } +template <> struct runner { + rocrand_state_mtgp32 *states; + + runner(const size_t /* dimensions */, const size_t blocks, + const size_t /* threads */, const unsigned long long seed, + const unsigned long long /* offset */) { + const size_t states_size = std::min((size_t)200, blocks); + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_mtgp32))); + + ROCRAND_CHECK(rocrand_make_state_mtgp32(states, mtgp32dc_params_fast_11213, + states_size, seed)); + } + + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t /* threads */, + hipStream_t stream, T *data, const size_t size, + const Generator &generator) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), + dim3(std::min((size_t)200, blocks)), dim3(256), 0, + stream, states, data, size, generator); + } }; __global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_kernel( - rocrand_state_lfsr113* states, const uint4 seed) -{ - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - rocrand_state_lfsr113 state; - 
rocrand_init(seed, state_id, &state); - states[state_id] = state; + rocrand_state_lfsr113 *states, const uint4 seed) { + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + rocrand_state_lfsr113 state; + rocrand_init(seed, state_id, &state); + states[state_id] = state; } -template<> -struct runner -{ - rocrand_state_lfsr113* states; - - runner(const size_t /* dimensions */, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long /* offset */) - { - const size_t states_size = blocks * threads; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113))); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), - dim3(blocks), - dim3(threads), - 0, - 0, - states, - uint4{ROCRAND_LFSR113_DEFAULT_SEED_X, - ROCRAND_LFSR113_DEFAULT_SEED_Y, - ROCRAND_LFSR113_DEFAULT_SEED_Z, - ROCRAND_LFSR113_DEFAULT_SEED_W}); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } +template <> struct runner { + rocrand_state_lfsr113 *states; - ~runner() - { - HIP_CHECK(hipFree(states)); - } + runner(const size_t /* dimensions */, const size_t blocks, + const size_t threads, const unsigned long long /* seed */, + const unsigned long long /* offset */) { + const size_t states_size = blocks * threads; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_lfsr113))); - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), - dim3(blocks), - dim3(threads), - 0, - stream, - states, - data, - size, - generator); - } + hipLaunchKernelGGL( + HIP_KERNEL_NAME(init_kernel), dim3(blocks), dim3(threads), 0, 0, states, + uint4{ROCRAND_LFSR113_DEFAULT_SEED_X, ROCRAND_LFSR113_DEFAULT_SEED_Y, + ROCRAND_LFSR113_DEFAULT_SEED_Z, ROCRAND_LFSR113_DEFAULT_SEED_W}); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } + 
+ ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), + dim3(threads), 0, stream, states, data, size, generator); + } }; -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel( - EngineState* states, SobolType* directions, SobolType offset) -{ - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(&directions[dimension * sizeof(SobolType) * 8], offset + state_id, &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_sobol_kernel( + EngineState *states, SobolType *directions, SobolType offset) { + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(&directions[dimension * sizeof(SobolType) * 8], + offset + state_id, &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel( - EngineState* states, SobolType* directions, SobolType* scramble_constants, SobolType offset) -{ - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - EngineState state; - rocrand_init(&directions[dimension * sizeof(SobolType) * 8], - scramble_constants[dimension], - offset + state_id, - &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void init_scrambled_sobol_kernel( + EngineState *states, SobolType *directions, SobolType *scramble_constants, + SobolType offset) { + const 
unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + EngineState state; + rocrand_init(&directions[dimension * sizeof(SobolType) * 8], + scramble_constants[dimension], offset + state_id, &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } // generate_kernel for the normal and scrambled sobol generators -template -__global__ __launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel( - EngineState* states, T* data, const size_t size, Generator generator) -{ - const unsigned int dimension = blockIdx.y; - const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int stride = gridDim.x * blockDim.x; - - EngineState state = states[gridDim.x * blockDim.x * dimension + state_id]; - const size_t offset = dimension * size; - unsigned int index = state_id; - while(index < size) - { - data[offset + index] = generator(&state); - skipahead(stride - 1, &state); - index += stride; - } - state = states[gridDim.x * blockDim.x * dimension + state_id]; - skipahead(static_cast(size), &state); - states[gridDim.x * blockDim.x * dimension + state_id] = state; +template +__global__ +__launch_bounds__(ROCRAND_DEFAULT_MAX_BLOCK_SIZE) void generate_sobol_kernel( + EngineState *states, T *data, const size_t size, Generator generator) { + const unsigned int dimension = blockIdx.y; + const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int stride = gridDim.x * blockDim.x; + + EngineState state = states[gridDim.x * blockDim.x * dimension + state_id]; + const size_t offset = dimension * size; + unsigned int index = state_id; + while (index < size) { + data[offset + index] = generator(&state); + skipahead(stride - 1, &state); + index += stride; + } + state = states[gridDim.x * blockDim.x * dimension + state_id]; + skipahead(static_cast(size), &state); + states[gridDim.x * blockDim.x * dimension + state_id] = state; } -template<> -struct 
runner -{ - rocrand_state_sobol32* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned int* h_directions; - ROCRAND_CHECK( - rocrand_get_direction_vectors32(&h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol32))); - - unsigned int* directions; - const size_t size = dimensions * 32 * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&directions, size)); - HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - static_cast(offset)); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - } +template <> struct runner { + rocrand_state_sobol32 *states; + size_t dimensions; - ~runner() - { - HIP_CHECK(hipFree(states)); - } + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } -}; + const unsigned int *h_directions; + ROCRAND_CHECK(rocrand_get_direction_vectors32( + &h_directions, ROCRAND_DIRECTION_VECTORS_32_JOEKUO6)); 
-template<> -struct runner -{ - rocrand_state_scrambled_sobol32* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned int* h_directions; - const unsigned int* h_constants; - - ROCRAND_CHECK( - rocrand_get_direction_vectors32(&h_directions, - ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)); - ROCRAND_CHECK(rocrand_get_scramble_constants32(&h_constants)); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol32))); - - unsigned int* directions; - const size_t directions_size = dimensions * 32 * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&directions, directions_size)); - HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, hipMemcpyHostToDevice)); - - unsigned int* scramble_constants; - const size_t constants_size = dimensions * sizeof(unsigned int); - HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); - HIP_CHECK( - hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - scramble_constants, - static_cast(offset)); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - HIP_CHECK(hipFree(scramble_constants)); - } + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol32))); - ~runner() - { - HIP_CHECK(hipFree(states)); - } + unsigned int *directions; + const size_t size = dimensions * 32 * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&directions, size)); + HIP_CHECK(hipMemcpy(directions, 
h_directions, size, hipMemcpyHostToDevice)); - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } -}; + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, static_cast(offset)); -template<> -struct runner -{ - rocrand_state_sobol64* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned long long* h_directions; - rocrand_get_direction_vectors64(&h_directions, ROCRAND_DIRECTION_VECTORS_64_JOEKUO6); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64))); - - unsigned long long int* directions; - const size_t size = dimensions * 64 * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&directions, size)); - HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - } + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); - ~runner() - { - HIP_CHECK(hipFree(states)); - } + HIP_CHECK(hipFree(directions)); + } - template - void 
generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, generator); + } }; -template<> -struct runner -{ - rocrand_state_scrambled_sobol64* states; - size_t dimensions; - - runner(const size_t dimensions, - const size_t blocks, - const size_t threads, - const unsigned long long /* seed */, - const unsigned long long offset) - { - this->dimensions = dimensions; - - const unsigned long long* h_directions; - const unsigned long long* h_constants; - - rocrand_get_direction_vectors64(&h_directions, - ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6); - rocrand_get_scramble_constants64(&h_constants); - - const size_t states_size = blocks * threads * dimensions; - HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_scrambled_sobol64))); - - unsigned long long int* directions; - const size_t directions_size = dimensions * 64 * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&directions, directions_size)); - HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, hipMemcpyHostToDevice)); - - unsigned long long int* scramble_constants; - const size_t constants_size = dimensions * sizeof(unsigned long long int); - HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); - HIP_CHECK( - 
hipMemcpy(scramble_constants, h_constants, constants_size, hipMemcpyHostToDevice)); - - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - 0, - states, - directions, - scramble_constants, - offset); - - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - HIP_CHECK(hipFree(directions)); - HIP_CHECK(hipFree(scramble_constants)); - } +template <> struct runner { + rocrand_state_scrambled_sobol32 *states; + size_t dimensions; + + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; + + const unsigned int *h_directions; + const unsigned int *h_constants; + + ROCRAND_CHECK(rocrand_get_direction_vectors32( + &h_directions, ROCRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)); + ROCRAND_CHECK(rocrand_get_scramble_constants32(&h_constants)); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, + states_size * sizeof(rocrand_state_scrambled_sobol32))); + + unsigned int *directions; + const size_t directions_size = dimensions * 32 * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&directions, directions_size)); + HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, + hipMemcpyHostToDevice)); + + unsigned int *scramble_constants; + const size_t constants_size = dimensions * sizeof(unsigned int); + HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); + HIP_CHECK(hipMemcpy(scramble_constants, h_constants, constants_size, + hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, scramble_constants, + static_cast(offset)); + + HIP_CHECK(hipGetLastError()); + 
HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + HIP_CHECK(hipFree(scramble_constants)); + } + + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, generator); + } +}; - ~runner() - { - HIP_CHECK(hipFree(states)); - } +template <> struct runner { + rocrand_state_sobol64 *states; + size_t dimensions; - template - void generate(const size_t blocks, - const size_t threads, - hipStream_t stream, - T* data, - const size_t size, - const Generator& generator) - { - const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), - dim3(blocks_x, dimensions), - dim3(threads), - 0, - stream, - states, - data, - size / dimensions, - generator); - } -}; + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; -// Provide optional create and destroy functions for the generators. 
-struct generator_type -{ - static void create() {} + const unsigned long long *h_directions; + rocrand_get_direction_vectors64(&h_directions, + ROCRAND_DIRECTION_VECTORS_64_JOEKUO6); - static void destroy() {} -}; + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, states_size * sizeof(rocrand_state_sobol64))); -template -struct generator_uint : public generator_type -{ - typedef unsigned int data_type; + unsigned long long int *directions; + const size_t size = dimensions * 64 * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&directions, size)); + HIP_CHECK(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); - std::string name() - { - return "uniform-uint"; - } + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, offset); - __device__ data_type operator()(Engine* state) const - { - return rocrand(state); - } -}; + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); -template -struct generator_ullong : public generator_type -{ - typedef unsigned long long int data_type; + HIP_CHECK(hipFree(directions)); + } - std::string name() - { - return "uniform-ullong"; - } + ~runner() { HIP_CHECK(hipFree(states)); } - __device__ data_type operator()(Engine* state) const - { - return rocrand(state); - } + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, generator); + } }; -template -struct generator_uniform : public generator_type -{ - typedef float data_type; +template <> struct runner { + rocrand_state_scrambled_sobol64 
*states; + size_t dimensions; + + runner(const size_t dimensions, const size_t blocks, const size_t threads, + const unsigned long long /* seed */, const unsigned long long offset) { + this->dimensions = dimensions; + + const unsigned long long *h_directions; + const unsigned long long *h_constants; + + rocrand_get_direction_vectors64( + &h_directions, ROCRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6); + rocrand_get_scramble_constants64(&h_constants); + + const size_t states_size = blocks * threads * dimensions; + HIP_CHECK(hipMalloc(&states, + states_size * sizeof(rocrand_state_scrambled_sobol64))); + + unsigned long long int *directions; + const size_t directions_size = + dimensions * 64 * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&directions, directions_size)); + HIP_CHECK(hipMemcpy(directions, h_directions, directions_size, + hipMemcpyHostToDevice)); + + unsigned long long int *scramble_constants; + const size_t constants_size = dimensions * sizeof(unsigned long long int); + HIP_CHECK(hipMalloc(&scramble_constants, constants_size)); + HIP_CHECK(hipMemcpy(scramble_constants, h_constants, constants_size, + hipMemcpyHostToDevice)); + + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(init_scrambled_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, + directions, scramble_constants, offset); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + HIP_CHECK(hipFree(directions)); + HIP_CHECK(hipFree(scramble_constants)); + } + + ~runner() { HIP_CHECK(hipFree(states)); } + + template + void generate(const size_t blocks, const size_t threads, hipStream_t stream, + T *data, const size_t size, const Generator &generator) { + const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); + hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_sobol_kernel), + dim3(blocks_x, dimensions), dim3(threads), 0, stream, + states, data, size / dimensions, 
generator); + } +}; - std::string name() - { - return "uniform-float"; - } +// Provide optional create and destroy functions for the generators. +struct generator_type { + static void create() {} - __device__ data_type operator()(Engine* state) const - { - return rocrand_uniform(state); - } + static void destroy() {} }; -template -struct generator_uniform_double : public generator_type -{ - typedef double data_type; +template struct generator_uint : public generator_type { + typedef unsigned int data_type; - std::string name() - { - return "uniform-double"; - } + std::string name() { return "uniform-uint"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_uniform_double(state); - } + __device__ data_type operator()(Engine *state) const { + return rocrand(state); + } }; -template -struct generator_normal : public generator_type -{ - typedef float data_type; +template struct generator_ullong : public generator_type { + typedef unsigned long long int data_type; - std::string name() - { - return "normal-float"; - } + std::string name() { return "uniform-ullong"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_normal(state); - } + __device__ data_type operator()(Engine *state) const { + return rocrand(state); + } }; -template -struct generator_normal_double : public generator_type -{ - typedef double data_type; +template struct generator_uniform : public generator_type { + typedef float data_type; - std::string name() - { - return "normal-double"; - } + std::string name() { return "uniform-float"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_normal_double(state); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_uniform(state); + } }; -template -struct generator_log_normal : public generator_type -{ - typedef float data_type; +template +struct generator_uniform_double : public generator_type { + typedef double data_type; - std::string name() - { - 
return "log-normal-float"; - } + std::string name() { return "uniform-double"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_log_normal(state, 0.f, 1.f); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_uniform_double(state); + } }; -template -struct generator_log_normal_double : public generator_type -{ - typedef double data_type; +template struct generator_normal : public generator_type { + typedef float data_type; - std::string name() - { - return "log-normal-double"; - } + std::string name() { return "normal-float"; } - __device__ data_type operator()(Engine* state) const - { - return rocrand_log_normal_double(state, 0., 1.); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_normal(state); + } }; -template -struct generator_poisson : public generator_type -{ - typedef unsigned int data_type; +template +struct generator_normal_double : public generator_type { + typedef double data_type; - std::string name() - { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << lambda; - return "poisson(lambda=" + stream.str() + ")"; - } + std::string name() { return "normal-double"; } - __device__ data_type operator()(Engine* state) - { - return rocrand_poisson(state, lambda); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_normal_double(state); + } +}; - double lambda; +template struct generator_log_normal : public generator_type { + typedef float data_type; + + std::string name() { return "log-normal-float"; } + + __device__ data_type operator()(Engine *state) const { + return rocrand_log_normal(state, 0.f, 1.f); + } }; -template -struct generator_discrete_poisson : public generator_type -{ - typedef unsigned int data_type; +template +struct generator_log_normal_double : public generator_type { + typedef double data_type; - std::string name() - { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << lambda; - 
return "discrete-poisson(lambda=" + stream.str() + ")"; - } + std::string name() { return "log-normal-double"; } - void create() - { - ROCRAND_CHECK(rocrand_create_poisson_distribution(lambda, &discrete_distribution)); - } + __device__ data_type operator()(Engine *state) const { + return rocrand_log_normal_double(state, 0., 1.); + } +}; - void destroy() - { - ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); - } +template struct generator_poisson : public generator_type { + typedef unsigned int data_type; - __device__ data_type operator()(Engine* state) - { - return rocrand_discrete(state, discrete_distribution); - } + std::string name() { + std::stringstream stream; + stream << std::fixed << std::setprecision(1) << lambda; + return "poisson(lambda=" + stream.str() + ")"; + } + + __device__ data_type operator()(Engine *state) { + return rocrand_poisson(state, lambda); + } - rocrand_discrete_distribution discrete_distribution; - double lambda; + double lambda; }; -template -struct generator_discrete_custom : public generator_type -{ - typedef unsigned int data_type; +template +struct generator_discrete_poisson : public generator_type { + typedef unsigned int data_type; - std::string name() - { - return "discrete-custom"; - } + std::string name() { + std::stringstream stream; + stream << std::fixed << std::setprecision(1) << lambda; + return "discrete-poisson(lambda=" + stream.str() + ")"; + } - void create() - { - const unsigned int offset = 1234; - std::vector probabilities = {10, 10, 1, 120, 8, 6, 140, 2, 150, 150, 10, 80}; - - double sum = std::accumulate(probabilities.begin(), probabilities.end(), 0.); - std::transform(probabilities.begin(), - probabilities.end(), - probabilities.begin(), - [=](double p) { return p / sum; }); - ROCRAND_CHECK(rocrand_create_discrete_distribution(probabilities.data(), - probabilities.size(), - offset, - &discrete_distribution)); - } + void create() { + ROCRAND_CHECK( + 
rocrand_create_poisson_distribution(lambda, &discrete_distribution)); + } - void destroy() - { - ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); - } + void destroy() { + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } - __device__ data_type operator()(Engine* state) - { - return rocrand_discrete(state, discrete_distribution); - } + __device__ data_type operator()(Engine *state) { + return rocrand_discrete(state, discrete_distribution); + } - rocrand_discrete_distribution discrete_distribution; + rocrand_discrete_distribution discrete_distribution; + double lambda; }; -struct benchmark_context -{ - size_t size; - size_t dimensions; - size_t trials; - size_t blocks; - size_t threads; - std::vector lambdas; -}; +template +struct generator_discrete_custom : public generator_type { + typedef unsigned int data_type; -template -void run_benchmark(benchmark::State& state, - const hipStream_t stream, - const benchmark_context& context, - Generator generator) -{ - typedef typename Generator::data_type data_type; - - const size_t size = context.size; - const size_t dimensions = context.dimensions; - const size_t trials = context.trials; - const size_t blocks = context.blocks; - const size_t threads = context.threads; - - // Optional initialization of the generator - generator.create(); - - data_type* data; - HIP_CHECK(hipMalloc(&data, size * sizeof(data_type))); - - constexpr unsigned long long int seed = 12345ULL; - constexpr unsigned long long int offset = 6789ULL; - - runner r(dimensions, blocks, threads, seed, offset); - - // Warm-up - for(size_t i = 0; i < 5; i++) - { - r.generate(blocks, threads, stream, data, size, generator); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - } + std::string name() { return "discrete-custom"; } - // Measurement - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - for(auto _ : state) - { - 
HIP_CHECK(hipEventRecord(start, stream)); - for(size_t i = 0; i < trials; i++) - { - r.generate(blocks, threads, stream, data, size, generator); - } - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed; - HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); - - state.SetIterationTime(elapsed / 1000.f); - } - state.SetBytesProcessed(trials * state.iterations() * size * sizeof(data_type)); - state.SetItemsProcessed(trials * state.iterations() * size); + void create() { + const unsigned int offset = 1234; + std::vector probabilities = {10, 10, 1, 120, 8, 6, + 140, 2, 150, 150, 10, 80}; - // Optional de-initialization of the generator - generator.destroy(); + double sum = + std::accumulate(probabilities.begin(), probabilities.end(), 0.); + std::transform(probabilities.begin(), probabilities.end(), + probabilities.begin(), [=](double p) { return p / sum; }); + ROCRAND_CHECK(rocrand_create_discrete_distribution( + probabilities.data(), probabilities.size(), offset, + &discrete_distribution)); + } - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipFree(data)); -} + void destroy() { + ROCRAND_CHECK(rocrand_destroy_discrete_distribution(discrete_distribution)); + } -template -void add_benchmark(const benchmark_context& context, - const hipStream_t stream, - std::vector& benchmarks, - const std::string& name, - Generator generator) -{ - static_assert(std::is_trivially_copyable::value - && std::is_trivially_destructible::value, - "Generator gets copied to device at kernel launch."); - const std::string benchmark_name = "device_kernel<" + name + "," + generator.name() + ">"; - benchmarks.emplace_back(benchmark::RegisterBenchmark(benchmark_name.c_str(), - &run_benchmark, - stream, - context, - generator)); -} + __device__ data_type operator()(Engine *state) { + return rocrand_discrete(state, discrete_distribution); + } -template -void add_benchmarks(const benchmark_context& ctx, - const 
hipStream_t stream, - std::vector& benchmarks, - const rocrand_rng_type engine_type) -{ - constexpr bool is_64_bits = std::is_same::value - || std::is_same::value - || std::is_same::value - || std::is_same::value; - - const std::string name = engine_name(engine_type); - - if(is_64_bits) - { - add_benchmark(ctx, stream, benchmarks, name, generator_ullong()); - } - else - { - add_benchmark(ctx, stream, benchmarks, name, generator_uint()); - } + rocrand_discrete_distribution discrete_distribution; +}; - add_benchmark(ctx, stream, benchmarks, name, generator_uniform()); - add_benchmark(ctx, stream, benchmarks, name, generator_uniform_double()); - add_benchmark(ctx, stream, benchmarks, name, generator_normal()); - add_benchmark(ctx, stream, benchmarks, name, generator_normal_double()); - add_benchmark(ctx, stream, benchmarks, name, generator_log_normal()); - add_benchmark(ctx, stream, benchmarks, name, generator_log_normal_double()); - - for(size_t i = 0; i < ctx.lambdas.size(); i++) - { - generator_poisson gen_poisson; - gen_poisson.lambda = ctx.lambdas[i]; - add_benchmark(ctx, stream, benchmarks, name, gen_poisson); - } +struct benchmark_context { + size_t size; + size_t dimensions; + size_t trials; + size_t blocks; + size_t threads; + std::vector lambdas; +}; - for(size_t i = 0; i < ctx.lambdas.size(); i++) - { - generator_discrete_poisson gen_discrete_poisson; - gen_discrete_poisson.lambda = ctx.lambdas[i]; - add_benchmark(ctx, stream, benchmarks, name, gen_discrete_poisson); - } +template +void run_benchmark(benchmark::State &state, const hipStream_t stream, + const benchmark_context &context, Generator generator) { + typedef typename Generator::data_type data_type; + + const size_t size = context.size; + const size_t dimensions = context.dimensions; + const size_t trials = context.trials; + const size_t blocks = context.blocks; + const size_t threads = context.threads; + + // Optional initialization of the generator + generator.create(); + + data_type *data; + 
HIP_CHECK(hipMalloc(&data, size * sizeof(data_type))); + + constexpr unsigned long long int seed = 12345ULL; + constexpr unsigned long long int offset = 6789ULL; + + runner r(dimensions, blocks, threads, seed, offset); + + // Warm-up + for (size_t i = 0; i < 5; i++) { + r.generate(blocks, threads, stream, data, size, generator); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + } + + // Measurement + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + for (auto _ : state) { + HIP_CHECK(hipEventRecord(start, stream)); + for (size_t i = 0; i < trials; i++) { + r.generate(blocks, threads, stream, data, size, generator); + } + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed; + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); + + state.SetIterationTime(elapsed / 1000.f); + } + state.SetBytesProcessed(trials * state.iterations() * size * + sizeof(data_type)); + state.SetItemsProcessed(trials * state.iterations() * size); + + // Optional de-initialization of the generator + generator.destroy(); + + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipFree(data)); +} - add_benchmark(ctx, stream, benchmarks, name, generator_discrete_custom()); +template +void add_benchmark(const benchmark_context &context, const hipStream_t stream, + std::vector &benchmarks, + const std::string &name, Generator generator) { + static_assert(std::is_trivially_copyable::value && + std::is_trivially_destructible::value, + "Generator gets copied to device at kernel launch."); + const std::string benchmark_name = + "device_kernel<" + name + "," + generator.name() + ">"; + benchmarks.emplace_back(benchmark::RegisterBenchmark( + benchmark_name.c_str(), &run_benchmark, stream, + context, generator)); } -int main(int argc, char* argv[]) -{ - benchmark::Initialize(&argc, argv); - - cli::Parser parser(argc, argv); - parser.set_optional("size", 
"size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("dimensions", - "dimensions", - 1, - "number of dimensions of quasi-random values"); - parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional("blocks", "blocks", 256, "number of blocks"); - parser.set_optional("threads", "threads", 256, "number of threads in each block"); - parser.set_optional>( - "lambda", - "lambda", - {10.0}, - "space-separated list of lambdas of Poisson distribution"); - parser.run_and_exit_if_error(); - - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - add_common_benchmark_rocrand_info(); - - benchmark_context ctx{}; - - ctx.size = parser.get("size"); - ctx.dimensions = parser.get("dimensions"); - ctx.trials = parser.get("trials"); - ctx.blocks = parser.get("blocks"); - ctx.threads = parser.get("threads"); - ctx.lambdas = parser.get>("lambda"); - - benchmark::AddCustomContext("size", std::to_string(ctx.size)); - benchmark::AddCustomContext("dimensions", std::to_string(ctx.dimensions)); - benchmark::AddCustomContext("trials", std::to_string(ctx.trials)); - benchmark::AddCustomContext("blocks", std::to_string(ctx.blocks)); - benchmark::AddCustomContext("threads", std::to_string(ctx.threads)); - - std::vector benchmarks = {}; - - // MT19937 has no kernel implementation - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_LFSR113); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MRG31K3P); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MRG32K3A); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_MTGP32); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_PHILOX4_32_10); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SOBOL32); - add_benchmarks(ctx, stream, benchmarks, 
ROCRAND_RNG_QUASI_SOBOL64); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY2_64_20); - add_benchmarks(ctx, - stream, - benchmarks, - ROCRAND_RNG_PSEUDO_THREEFRY4_64_20); - add_benchmarks(ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_XORWOW); - - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); - } +template +void add_benchmarks(const benchmark_context &ctx, const hipStream_t stream, + std::vector &benchmarks, + const rocrand_rng_type engine_type) { + constexpr bool is_64_bits = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; + + const std::string name = engine_name(engine_type); + + if (is_64_bits) { + add_benchmark(ctx, stream, benchmarks, name, + generator_ullong()); + } else { + add_benchmark(ctx, stream, benchmarks, name, + generator_uint()); + } + + add_benchmark(ctx, stream, benchmarks, name, + generator_uniform()); + add_benchmark(ctx, stream, benchmarks, name, + generator_uniform_double()); + add_benchmark(ctx, stream, benchmarks, name, + generator_normal()); + add_benchmark(ctx, stream, benchmarks, name, + generator_normal_double()); + add_benchmark(ctx, stream, benchmarks, name, + generator_log_normal()); + add_benchmark(ctx, stream, benchmarks, name, + generator_log_normal_double()); + + for (size_t i = 0; i < ctx.lambdas.size(); i++) { + generator_poisson gen_poisson; + gen_poisson.lambda = ctx.lambdas[i]; + add_benchmark(ctx, stream, benchmarks, name, gen_poisson); + } + + for (size_t i = 0; i < ctx.lambdas.size(); i++) { + generator_discrete_poisson gen_discrete_poisson; + gen_discrete_poisson.lambda = ctx.lambdas[i]; + add_benchmark(ctx, stream, benchmarks, name, gen_discrete_poisson); + } + + add_benchmark(ctx, stream, benchmarks, name, + 
generator_discrete_custom()); +} +int main(int argc, char *argv[]) { + + // get the out format and out file name thats being passed into + // benchmark::Initialize() + std::string outFormat = ""; + std::string outFile = ""; + std::string filter = ""; + for (int i = 1; i < argc; i++) { + std::string input(argv[i]); + + int equalPos = input.find("="); + std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); + std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); + + if (arg == "benchmark_out_format") + outFormat = argVal; + else if (arg == "benchmark_out") + outFile = argVal; + else if (arg == "benchmark_filter") + filter = argVal; + } + + benchmark::Initialize(&argc, argv); + + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_RAND_N, + "number of values"); + parser.set_optional("dimensions", "dimensions", 1, + "number of dimensions of quasi-random values"); + parser.set_optional("trials", "trials", 20, "number of trials"); + parser.set_optional("blocks", "blocks", 256, "number of blocks"); + parser.set_optional("threads", "threads", 256, + "number of threads in each block"); + parser.set_optional>( + "lambda", "lambda", {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.run_and_exit_if_error(); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + add_common_benchmark_rocrand_info(); + + benchmark_context ctx{}; + + ctx.size = parser.get("size"); + ctx.dimensions = parser.get("dimensions"); + ctx.trials = parser.get("trials"); + ctx.blocks = parser.get("blocks"); + ctx.threads = parser.get("threads"); + ctx.lambdas = parser.get>("lambda"); + + benchmark::AddCustomContext("size", std::to_string(ctx.size)); + benchmark::AddCustomContext("dimensions", std::to_string(ctx.dimensions)); + benchmark::AddCustomContext("trials", std::to_string(ctx.trials)); + benchmark::AddCustomContext("blocks", std::to_string(ctx.blocks)); + 
benchmark::AddCustomContext("threads", std::to_string(ctx.threads)); + + std::vector benchmarks = {}; + + // MT19937 has no kernel implementation + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_LFSR113); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_MRG31K3P); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_MRG32K3A); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_MTGP32); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_PHILOX4_32_10); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_QUASI_SOBOL32); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_QUASI_SOBOL64); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY2_32_20); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY4_32_20); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20); + add_benchmarks( + ctx, stream, benchmarks, ROCRAND_RNG_PSEUDO_THREEFRY4_64_20); + add_benchmarks(ctx, stream, benchmarks, + ROCRAND_RNG_PSEUDO_XORWOW); + + // Use manual timing + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + if (outFormat == "csv") { + std::string spec = (filter == "" || filter == "all") ? "." 
: filter; + std::ofstream output_file; + + benchmark::ConsoleReporter console_reporter; + benchmark::customCSVReporter csv_reporter; + + auto &Err = console_reporter.GetErrorStream(); + + csv_reporter.SetOutputStream(&output_file); + csv_reporter.SetErrorStream(&Err); + + benchmark::BenchmarkReporter *console_ptr = &console_reporter; + benchmark::BenchmarkReporter *csv_ptr = &csv_reporter; + + benchmark::RunSpecifiedBenchmarks(console_ptr, csv_ptr, spec); + + } else { // Run benchmarks benchmark::RunSpecifiedBenchmarks(); - HIP_CHECK(hipStreamDestroy(stream)); + } + HIP_CHECK(hipStreamDestroy(stream)); - return 0; + return 0; } diff --git a/benchmark/benchmark_rocrand_host_api.cpp b/benchmark/benchmark_rocrand_host_api.cpp index c29b21b91..8cac13d30 100644 --- a/benchmark/benchmark_rocrand_host_api.cpp +++ b/benchmark/benchmark_rocrand_host_api.cpp @@ -23,10 +23,11 @@ #include +#include "custom_csv_formater.hpp" +#include #include -#include - #include +#include #include #include @@ -34,175 +35,175 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; #endif +typedef std::unique_ptr PtrType; typedef rocrand_rng_type rng_type_t; -template -using generate_func_type = std::function; - -template -void run_benchmark(benchmark::State& state, - generate_func_type generate_func, - const size_t size, - const bool byte_size, - const size_t trials, - const size_t dimensions, - const size_t offset, - const rng_type_t rng_type, - const rocrand_ordering ordering, - const bool benchmark_host, - hipStream_t stream) -{ - const size_t binary_div = byte_size ? 
sizeof(T) : 1; - const size_t rounded_size = (size / binary_div / dimensions) * dimensions; - - T* data; - rocrand_generator generator; - - if(benchmark_host) - { - data = new T[rounded_size]; - ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); - } - else - { - HIP_CHECK(hipMalloc(&data, rounded_size * sizeof(T))); - ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); - } - - ROCRAND_CHECK(rocrand_set_ordering(generator, ordering)); - - rocrand_status status = rocrand_set_quasi_random_generator_dimensions(generator, dimensions); - if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random - { - ROCRAND_CHECK(status); - } - - ROCRAND_CHECK(rocrand_set_stream(generator, stream)); - - status = rocrand_set_offset(generator, offset); - if(status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random - { - ROCRAND_CHECK(status); - } - - // Warm-up - for(size_t i = 0; i < 15; i++) - { - ROCRAND_CHECK(generate_func(generator, data, rounded_size)); - } - HIP_CHECK(hipDeviceSynchronize()); - - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - for(auto _ : state) - { - HIP_CHECK(hipEventRecord(start, stream)); - for(size_t i = 0; i < trials; i++) - { - ROCRAND_CHECK(generate_func(generator, data, rounded_size)); - } - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed = 0.0f; - HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); - - state.SetIterationTime(elapsed / 1000.f); - } - state.SetBytesProcessed(trials * state.iterations() * rounded_size * sizeof(T)); - state.SetItemsProcessed(trials * state.iterations() * rounded_size); - - HIP_CHECK(hipEventDestroy(stop)); - HIP_CHECK(hipEventDestroy(start)); - ROCRAND_CHECK(rocrand_destroy_generator(generator)); - if(benchmark_host) - { - delete[] data; - } - else - { - HIP_CHECK(hipFree(data)); +template +using generate_func_type = + std::function; + +template +void 
run_benchmark(benchmark::State &state, generate_func_type generate_func, + const size_t size, const bool byte_size, const size_t trials, + const size_t dimensions, const size_t offset, + const rng_type_t rng_type, const rocrand_ordering ordering, + const bool benchmark_host, hipStream_t stream) { + const size_t binary_div = byte_size ? sizeof(T) : 1; + const size_t rounded_size = (size / binary_div / dimensions) * dimensions; + + T *data; + rocrand_generator generator; + + if (benchmark_host) { + data = new T[rounded_size]; + ROCRAND_CHECK(rocrand_create_generator_host(&generator, rng_type)); + } else { + HIP_CHECK(hipMalloc(&data, rounded_size * sizeof(T))); + ROCRAND_CHECK(rocrand_create_generator(&generator, rng_type)); + } + + ROCRAND_CHECK(rocrand_set_ordering(generator, ordering)); + + rocrand_status status = + rocrand_set_quasi_random_generator_dimensions(generator, dimensions); + if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random + { + ROCRAND_CHECK(status); + } + + ROCRAND_CHECK(rocrand_set_stream(generator, stream)); + + status = rocrand_set_offset(generator, offset); + if (status != ROCRAND_STATUS_TYPE_ERROR) // If the RNG is not pseudo-random + { + ROCRAND_CHECK(status); + } + + // Warm-up + for (size_t i = 0; i < 15; i++) { + ROCRAND_CHECK(generate_func(generator, data, rounded_size)); + } + HIP_CHECK(hipDeviceSynchronize()); + + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + for (auto _ : state) { + HIP_CHECK(hipEventRecord(start, stream)); + for (size_t i = 0; i < trials; i++) { + ROCRAND_CHECK(generate_func(generator, data, rounded_size)); } + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed = 0.0f; + HIP_CHECK(hipEventElapsedTime(&elapsed, start, stop)); + + state.SetIterationTime(elapsed / 1000.f); + } + state.SetBytesProcessed(trials * state.iterations() * rounded_size * + sizeof(T)); + state.SetItemsProcessed(trials * 
state.iterations() * rounded_size); + + HIP_CHECK(hipEventDestroy(stop)); + HIP_CHECK(hipEventDestroy(start)); + ROCRAND_CHECK(rocrand_destroy_generator(generator)); + + if (benchmark_host) { + delete[] data; + } else { + HIP_CHECK(hipFree(data)); + } } -int main(int argc, char* argv[]) -{ - - // Parse argv - benchmark::Initialize(&argc, argv); - - cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("byte-size", - "byte-size", - false, - "--size is interpreted as the number of generated bytes"); - parser.set_optional("dimensions", - "dimensions", - 1, - "number of dimensions of quasi-random values"); - parser.set_optional("offset", "offset", 0, "offset of generated pseudo-random values"); - parser.set_optional("trials", "trials", 20, "number of trials"); - parser.set_optional>( - "lambda", - "lambda", - {10.0}, - "space-separated list of lambdas of Poisson distribution"); - parser.set_optional("host", - "host", - false, - "run benchmarks on the host instead of on the device"); - parser.run_and_exit_if_error(); - - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - - // Benchmark info - add_common_benchmark_rocrand_info(); - - const size_t size = parser.get("size"); - const bool byte_size = parser.get("byte-size"); - const size_t trials = parser.get("trials"); - const size_t dimensions = parser.get("dimensions"); - const size_t offset = parser.get("offset"); - const std::vector poisson_lambdas = parser.get>("lambda"); - const bool benchmark_host = parser.get("host"); - - benchmark::AddCustomContext("size", std::to_string(size)); - benchmark::AddCustomContext("byte-size", std::to_string(byte_size)); - benchmark::AddCustomContext("trials", std::to_string(trials)); - benchmark::AddCustomContext("dimensions", std::to_string(dimensions)); - benchmark::AddCustomContext("offset", std::to_string(offset)); - benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host)); - - 
std::vector benchmarked_engine_types{ROCRAND_RNG_PSEUDO_LFSR113, - ROCRAND_RNG_PSEUDO_MRG31K3P, - ROCRAND_RNG_PSEUDO_MRG32K3A, - ROCRAND_RNG_PSEUDO_MTGP32, - ROCRAND_RNG_PSEUDO_MT19937, - ROCRAND_RNG_PSEUDO_PHILOX4_32_10, - ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, - ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, - ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, - ROCRAND_RNG_PSEUDO_XORWOW, - ROCRAND_RNG_QUASI_SOBOL32, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, - ROCRAND_RNG_QUASI_SOBOL64, - ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; - - const std::map ordering_name_map{ - {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, - { ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, - { ROCRAND_ORDERING_PSEUDO_BEST, "best"}, - {ROCRAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"}, - { ROCRAND_ORDERING_PSEUDO_SEEDED, "seeded"}, - { ROCRAND_ORDERING_QUASI_DEFAULT, "default"}, - }; - - const std::map> benchmarked_orderings{ - // clang-format off +int main(int argc, char *argv[]) { + + // get the out format and out file name thats being passed into + // benchmark::Initialize() + std::string outFormat = ""; + std::string outFile = ""; + std::string filter = ""; + for (int i = 1; i < argc; i++) { + std::string input(argv[i]); + + int equalPos = input.find("="); + std::string arg = std::string(input.begin() + 2, input.begin() + equalPos); + std::string argVal = std::string(input.begin() + 1 + equalPos, input.end()); + + if (arg == "benchmark_out_format") + outFormat = argVal; + else if (arg == "benchmark_out") + outFile = argVal; + else if (arg == "benchmark_filter") + filter = argVal; + } + + // Parse argv + benchmark::Initialize(&argc, argv); + + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_RAND_N, + "number of values"); + parser.set_optional( + "byte-size", "byte-size", false, + "--size is interpreted as the number of generated bytes"); + parser.set_optional("dimensions", "dimensions", 1, + "number of dimensions of quasi-random values"); + 
parser.set_optional("offset", "offset", 0, + "offset of generated pseudo-random values"); + parser.set_optional("trials", "trials", 20, "number of trials"); + parser.set_optional>( + "lambda", "lambda", {10.0}, + "space-separated list of lambdas of Poisson distribution"); + parser.set_optional( + "host", "host", false, + "run benchmarks on the host instead of on the device"); + parser.run_and_exit_if_error(); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + // Benchmark info + add_common_benchmark_rocrand_info(); + + const size_t size = parser.get("size"); + const bool byte_size = parser.get("byte-size"); + const size_t trials = parser.get("trials"); + const size_t dimensions = parser.get("dimensions"); + const size_t offset = parser.get("offset"); + const std::vector poisson_lambdas = + parser.get>("lambda"); + const bool benchmark_host = parser.get("host"); + + benchmark::AddCustomContext("size", std::to_string(size)); + benchmark::AddCustomContext("byte-size", std::to_string(byte_size)); + benchmark::AddCustomContext("trials", std::to_string(trials)); + benchmark::AddCustomContext("dimensions", std::to_string(dimensions)); + benchmark::AddCustomContext("offset", std::to_string(offset)); + benchmark::AddCustomContext("benchmark_host", std::to_string(benchmark_host)); + + std::vector benchmarked_engine_types{ + ROCRAND_RNG_PSEUDO_LFSR113, ROCRAND_RNG_PSEUDO_MRG31K3P, + ROCRAND_RNG_PSEUDO_MRG32K3A, ROCRAND_RNG_PSEUDO_MTGP32, + ROCRAND_RNG_PSEUDO_MT19937, ROCRAND_RNG_PSEUDO_PHILOX4_32_10, + ROCRAND_RNG_PSEUDO_THREEFRY2_32_20, ROCRAND_RNG_PSEUDO_THREEFRY2_64_20, + ROCRAND_RNG_PSEUDO_THREEFRY4_32_20, ROCRAND_RNG_PSEUDO_THREEFRY4_64_20, + ROCRAND_RNG_PSEUDO_XORWOW, ROCRAND_RNG_QUASI_SOBOL32, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, ROCRAND_RNG_QUASI_SOBOL64, + ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64}; + + const std::map ordering_name_map{ + {ROCRAND_ORDERING_PSEUDO_DEFAULT, "default"}, + {ROCRAND_ORDERING_PSEUDO_LEGACY, "legacy"}, + 
{ROCRAND_ORDERING_PSEUDO_BEST, "best"}, + {ROCRAND_ORDERING_PSEUDO_DYNAMIC, "dynamic"}, + {ROCRAND_ORDERING_PSEUDO_SEEDED, "seeded"}, + {ROCRAND_ORDERING_QUASI_DEFAULT, "default"}, + }; + + const std::map> + benchmarked_orderings{ + // clang-format off { ROCRAND_RNG_PSEUDO_MTGP32, {ROCRAND_ORDERING_PSEUDO_DEFAULT, ROCRAND_ORDERING_PSEUDO_DYNAMIC}}, { ROCRAND_RNG_PSEUDO_MT19937, {ROCRAND_ORDERING_PSEUDO_DEFAULT}}, @@ -228,242 +229,174 @@ int main(int argc, char* argv[]) {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL32, {ROCRAND_ORDERING_QUASI_DEFAULT}}, { ROCRAND_RNG_QUASI_SOBOL64, {ROCRAND_ORDERING_QUASI_DEFAULT}}, {ROCRAND_RNG_QUASI_SCRAMBLED_SOBOL64, {ROCRAND_ORDERING_QUASI_DEFAULT}}, - // clang-format on - }; - - const std::string benchmark_name_prefix = "device_generate"; - // Add benchmarks - std::vector benchmarks = {}; - for(const rocrand_rng_type engine_type : benchmarked_engine_types) - { - const std::string name = engine_name(engine_type); - for(const rocrand_ordering ordering : benchmarked_orderings.at(engine_type)) - { - const std::string name_engine_prefix - = benchmark_name_prefix + "<" + name + "," + ordering_name_map.at(ordering) + ","; - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-uint>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned int* data, size_t size_gen) - { return rocrand_generate(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-uchar>").c_str(), - &run_benchmark, - [](rocrand_generator gen, unsigned char* data, size_t size_gen) - { return rocrand_generate_char(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-ushort>").c_str(), - 
&run_benchmark, - [](rocrand_generator gen, unsigned short* data, size_t size_gen) - { return rocrand_generate_short(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half* data, size_t size_gen) - { return rocrand_generate_uniform_half(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float* data, size_t size_gen) - { return rocrand_generate_uniform(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "uniform-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double* data, size_t size_gen) - { return rocrand_generate_uniform_double(gen, data, size_gen); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half* data, size_t size_gen) - { - return rocrand_generate_normal_half(gen, - data, - size_gen, - __float2half(0.0f), - __float2half(1.0f)); - }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float* data, size_t size_gen) - { return rocrand_generate_normal(gen, 
data, size_gen, 0.0f, 1.0f); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "normal-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double* data, size_t size_gen) - { return rocrand_generate_normal_double(gen, data, size_gen, 0.0, 1.0); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-half>").c_str(), - &run_benchmark<__half>, - [](rocrand_generator gen, __half* data, size_t size_gen) - { - return rocrand_generate_log_normal_half(gen, - data, - size_gen, - __float2half(0.0f), - __float2half(1.0f)); - }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-float>").c_str(), - &run_benchmark, - [](rocrand_generator gen, float* data, size_t size_gen) - { return rocrand_generate_log_normal(gen, data, size_gen, 0.0f, 1.0f); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + "log-normal-double>").c_str(), - &run_benchmark, - [](rocrand_generator gen, double* data, size_t size_gen) - { return rocrand_generate_log_normal_double(gen, data, size_gen, 0.0, 1.0); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - - for(auto lambda : poisson_lambdas) - { - const std::string poisson_dis_name - = std::string("poisson(lambda=") + std::to_string(lambda) + ")>"; - benchmarks.emplace_back(benchmark::RegisterBenchmark( - (name_engine_prefix + poisson_dis_name).c_str(), - &run_benchmark, - 
[lambda](rocrand_generator gen, unsigned int* data, size_t size_gen) - { return rocrand_generate_poisson(gen, data, size_gen, lambda); }, - size, - byte_size, - trials, - dimensions, - offset, - engine_type, - ordering, - benchmark_host, - stream)); - } - } - } - // Use manual timing - for(auto& b : benchmarks) - { - b->UseManualTime(); - b->Unit(benchmark::kMillisecond); + // clang-format on + }; + + const std::string benchmark_name_prefix = "device_generate"; + // Add benchmarks + std::vector benchmarks = {}; + for (const rocrand_rng_type engine_type : benchmarked_engine_types) { + const std::string name = engine_name(engine_type); + for (const rocrand_ordering ordering : + benchmarked_orderings.at(engine_type)) { + const std::string name_engine_prefix = + benchmark_name_prefix + "<" + name + "," + + ordering_name_map.at(ordering) + ","; + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-uint>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned int *data, size_t size_gen) { + return rocrand_generate(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-uchar>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned char *data, size_t size_gen) { + return rocrand_generate_char(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-ushort>").c_str(), + &run_benchmark, + [](rocrand_generator gen, unsigned short *data, size_t size_gen) { + return rocrand_generate_short(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + 
"uniform-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half *data, size_t size_gen) { + return rocrand_generate_uniform_half(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float *data, size_t size_gen) { + return rocrand_generate_uniform(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "uniform-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double *data, size_t size_gen) { + return rocrand_generate_uniform_double(gen, data, size_gen); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-half>").c_str(), &run_benchmark<__half>, + [](rocrand_generator gen, __half *data, size_t size_gen) { + return rocrand_generate_normal_half( + gen, data, size_gen, __float2half(0.0f), __float2half(1.0f)); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-float>").c_str(), &run_benchmark, + [](rocrand_generator gen, float *data, size_t size_gen) { + return rocrand_generate_normal(gen, data, size_gen, 0.0f, 1.0f); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "normal-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double *data, size_t size_gen) { + return rocrand_generate_normal_double(gen, data, size_gen, 0.0, + 1.0); + }, + 
size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-half>").c_str(), + &run_benchmark<__half>, + [](rocrand_generator gen, __half *data, size_t size_gen) { + return rocrand_generate_log_normal_half( + gen, data, size_gen, __float2half(0.0f), __float2half(1.0f)); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-float>").c_str(), + &run_benchmark, + [](rocrand_generator gen, float *data, size_t size_gen) { + return rocrand_generate_log_normal(gen, data, size_gen, 0.0f, 1.0f); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + "log-normal-double>").c_str(), + &run_benchmark, + [](rocrand_generator gen, double *data, size_t size_gen) { + return rocrand_generate_log_normal_double(gen, data, size_gen, 0.0, + 1.0); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + + for (auto lambda : poisson_lambdas) { + const std::string poisson_dis_name = + std::string("poisson(lambda=") + std::to_string(lambda) + ")>"; + benchmarks.emplace_back(benchmark::RegisterBenchmark( + (name_engine_prefix + poisson_dis_name).c_str(), + &run_benchmark, + [lambda](rocrand_generator gen, unsigned int *data, + size_t size_gen) { + return rocrand_generate_poisson(gen, data, size_gen, lambda); + }, + size, byte_size, trials, dimensions, offset, engine_type, ordering, + benchmark_host, stream)); + } } + } + + for (auto &b : benchmarks) { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + if (outFormat == "csv") { + std::string spec = (filter == "" || filter == "all") ? "." 
+                           : filter;
+    std::ofstream output_file;
+
+    benchmark::ConsoleReporter console_reporter;
+    benchmark::customCSVReporter csv_reporter;
+
+    auto &Err = console_reporter.GetErrorStream();
+
+    csv_reporter.SetOutputStream(&output_file);
+    csv_reporter.SetErrorStream(&Err);
+
+    benchmark::BenchmarkReporter *console_ptr = &console_reporter;
+    benchmark::BenchmarkReporter *csv_ptr = &csv_reporter;
+
+    benchmark::RunSpecifiedBenchmarks(console_ptr, csv_ptr, spec);
+
+  } else {
     // Run benchmarks
     benchmark::RunSpecifiedBenchmarks();
-    HIP_CHECK(hipStreamDestroy(stream));
+  }
+
+  HIP_CHECK(hipStreamDestroy(stream));
 
-    return 0;
+  return 0;
 }
diff --git a/benchmark/custom_csv_formater.hpp b/benchmark/custom_csv_formater.hpp
new file mode 100644
index 000000000..c5ecb04e9
--- /dev/null
+++ b/benchmark/custom_csv_formater.hpp
@@ -0,0 +1,213 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include <cstddef>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace benchmark {
+
+// CSV reporter that starts every row with the engine and distribution parsed
+// from the benchmark name. The fixed columns are listed in `elements`; every
+// user counter seen in the first reported batch gets one additional column.
+class customCSVReporter : public BenchmarkReporter {
+public:
+  customCSVReporter() : printed_header_(false) {}
+
+  // Prints the basic run context to the error stream. Always returns true.
+  bool ReportContext(const Context &context) override;
+
+  // Writes the CSV header ahead of the first batch, then one row per run.
+  void ReportRuns(const std::vector<Run> &reports) override;
+
+private:
+  // Wraps `s` in double quotes, doubling every embedded double quote.
+  std::string CsvEscape(const std::string &s) {
+    std::string tmp;
+    tmp.reserve(s.size() + 2);
+    for (char c : s) {
+      switch (c) {
+      case '"':
+        tmp += "\"\"";
+        break;
+      default:
+        tmp += c;
+        break;
+      }
+    }
+    return '"' + tmp + '"';
+  }
+
+  // Returns the text for the calculated complexity.
+  std::string GetBigOString(BigO complexity) {
+    switch (complexity) {
+    case oN:
+      return "N";
+    case oNSquared:
+      return "N^2";
+    case oNCubed:
+      return "N^3";
+    case oLogN:
+      return "lgN";
+    case oNLogN:
+      return "NlgN";
+    case o1:
+      return "(1)";
+    default:
+      return "f(N)";
+    }
+  }
+
+  // True for the counters that own a fixed column in `elements`.
+  static bool IsBuiltinCounter(const std::string &name) {
+    return name == "bytes_per_second" || name == "items_per_second";
+  }
+
+  // Splits a name shaped like "prefix<engine,ordering,distribution>..." into
+  // its engine and distribution parts. When a delimiter is missing, the
+  // affected part runs to the end of the remaining text instead of reading
+  // past the end of the string.
+  static void SplitBenchmarkName(const std::string &name, std::string &engine,
+                                 std::string &distribution) {
+    // find() returns npos when '<' is absent and npos + 1 wraps to 0, so such
+    // a name is parsed from its first character.
+    std::string rest = name.substr(name.find('<') + 1);
+    engine = rest.substr(0, rest.find(','));
+    rest.erase(0, engine.size() + 1);  // drop "engine,"
+    rest.erase(0, rest.find(',') + 1); // drop "ordering,"
+    distribution = rest.substr(0, rest.find('>'));
+  }
+
+  // Writes the CSV row of a single run.
+  void PrintRunData(const Run &report);
+
+  bool printed_header_;
+  std::set<std::string> user_counter_names_;
+
+  std::vector<std::string> elements = {
+      "engine",           "distribution", "name",           "iterations",
+      "real_time",        "cpu_time",     "time_unit",      "bytes_per_second",
+      "items_per_second", "label",        "error_occurred", "error_message"};
+};
+
+inline bool customCSVReporter::ReportContext(const Context &context) {
+  PrintBasicContext(&GetErrorStream(), context);
+  return true;
+}
+
+inline void customCSVReporter::ReportRuns(const std::vector<Run> &reports) {
+  std::ostream &Out = GetOutputStream();
+
+  if (!printed_header_) {
+    // Every user counter present in the first batch becomes a column.
+    for (const auto &run : reports) {
+      for (const auto &cnt : run.counters) {
+        if (!IsBuiltinCounter(cnt.first))
+          user_counter_names_.insert(cnt.first);
+      }
+    }
+
+    // Fixed column names are written bare, user counter names are quoted.
+    for (auto B = elements.begin(); B != elements.end();) {
+      Out << *B++;
+      if (B != elements.end())
+        Out << ",";
+    }
+    for (const auto &name : user_counter_names_) {
+      Out << ",\"" << name << "\"";
+    }
+    Out << "\n";
+
+    printed_header_ = true;
+  } else {
+    // The header is already written, so a counter that is not part of it has
+    // no column and is left out of the rows below. Report it on the error
+    // stream.
+    std::ostream &Err = GetErrorStream();
+    for (const auto &run : reports) {
+      for (const auto &cnt : run.counters) {
+        if (IsBuiltinCounter(cnt.first) ||
+            user_counter_names_.count(cnt.first) != 0)
+          continue;
+        Err << "All counters must be present in each run. "
+            << "Counter named \"" << cnt.first
+            << "\" was not in a run after being added to the header\n";
+      }
+    }
+  }
+
+  // print results for each run
+  for (const auto &run : reports) {
+    PrintRunData(run);
+  }
+}
+
+inline void customCSVReporter::PrintRunData(const Run &run) {
+  std::ostream &Out = GetOutputStream();
+
+  const std::string full_name = run.benchmark_name();
+  std::string engine_name;
+  std::string distribution_name;
+  SplitBenchmarkName(full_name, engine_name, distribution_name);
+
+  Out << engine_name << ",";
+  Out << distribution_name << ",";
+  Out << CsvEscape(full_name) << ",";
+  if (run.error_occurred) {
+    // engine, distribution and name are written above and the two error
+    // columns below, so every column in between is left empty. The row goes
+    // to the output stream so that it is completed where it was started.
+    const std::size_t leading_columns = 3;
+    const std::size_t error_columns = 2;
+    Out << std::string(elements.size() - leading_columns - error_columns, ',');
+    Out << "true,";
+    Out << CsvEscape(run.error_message) << "\n";
+    return;
+  }
+
+  // Do not print iteration on bigO and RMS report
+  if (!run.report_big_o && !run.report_rms) {
+    Out << run.iterations;
+  }
+  Out << ",";
+
+  Out << run.GetAdjustedRealTime() << ",";
+  Out << run.GetAdjustedCPUTime() << ",";
+
+  // Do not print timeLabel on bigO and RMS report
+  if (run.report_big_o) {
+    Out << GetBigOString(run.complexity);
+  } else if (!run.report_rms) {
+    Out << GetTimeUnitString(run.time_unit);
+  }
+  Out << ",";
+
+  if (run.counters.find("bytes_per_second") != run.counters.end()) {
+    Out << run.counters.at("bytes_per_second");
+  }
+  Out << ",";
+  if (run.counters.find("items_per_second") != run.counters.end()) {
+    Out << run.counters.at("items_per_second");
+  }
+  Out << ",";
+  if (!run.report_label.empty()) {
+    Out << CsvEscape(run.report_label);
+  }
+  Out << ",,"; // for error_occurred and error_message
+
+  // Print user counters
+  for (const auto &ucn : user_counter_names_) {
+    auto it = run.counters.find(ucn);
+    if (it == run.counters.end()) {
+      Out << ",";
+    } else {
+      Out << "," << it->second;
+    }
+  }
+  Out << '\n';
+}
+
+} // namespace benchmark