From db3a0b4165258e8af9232cd9e37d9714a0ad6a4e Mon Sep 17 00:00:00 2001
From: Robin Kobus
Date: Mon, 5 Oct 2020 18:16:46 +0200
Subject: [PATCH] refactored benchmark; retrieve max key range

---
Review notes (text between "---" and the first "diff --git" is not applied):
- benchmark_query_multi (common.cuh) now creates its CUDA events once and
  destroys them after the timing loop; before, two events leaked per iteration.
- benchmark_query_multi returns 0 when iters == 0 instead of dereferencing
  min_element() of an empty vector.
- Text inside angle brackets was lost from this patch. In common.cuh,
  "template<class HashTable>", "std::vector<float>" and the kernel launch
  configuration were reconstructed from the surrounding code. Other affected
  context lines (e.g. "using hash_table_t = warpcore::CountingHashTable;",
  "storage::key_value::AoSStore>;", the *_benchmark( calls) and the author
  address are left as found and must be checked against the repository.
- Indentation inside the hunks was reconstructed; apply with
  --ignore-whitespace if the context does not match.

 benchmark/Makefile                      | 35 +++++++----------
 benchmark/src/bucket_list_benchmark.cu  | 18 ++++++---
 benchmark/src/common.cuh                | 68 +++++++++++++++++++++++++
 benchmark/src/counting_benchmark.cu     |  4 +-
 benchmark/src/multi_bucket_benchmark.cu | 27 +++++++++----
 benchmark/src/multi_value_benchmark.cu  | 21 +++++++---
 benchmark/src/single_value_benchmark.cu |  4 +-
 7 files changed, 135 insertions(+), 42 deletions(-)

diff --git a/benchmark/Makefile b/benchmark/Makefile
index 8875b36..bb57972 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -10,36 +10,29 @@ NVCCFLAGS := -std=$(STD) $(NVCCGENCODE) --expt-extended-lambda --expt-relaxed-co
 INCS := $(foreach dir, $(INCDIRS), $(wildcard $(dir)/*.cuh $(dir)/*.h $(dir)/*.hpp))
 INCPARAMS := $(addprefix -I, $(INCDIRS))
 
-all: single_value multi_value multi_bucket counting bloom_filter bucket_list
-
-single_value: bin/single_value_benchmark.out
-
-bin/single_value_benchmark.out: ${INCS} | bin
+all: bin/single_value_benchmark.out \
+    bin/multi_value_benchmark.out \
+    bin/multi_bucket_benchmark.out \
+    bin/counting_benchmark.out \
+    bin/bloom_filter_benchmark.out \
+    bin/bucket_list_benchmark.out
+
+bin/single_value_benchmark.out: src/single_value_benchmark.cu src/common.cuh ${INCS} | bin
 	$(NVCC) $(NVCCFLAGS) $(INCPARAMS) src/single_value_benchmark.cu -o bin/single_value_benchmark.out
 
-multi_value: bin/multi_value_benchmark.out
-
-bin/multi_value_benchmark.out: ${INCS} | bin
+bin/multi_value_benchmark.out: src/multi_value_benchmark.cu src/common.cuh ${INCS} | bin
 	$(NVCC) $(NVCCFLAGS) $(INCPARAMS) src/multi_value_benchmark.cu -o bin/multi_value_benchmark.out
 
-multi_bucket: bin/multi_bucket_benchmark.out
-
-bin/multi_bucket_benchmark.out: ${INCS} | bin
+bin/multi_bucket_benchmark.out: src/multi_bucket_benchmark.cu src/common.cuh ${INCS} | bin
 	$(NVCC) $(NVCCFLAGS) $(INCPARAMS) src/multi_bucket_benchmark.cu -o bin/multi_bucket_benchmark.out
 
-counting: bin/counting_benchmark.out
-
-bin/counting_benchmark.out: ${INCS} | bin
+bin/counting_benchmark.out: src/counting_benchmark.cu src/common.cuh ${INCS} | bin
 	$(NVCC) $(NVCCFLAGS) $(INCPARAMS) src/counting_benchmark.cu -o bin/counting_benchmark.out
 
-bloom_filter: bin/bloom_filter_benchmark.out
-
-bin/bloom_filter_benchmark.out: ${INCS} | bin
+bin/bloom_filter_benchmark.out: src/bloom_filter_benchmark.cu src/common.cuh ${INCS} | bin
 	$(NVCC) $(NVCCFLAGS) $(INCPARAMS) src/bloom_filter_benchmark.cu -o bin/bloom_filter_benchmark.out
 
-bucket_list: bin/bucket_list_benchmark.out
-
-bin/bucket_list_benchmark.out: ${INCS} | bin
+bin/bucket_list_benchmark.out: src/bucket_list_benchmark.cu src/common.cuh ${INCS} | bin
 	$(NVCC) $(NVCCFLAGS) $(INCPARAMS) src/bucket_list_benchmark.cu -o bin/bucket_list_benchmark.out
 
 debug: OPT := 0
@@ -57,4 +50,4 @@ clean:
 bin:
 	mkdir -p $@
 
-.PHONY: clean all bin multi_value multi_bucket counting bloom_filter bucket_list
+.PHONY: clean all debug profile
diff --git a/benchmark/src/bucket_list_benchmark.cu b/benchmark/src/bucket_list_benchmark.cu
index 0f952ed..b439e99 100644
--- a/benchmark/src/bucket_list_benchmark.cu
+++ b/benchmark/src/bucket_list_benchmark.cu
@@ -51,8 +51,8 @@ void bucket_list_benchmark(
     const uint64_t key_store_capacity = max_unique_size / key_load_factor;
     const uint64_t value_store_capacity = max_keys / value_load_factor;
 
-    key_t* unique_keys_d = nullptr;
-    cudaMalloc(&unique_keys_d, sizeof(key_t)*max_unique_size); CUERR
+    key_t* query_keys_d = nullptr;
+    cudaMalloc(&query_keys_d, sizeof(key_t)*max_keys); CUERR
     value_t* values_d = nullptr;
     cudaMalloc(&values_d, sizeof(value_t)*max_keys); CUERR
     index_t * offsets_d = nullptr;
@@ -98,9 +98,14 @@ void bucket_list_benchmark(
                 iters, thermal_backoff);
 
             output.query_ms = benchmark_query_multi(
-                hash_table, unique_keys_d, offsets_d, values_d,
+                hash_table, query_keys_d, size,
+                offsets_d, values_d,
                 iters, thermal_backoff);
 
+            // output.query_ms = benchmark_query_unique(
+            //     hash_table, query_keys_d, offsets_d, values_d,
+            //     iters, thermal_backoff);
+
             output.key_load_factor = hash_table.key_load_factor();
             output.value_load_factor = hash_table.value_load_factor();
             output.density = hash_table.storage_density();
@@ -120,7 +125,7 @@ void bucket_list_benchmark(
         }
     }
 
-    cudaFree(unique_keys_d); CUERR
+    cudaFree(query_keys_d); CUERR
     cudaFree(values_d); CUERR
     cudaFree(offsets_d); CUERR
 }
@@ -134,6 +139,8 @@ int main(int argc, char* argv[])
 
     const uint64_t max_keys = 1UL << 27;
 
+    const bool print_headers = true;
+
     uint64_t dev_id = 0;
     if(argc > 2) dev_id = std::atoi(argv[2]);
     cudaSetDevice(dev_id); CUERR
@@ -158,7 +165,8 @@ int main(int argc, char* argv[])
         0.50,
         {max_keys},
         {{1.1, 1, 0}},
-        0x5ad0ded);
+        0x5ad0ded,
+        print_headers);
 
     cudaFree(keys_d); CUERR
 }
diff --git a/benchmark/src/common.cuh b/benchmark/src/common.cuh
index ba19d63..888a2cc 100644
--- a/benchmark/src/common.cuh
+++ b/benchmark/src/common.cuh
@@ -175,6 +175,74 @@ float benchmark_query(
 
+// Fills keys_d with the keys 1..size, times `iters` bulk retrieve() calls on
+// them and returns the fastest run in milliseconds (0 if iters is 0).
+// keys_d must provide room for `size` keys. offsets_d and offsets_d+1 are both
+// handed to retrieve(); presumably offsets_d needs size+1 entries - TODO confirm.
 template<class HashTable>
 float benchmark_query_multi(
+    HashTable& hash_table,
+    typename HashTable::key_type * keys_d,
+    const uint64_t size,
+    typename HashTable::index_type * offsets_d,
+    typename HashTable::value_type * values_d,
+    const uint8_t iters,
+    const std::chrono::milliseconds thermal_backoff)
+{
+    using index_t = typename HashTable::index_type;
+
+    // Nothing to time; also avoids dereferencing min_element() of an empty range.
+    if(iters == 0) return 0.0f;
+
+    // NOTE(review): the original launch configuration was lost from this patch.
+    // size / 1024 + 1 blocks of 1024 threads cover all indices; extras are masked.
+    helpers::lambda_kernel
+    <<<size / 1024 + 1, 1024>>>
+    ([=] DEVICEQUALIFIER
+    {
+        // widen before multiplying so the index cannot wrap in 32 bits
+        const uint64_t tid = uint64_t(blockDim.x) * blockIdx.x + threadIdx.x;
+
+        if(tid < size)
+        {
+            keys_d[tid] = tid + 1;
+        }
+    });
+    cudaDeviceSynchronize(); CUERR
+
+    index_t value_size_out = 0;
+
+    // One event pair is reused by all iterations and destroyed afterwards.
+    cudaEvent_t query_start, query_stop;
+    cudaEventCreate(&query_start);
+    cudaEventCreate(&query_stop);
+
+    std::vector<float> query_times(iters);
+    for(uint64_t i = 0; i < iters; i++)
+    {
+        float t;
+        cudaEventRecord(query_start, 0);
+        hash_table.retrieve(
+            keys_d,
+            size,
+            offsets_d,
+            offsets_d+1,
+            values_d,
+            value_size_out);
+        cudaEventRecord(query_stop, 0);
+        cudaEventSynchronize(query_stop);
+        cudaEventElapsedTime(&t, query_start, query_stop);
+        cudaDeviceSynchronize(); CUERR
+        query_times[i] = t;
+        std::this_thread::sleep_for(thermal_backoff);
+    }
+
+    cudaEventDestroy(query_start);
+    cudaEventDestroy(query_stop);
+
+    return *std::min_element(query_times.begin(), query_times.end());
+}
+
+template<class HashTable>
+float benchmark_query_unique(
     HashTable& hash_table,
     typename HashTable::key_type * unique_keys_d,
     typename HashTable::index_type * offsets_d,
diff --git a/benchmark/src/counting_benchmark.cu b/benchmark/src/counting_benchmark.cu
index 39e2ba5..392b34f 100644
--- a/benchmark/src/counting_benchmark.cu
+++ b/benchmark/src/counting_benchmark.cu
@@ -84,6 +84,8 @@ int main(int argc, char* argv[])
 
     const uint64_t max_keys = 1UL << 28;
 
+    const bool print_headers = true;
+
     uint64_t dev_id = 0;
     if(argc > 2) dev_id = std::atoi(argv[2]);
     cudaSetDevice(dev_id); CUERR
@@ -97,7 +99,7 @@ int main(int argc, char* argv[])
     using hash_table_t = warpcore::CountingHashTable;
 
     counting_benchmark(
-        keys_d, max_keys, {max_keys}, {0.9}, true);
+        keys_d, max_keys, {max_keys}, {0.9}, print_headers);
 
     cudaFree(keys_d); CUERR
 }
\ No newline at end of file
diff --git a/benchmark/src/multi_bucket_benchmark.cu b/benchmark/src/multi_bucket_benchmark.cu
index 0d60ad7..3c4f8a1 100644
--- a/benchmark/src/multi_bucket_benchmark.cu
+++ b/benchmark/src/multi_bucket_benchmark.cu
@@ -22,8 +22,8 @@ void multi_value_benchmark(
 
     const uint64_t max_unique_size = num_unique(keys_d, max_keys);
 
-    key_t* unique_keys_d = nullptr;
-    cudaMalloc(&unique_keys_d, sizeof(key_t)*max_unique_size); CUERR
+    key_t* query_keys_d = nullptr;
+    cudaMalloc(&query_keys_d, sizeof(key_t)*max_keys); CUERR
     value_t* values_d = nullptr;
     cudaMalloc(&values_d, sizeof(value_t)*max_keys); CUERR
     index_t * offsets_d = nullptr;
@@ -75,9 +75,14 @@ void multi_value_benchmark(
             // std::cerr << "keys in set: " << key_set.size() << '\n';
 
             output.query_ms = benchmark_query_multi(
-                hash_table, unique_keys_d, offsets_d, values_d,
+                hash_table, query_keys_d, size,
+                offsets_d, values_d,
                 iters, thermal_backoff);
 
+            // output.query_ms = benchmark_query_unique(
+            //     hash_table, query_keys_d, offsets_d, values_d,
+            //     iters, thermal_backoff);
+
             output.key_load_factor = hash_table.key_load_factor();
             output.value_load_factor = hash_table.value_load_factor();
             output.density = hash_table.storage_density();
@@ -91,7 +96,7 @@ void multi_value_benchmark(
         }
     }
 
-    cudaFree(unique_keys_d); CUERR
+    cudaFree(query_keys_d); CUERR
     cudaFree(values_d); CUERR
     cudaFree(offsets_d); CUERR
 }
@@ -105,6 +110,8 @@ int main(int argc, char* argv[])
 
     const uint64_t max_keys = 1UL << 27;
 
+    const bool print_headers = true;
+
     uint64_t dev_id = 0;
     if(argc > 2) dev_id = std::atoi(argv[2]);
     cudaSetDevice(dev_id); CUERR
@@ -154,22 +161,26 @@ int main(int argc, char* argv[])
     multi_value_benchmark(
         keys_d, max_keys,
         {max_keys},
-        {0.8});
+        {0.8},
+        print_headers);
 
     multi_value_benchmark(
         keys_d, max_keys,
         {max_keys},
-        {0.8});
+        {0.8},
+        print_headers);
 
     multi_value_benchmark(
         keys_d, max_keys,
         {max_keys},
-        {0.8});
+        {0.8},
+        print_headers);
 
     multi_value_benchmark(
         keys_d, max_keys,
         {max_keys},
-        {0.8});
+        {0.8},
+        print_headers);
 
     cudaFree(keys_d); CUERR
 }
diff --git a/benchmark/src/multi_value_benchmark.cu b/benchmark/src/multi_value_benchmark.cu
index c3bc440..32a7022 100755
--- a/benchmark/src/multi_value_benchmark.cu
+++ b/benchmark/src/multi_value_benchmark.cu
@@ -22,8 +22,8 @@ void multi_value_benchmark(
 
     const uint64_t max_unique_size = num_unique(keys_d, max_keys);
 
-    key_t* unique_keys_d = nullptr;
-    cudaMalloc(&unique_keys_d, sizeof(key_t)*max_unique_size); CUERR
+    key_t* query_keys_d = nullptr;
+    cudaMalloc(&query_keys_d, sizeof(key_t)*max_keys); CUERR
     value_t* values_d = nullptr;
     cudaMalloc(&values_d, sizeof(value_t)*max_keys); CUERR
     index_t * offsets_d = nullptr;
@@ -65,9 +65,14 @@ void multi_value_benchmark(
                 iters, thermal_backoff);
 
             output.query_ms = benchmark_query_multi(
-                hash_table, unique_keys_d, offsets_d, values_d,
+                hash_table, query_keys_d, size,
+                offsets_d, values_d,
                 iters, thermal_backoff);
 
+            // output.query_ms = benchmark_query_unique(
+            //     hash_table, query_keys_d, offsets_d, values_d,
+            //     iters, thermal_backoff);
+
             output.key_load_factor = hash_table.load_factor();
             output.density = output.key_load_factor;
             output.status = hash_table.pop_status();
@@ -79,7 +84,7 @@ void multi_value_benchmark(
         }
     }
 
-    cudaFree(unique_keys_d); CUERR
+    cudaFree(query_keys_d); CUERR
     cudaFree(values_d); CUERR
     cudaFree(offsets_d); CUERR
 }
@@ -93,6 +98,8 @@ int main(int argc, char* argv[])
 
     const uint64_t max_keys = 1UL << 27;
 
+    const bool print_headers = true;
+
     uint64_t dev_id = 0;
     if(argc > 2) dev_id = std::atoi(argv[2]);
     cudaSetDevice(dev_id); CUERR
@@ -123,12 +130,14 @@ int main(int argc, char* argv[])
     multi_value_benchmark(
         keys_d, max_keys,
         {max_keys},
-        {0.8});
+        {0.8},
+        print_headers);
 
     // multi_value_benchmark(
     //     keys_d, max_keys,
     //     {max_keys},
-    //     {0.8});
+    //     {0.8},
+    //     print_headers);
 
     cudaFree(keys_d); CUERR
 }
\ No newline at end of file
diff --git a/benchmark/src/single_value_benchmark.cu b/benchmark/src/single_value_benchmark.cu
index 0fd0a58..6351a1d 100644
--- a/benchmark/src/single_value_benchmark.cu
+++ b/benchmark/src/single_value_benchmark.cu
@@ -82,6 +82,8 @@ int main(int argc, char* argv[])
 
     const uint64_t max_keys = 1UL << 27;
 
+    const bool print_headers = true;
+
     uint64_t dev_id = 0;
     if(argc > 2) dev_id = std::atoi(argv[2]);
     cudaSetDevice(dev_id); CUERR
@@ -101,7 +103,7 @@ int main(int argc, char* argv[])
         storage::key_value::AoSStore>;
 
     single_value_benchmark(
-        keys_d, max_keys, {max_keys}, {0.8}, true);
+        keys_d, max_keys, {max_keys}, {0.8}, print_headers);
 
     cudaFree(keys_d); CUERR
 }