From 5b20e9776ef559d59f35857c4da08005f1f5cf9e Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Thu, 11 Jun 2020 09:28:43 -0700 Subject: [PATCH 01/61] updated SlabAlloc to place super blocks on separate arrays --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index d655028..4e27bf1 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit d6550289344cd4474d237e959bfd6bc962981af7 +Subproject commit 4e27bf1c0678a2bd60d5a05e0c020936ebf6c9cb From 7da2ae7964c8996b6d2b718de7fcee6a91dcf23e Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Thu, 11 Jun 2020 16:33:20 -0700 Subject: [PATCH 02/61] buildBulk now reports which keys were successfully inserted --- SlabAlloc | 2 +- src/concurrent_map/cmap_class.cuh | 7 +++- src/concurrent_map/cmap_implementation.cuh | 44 ++++++++++++++++++++-- src/concurrent_map/device/build.cuh | 10 ++++- src/concurrent_map/warp/insert.cuh | 8 +++- 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index 4e27bf1..e9464bb 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 4e27bf1c0678a2bd60d5a05e0c020936ebf6c9cb +Subproject commit e9464bb6d019c1239f608c743fae67b407471bb8 diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 1329740..8967d66 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -86,7 +86,8 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert key-value pairs // into the slab hash - __device__ __forceinline__ void insertPair(bool& to_be_inserted, + __device__ __forceinline__ void insertPair(bool& mySuccess, + bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, @@ -245,9 +246,13 @@ class GpuSlabHash { hf_ = {0u, 0u}; } + std::cout << "Initializing context params " << std::endl; + // initializing the gpu_context_: gpu_context_.initParameters( num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr()); + + std::cout << "Done initializing context params" << std::endl; } ~GpuSlabHash() { diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 99880a8..e4fa4ad 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -21,11 +21,47 @@ void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { + + int h_retry = 1; + int *d_retry; + CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); + CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); + bool* d_success; + CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool))); + CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - // calling the kernel for bulk build: - CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); - build_table_kernel - <<>>(d_key, d_value, num_keys, gpu_context_); + + + + + bool *h_success = (bool*) malloc(num_keys * sizeof(bool)); + + + + + + while(h_retry) { + // calling the kernel for bulk build: + CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); + build_table_kernel + <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); + + // + CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost)); + for(auto i = 0; i < num_keys; ++i) { + if(h_success[i] == false) { + std::cout << "Key " << i << " evaluated to false" << std::endl; + break; + } + } + + std::cout << "Evaluating need to resize" << std::endl; + // resize the pool here if necessary + + CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); + } } template void GpuSlabHash::buildBulkWithUniqueKeys( diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index f48b5e5..3e8895a 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -20,6 +20,8 @@ */ template __global__ void build_table_kernel( + int* d_retry, + bool* d_success, KeyT* d_key, ValueT* d_value, uint32_t num_keys, @@ -34,19 +36,23 @@ __global__ void build_table_kernel( AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); + bool mySuccess = true; KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; bool to_insert = false; if (tid < num_keys) { + mySuccess = d_success[tid]; myKey = d_key[tid]; myValue = d_value[tid]; myBucket = slab_hash.computeBucket(myKey); - to_insert = true; + to_insert = !mySuccess; } - slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + slab_hash.insertPair(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + d_success[tid] = mySuccess; + atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry } template diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 8653ddf..9b7fe0e 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -24,6 +24,7 @@ template __device__ __forceinline__ void GpuSlabHashContext::insertPair( + bool& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, @@ -54,6 +55,10 @@ GpuSlabHashContext::insertPair( if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); + if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased + mySuccess = true; // signal that this key needs to be reinserted + to_be_inserted = false; + } // TODO: experiment if it's better to use lane 0 instead if (laneId == 31) { @@ -88,7 +93,8 @@ GpuSlabHashContext::insertPair( *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) - to_be_inserted = false; // succesfful insertion + mySuccess = true; + to_be_inserted = false; // successful insertion } } last_work_queue = work_queue; From e5257c03e1661914a147a8981cbd72de2050172a Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 12 Jun 2020 08:19:29 -0700 Subject: [PATCH 03/61] buildBulk function can detect if allocation pool needs to be resized --- src/concurrent_map/cmap_implementation.cuh | 22 +++------------------- src/concurrent_map/device/build.cuh | 7 +++++-- src/concurrent_map/warp/insert.cuh | 5 +++-- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index e4fa4ad..cb68beb 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -31,38 +31,21 @@ void GpuSlabHash::buildBulk( CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - - - - - bool *h_success = (bool*) malloc(num_keys * sizeof(bool)); - - - - - while(h_retry) { // calling the kernel for bulk build: CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); build_table_kernel <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); - // - CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost)); - for(auto i = 0; i < num_keys; ++i) { - if(h_success[i] == false) { - std::cout << "Key " << i << " evaluated to false" << std::endl; - break; - } - } - std::cout << "Evaluating need to resize" << std::endl; // resize the pool here if necessary CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); } } + template void GpuSlabHash::buildBulkWithUniqueKeys( KeyT* d_key, @@ -74,6 +57,7 @@ void GpuSlabHash::buildBulkWithUniqu build_table_with_unique_keys_kernel <<>>(d_key, d_value, num_keys, gpu_context_); } + template void GpuSlabHash::searchIndividual( KeyT* d_query, diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index 3e8895a..50c236d 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -51,8 +51,11 @@ __global__ void build_table_kernel( } slab_hash.insertPair(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); - d_success[tid] = mySuccess; - atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry + + if (tid < num_keys) { + d_success[tid] = mySuccess; + atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry + } } template diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 9b7fe0e..dfb127b 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -56,7 +56,7 @@ GpuSlabHashContext::insertPair( // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased - mySuccess = true; // signal that this key needs to be reinserted + mySuccess = false; // signal that this key needs to be reinserted to_be_inserted = false; } @@ -92,9 +92,10 @@ GpuSlabHashContext::insertPair( << 32) | *reinterpret_cast( reinterpret_cast(&myKey))); - if (old_key_value_pair == EMPTY_PAIR_64) + if (old_key_value_pair == EMPTY_PAIR_64) { mySuccess = true; to_be_inserted = false; // successful insertion + } } } last_work_queue = work_queue; From a314c19c053cb4c8dfa97158773b07de094b87e0 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 12 Jun 2020 12:04:03 -0700 Subject: [PATCH 04/61] Added support for resizing dynamic allocation pool --- SlabAlloc | 2 +- src/concurrent_map/cmap_class.cuh | 4 ++++ src/concurrent_map/cmap_implementation.cuh | 26 +++++++++++++++++++--- src/concurrent_map/warp/insert.cuh | 1 + src/slab_hash_global.cuh | 4 ++-- 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index e9464bb..ff0696a 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit e9464bb6d019c1239f608c743fae67b407471bb8 +Subproject commit ff0696a8d0b6829076495263dd2e1dd4b7866d28 diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 8967d66..8bf3836 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -67,6 +67,10 @@ class GpuSlabHashContext { global_allocator_ctx_ = *allocator_ctx; } + __host__ void updateAllocatorContext(AllocatorContextT* allocator_ctx) { + global_allocator_ctx_ = *allocator_ctx; + } + __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { return global_allocator_ctx_; } diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index cb68beb..5844c38 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -21,7 +21,10 @@ void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { - + + bool *h_success = (bool*) malloc(sizeof(bool) * num_keys); + + int h_retry = 1; int *d_retry; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); @@ -31,17 +34,34 @@ void GpuSlabHash::buildBulk( CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); while(h_retry) { + std::cout << "Calling build kernel" << std::endl; // calling the kernel for bulk build: - CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); build_table_kernel <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); + + /* + CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost)); + + int num_false = 0; + for(auto i = 0; i < num_keys; ++i) { + if(h_success[i] == false) num_false++; + } + + std::cout << "numfalse " << num_false << std::endl; + */ std::cout << "Evaluating need to resize" << std::endl; // resize the pool here if necessary - + if(h_retry) { + std::cout << "Resizing pool" << std::endl; + dynamic_allocator_->growPool(); + gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); + std::cout << "Done resizing pool" << std::endl; + } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); } } diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index dfb127b..9955b42 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -58,6 +58,7 @@ GpuSlabHashContext::insertPair( if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased mySuccess = false; // signal that this key needs to be reinserted to_be_inserted = false; + continue; } // TODO: experiment if it's better to use lane 0 instead diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 81c9d17..ace129e 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -127,8 +127,8 @@ class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 8; -constexpr uint32_t num_super_blocks = 32; +constexpr uint32_t log_num_mem_blocks = 12; +constexpr uint32_t num_super_blocks = 2; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From 49161a2c282e3e8222716c30a4f9f127bfc8fbd4 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 12 Jun 2020 14:29:20 -0700 Subject: [PATCH 05/61] removed prints --- src/concurrent_map/cmap_class.cuh | 4 ---- src/concurrent_map/cmap_implementation.cuh | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 8bf3836..c601c30 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -250,13 +250,9 @@ class GpuSlabHash { hf_ = {0u, 0u}; } - std::cout << "Initializing context params " << std::endl; - // initializing the gpu_context_: gpu_context_.initParameters( num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr()); - - std::cout << "Done initializing context params" << std::endl; } ~GpuSlabHash() { diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 5844c38..d46f832 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -36,7 +36,6 @@ void GpuSlabHash::buildBulk( const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); while(h_retry) { - std::cout << "Calling build kernel" << std::endl; // calling the kernel for bulk build: build_table_kernel <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); @@ -54,13 +53,10 @@ void GpuSlabHash::buildBulk( std::cout << "numfalse " << num_false << std::endl; */ - std::cout << "Evaluating need to resize" << std::endl; // resize the pool here if necessary if(h_retry) { - std::cout << "Resizing pool" << std::endl; dynamic_allocator_->growPool(); gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); - std::cout << "Done resizing pool" << std::endl; } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); } From ff5ba413b625efc85d8d1c29c4d0ede0600352ee Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Mon, 15 Jun 2020 16:17:32 -0700 Subject: [PATCH 06/61] added pool growth support for unique bulk insert and temporarily removed some device allocations in gpu_hash_table for testing --- SlabAlloc | 2 +- src/concurrent_map/cmap_class.cuh | 1 + src/concurrent_map/cmap_implementation.cuh | 45 ++++++++++++++-------- src/concurrent_map/device/build.cuh | 13 ++++++- src/concurrent_map/warp/insert.cuh | 11 +++++- src/gpu_hash_table.cuh | 14 +++---- src/slab_hash_global.cuh | 2 +- 7 files changed, 61 insertions(+), 27 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index ff0696a..36cfa6f 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit ff0696a8d0b6829076495263dd2e1dd4b7866d28 +Subproject commit 36cfa6f95cd47fd1825e59f51a5ca7d7192a4713 diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index c601c30..c0dae4b 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -101,6 +101,7 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash __device__ __forceinline__ bool insertPairUnique( + bool& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index d46f832..0a9d9e1 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -21,10 +21,6 @@ void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { - - bool *h_success = (bool*) malloc(sizeof(bool) * num_keys); - - int h_retry = 1; int *d_retry; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); @@ -42,17 +38,6 @@ void GpuSlabHash::buildBulk( CHECK_CUDA_ERROR(cudaDeviceSynchronize()); CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); - /* - CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost)); - - int num_false = 0; - for(auto i = 0; i < num_keys; ++i) { - if(h_success[i] == false) num_false++; - } - - std::cout << "numfalse " << num_false << std::endl; - */ - // resize the pool here if necessary if(h_retry) { dynamic_allocator_->growPool(); @@ -60,6 +45,8 @@ void GpuSlabHash::buildBulk( } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); } + CHECK_CUDA_ERROR(cudaFree(d_retry)); + CHECK_CUDA_ERROR(cudaFree(d_success)); } template @@ -67,11 +54,39 @@ void GpuSlabHash::buildBulkWithUniqu KeyT* d_key, ValueT* d_value, uint32_t num_keys) { + int h_retry = 1; + int *d_retry; + CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); + CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); + bool* d_success; + CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool))); + CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); + + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; + CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); + while(h_retry) { + // calling the kernel for bulk build: + build_table_with_unique_keys_kernel + <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); + + // resize the pool here if necessary + if(h_retry) { + dynamic_allocator_->growPool(); + gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); + } + CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); + } + CHECK_CUDA_ERROR(cudaFree(d_retry)); + CHECK_CUDA_ERROR(cudaFree(d_success)); + /* const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; // calling the kernel for bulk build: CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); build_table_with_unique_keys_kernel <<>>(d_key, d_value, num_keys, gpu_context_); + */ } template diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index 50c236d..f24b5cb 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -60,6 +60,8 @@ __global__ void build_table_kernel( template __global__ void build_table_with_unique_keys_kernel( + int* d_retry, + bool* d_success, KeyT* d_key, ValueT* d_value, uint32_t num_keys, @@ -74,18 +76,25 @@ __global__ void build_table_with_unique_keys_kernel( AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); + bool mySuccess = true; KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; bool to_insert = false; if (tid < num_keys) { + mySuccess = d_success[tid]; myKey = d_key[tid]; myValue = d_value[tid]; myBucket = slab_hash.computeBucket(myKey); - to_insert = true; + to_insert = !mySuccess; } - slab_hash.insertPairUnique( + slab_hash.insertPairUnique(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + + if (tid < num_keys) { + d_success[tid] = mySuccess; + atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry + } } \ No newline at end of file diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 9955b42..4b94d81 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -113,6 +113,7 @@ GpuSlabHashContext::insertPair( template __device__ __forceinline__ bool GpuSlabHashContext::insertPairUnique( + bool& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, @@ -143,14 +144,21 @@ GpuSlabHashContext::insertPairUnique uint32_t isExisting = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & SlabHashT::REGULAR_NODE_KEY_MASK; if (isExisting) { // key exist in the hash table - if (laneId == src_lane) + if (laneId == src_lane) { + mySuccess = true; to_be_inserted = false; + } } else { if (isEmpty == 0) { // no empty slot available: uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); + if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased + mySuccess = false; // signal that this key needs to be reinserted + to_be_inserted = false; + continue; + } if (laneId == 31) { const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) @@ -184,6 +192,7 @@ GpuSlabHashContext::insertPairUnique *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { + mySuccess = true; to_be_inserted = false; // successful insertion new_insertion = true; } diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index 167245e..b41a969 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -72,10 +72,10 @@ class gpu_hash_table { if (req_values_) { CHECK_CUDA_ERROR(cudaMalloc((void**)&d_value_, sizeof(ValueT) * max_keys_)); } - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); - + //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); + //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); + //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); + // allocate an initialize the allocator: dynamic_allocator_ = new DynamicAllocatorT(); @@ -93,9 +93,9 @@ class gpu_hash_table { if (req_values_) { CHECK_CUDA_ERROR(cudaFree(d_value_)); } - CHECK_CUDA_ERROR(cudaFree(d_query_)); - CHECK_CUDA_ERROR(cudaFree(d_result_)); - CHECK_CUDA_ERROR(cudaFree(d_count_)); + //CHECK_CUDA_ERROR(cudaFree(d_query_)); + //CHECK_CUDA_ERROR(cudaFree(d_result_)); + //CHECK_CUDA_ERROR(cudaFree(d_count_)); // delete the dynamic allocator: delete dynamic_allocator_; diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index ace129e..38437a0 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -127,7 +127,7 @@ class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 12; +constexpr uint32_t log_num_mem_blocks = 11; constexpr uint32_t num_super_blocks = 2; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From 44ab2f5e57df9734f1fa2c806c550927c77ead84 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Tue, 16 Jun 2020 11:52:40 -0700 Subject: [PATCH 07/61] added support for preemptive pool growth --- SlabAlloc | 2 +- src/concurrent_map/cmap_class.cuh | 20 ++++++++- src/concurrent_map/cmap_implementation.cuh | 48 +++++++++++++++++----- src/gpu_hash_table.cuh | 8 ++-- 4 files changed, 60 insertions(+), 18 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index 36cfa6f..6abbedb 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 36cfa6f95cd47fd1825e59f51a5ca7d7192a4713 +Subproject commit 6abbedb680bca342863589afd152b12ced0c0fd1 diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index c0dae4b..6b4a9cc 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -31,12 +31,15 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext() - : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) {} + : num_buckets_(0), total_num_slabs_(0), total_num_keys_(0), + hash_x_(0), hash_y_(0), d_table_(nullptr) {} #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); + total_num_slabs_ = rhs.getTotalNumSlabs(); + total_num_keys_ = rhs.getTotalNumKeys(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); d_table_ = rhs.getDeviceTablePointer(); @@ -65,10 +68,17 @@ class GpuSlabHashContext { d_table_ = reinterpret_cast::SlabTypeT*>(d_table); global_allocator_ctx_ = *allocator_ctx; + total_num_slabs_ = num_buckets + global_allocator_ctx_.getNumSlabsInPool(); + total_num_keys_ = 0; } __host__ void updateAllocatorContext(AllocatorContextT* allocator_ctx) { global_allocator_ctx_ = *allocator_ctx; + total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool(); + } + + __host__ void updateTotalNumKeys(uint32_t keysAdded) { + total_num_keys_ += keysAdded; } __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { @@ -80,6 +90,8 @@ class GpuSlabHashContext { return d_table_; } + __device__ __host__ __forceinline__ uint32_t getTotalNumSlabs() {return total_num_slabs_; } + __device__ __host__ __forceinline__ uint32_t getTotalNumKeys() {return total_num_keys_; } __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } @@ -171,6 +183,8 @@ class GpuSlabHashContext { // === members: uint32_t num_buckets_; + uint32_t total_num_slabs_; + uint32_t total_num_keys_; uint32_t hash_x_; uint32_t hash_y_; typename ConcurrentMapT::SlabTypeT* d_table_; @@ -264,7 +278,9 @@ class GpuSlabHash { // returns some debug information about the slab hash std::string to_string(); double computeLoadFactor(int flag); - + + void resize(); + uint32_t checkForPreemptiveResize(uint32_t keysAdded); void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); void buildBulkWithUniqueKeys(KeyT* d_key, ValueT* d_value, uint32_t num_keys); void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 0a9d9e1..30e155d 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -16,6 +16,30 @@ #pragma once +template +void GpuSlabHash::resize() { + dynamic_allocator_->growPool(); + gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); +} + +template +uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { + auto numSlabs = gpu_context_.getTotalNumSlabs(); + auto maxElemCapacity = numSlabs * 15; + auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; + auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity; + auto numResizes = 0; + + if(finalSlabLoadFactor > 0.85) { + numResizes = 1; + } + if(finalSlabLoadFactor > 1.95) { + numResizes = 2; + } + + return numResizes; +} + template void GpuSlabHash::buildBulk( KeyT* d_key, @@ -40,8 +64,7 @@ void GpuSlabHash::buildBulk( // resize the pool here if necessary if(h_retry) { - dynamic_allocator_->growPool(); - gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); + resize(); } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); } @@ -65,6 +88,13 @@ void GpuSlabHash::buildBulkWithUniqu const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); while(h_retry) { + + // predict whether resizing will be necessary, and take preemptive action. + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + // calling the kernel for bulk build: build_table_with_unique_keys_kernel <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); @@ -73,20 +103,16 @@ void GpuSlabHash::buildBulkWithUniqu // resize the pool here if necessary if(h_retry) { - dynamic_allocator_->growPool(); - gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); + resize(); } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); } CHECK_CUDA_ERROR(cudaFree(d_retry)); CHECK_CUDA_ERROR(cudaFree(d_success)); - /* - const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - // calling the kernel for bulk build: - CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); - build_table_with_unique_keys_kernel - <<>>(d_key, d_value, num_keys, gpu_context_); - */ + + // now that the bulk insert has completed successfully, we can + // update the total number of keys in the table + gpu_context_.updateTotalNumKeys(num_keys); } template diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index b41a969..03930b0 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -72,8 +72,8 @@ class gpu_hash_table { if (req_values_) { CHECK_CUDA_ERROR(cudaMalloc((void**)&d_value_, sizeof(ValueT) * max_keys_)); } - //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); - //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); + CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); + CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); // allocate an initialize the allocator: @@ -93,8 +93,8 @@ class gpu_hash_table { if (req_values_) { CHECK_CUDA_ERROR(cudaFree(d_value_)); } - //CHECK_CUDA_ERROR(cudaFree(d_query_)); - //CHECK_CUDA_ERROR(cudaFree(d_result_)); + CHECK_CUDA_ERROR(cudaFree(d_query_)); + CHECK_CUDA_ERROR(cudaFree(d_result_)); //CHECK_CUDA_ERROR(cudaFree(d_count_)); // delete the dynamic allocator: From 48b21dfa76d30bb18d4bf6a51308492e443e1253 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 17 Jun 2020 09:13:13 -0700 Subject: [PATCH 08/61] added preemptive resizing for non unique build kernel --- src/concurrent_map/cmap_implementation.cuh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 30e155d..fc31fe7 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -56,6 +56,12 @@ void GpuSlabHash::buildBulk( const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); while(h_retry) { + // predict whether resizing will be necessary, and take preemptive action. + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + // calling the kernel for bulk build: build_table_kernel <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); @@ -70,6 +76,10 @@ void GpuSlabHash::buildBulk( } CHECK_CUDA_ERROR(cudaFree(d_retry)); CHECK_CUDA_ERROR(cudaFree(d_success)); + + // now that the bulk insert has completed successfully, we can + // update the total number of keys in the table + gpu_context_.updateTotalNumKeys(num_keys); } template @@ -77,6 +87,7 @@ void GpuSlabHash::buildBulkWithUniqu KeyT* d_key, ValueT* d_value, uint32_t num_keys) { + int h_retry = 1; int *d_retry; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); @@ -88,7 +99,6 @@ void GpuSlabHash::buildBulkWithUniqu const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); while(h_retry) { - // predict whether resizing will be necessary, and take preemptive action. auto numResizes = checkForPreemptiveResize(num_keys); for(auto i = 0; i < numResizes; ++i) { From 1978da6630771dde9693d7fd7d0f379ed68083cd Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sun, 27 Sep 2020 00:14:02 -0400 Subject: [PATCH 09/61] decreased suberblock size --- src/slab_hash_global.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 38437a0..e9f21aa 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -127,7 +127,7 @@ class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 11; +constexpr uint32_t log_num_mem_blocks = 10; // 128 MB constexpr uint32_t num_super_blocks = 2; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From 01c972b5f6a5c7b4e8baa5871d4157ba2d1cbf49 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Mon, 28 Dec 2020 10:24:15 -0500 Subject: [PATCH 10/61] updated gitmodules --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index fcfa184..1c94e86 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "SlabAlloc"] path = SlabAlloc - url = https://github.com/owensgroup/SlabAlloc + url = https://github.com/Nicolas-Iskos/SlabAlloc [submodule "ThirdParty/rapidjson"] path = ThirdParty/rapidjson url = https://github.com/Tencent/rapidjson From c3c9bccf834b5636b6d96b3b483d567d8cf4bda7 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Mon, 28 Dec 2020 14:07:49 -0500 Subject: [PATCH 11/61] removed explicit checks for need to resize --- src/concurrent_map/cmap_class.cuh | 4 +-- src/concurrent_map/cmap_implementation.cuh | 37 +++++++++++++--------- src/concurrent_map/device/build.cuh | 30 ++++++++++-------- src/concurrent_map/warp/insert.cuh | 14 ++++---- 4 files changed, 48 insertions(+), 37 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 6b4a9cc..a66e98b 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -102,7 +102,7 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert key-value pairs // into the slab hash - __device__ __forceinline__ void insertPair(bool& mySuccess, + __device__ __forceinline__ void insertPair(/*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, @@ -113,7 +113,7 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash __device__ __forceinline__ bool insertPairUnique( - bool& mySuccess, + /*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index fc31fe7..af272f1 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -45,6 +45,8 @@ void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { + + /* int h_retry = 1; int *d_retry; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); @@ -52,10 +54,10 @@ void GpuSlabHash::buildBulk( bool* d_success; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool))); CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); - + */ const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); - while(h_retry) { + //while(h_retry) { // predict whether resizing will be necessary, and take preemptive action. auto numResizes = checkForPreemptiveResize(num_keys); for(auto i = 0; i < numResizes; ++i) { @@ -64,18 +66,20 @@ void GpuSlabHash::buildBulk( // calling the kernel for bulk build: build_table_kernel - <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); + <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); + //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); // resize the pool here if necessary + /* if(h_retry) { resize(); } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); - } - CHECK_CUDA_ERROR(cudaFree(d_retry)); - CHECK_CUDA_ERROR(cudaFree(d_success)); + */ + //} + //CHECK_CUDA_ERROR(cudaFree(d_retry)); + //CHECK_CUDA_ERROR(cudaFree(d_success)); // now that the bulk insert has completed successfully, we can // update the total number of keys in the table @@ -87,7 +91,8 @@ void GpuSlabHash::buildBulkWithUniqu KeyT* d_key, ValueT* d_value, uint32_t num_keys) { - + + /* int h_retry = 1; int *d_retry; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); @@ -95,10 +100,10 @@ void GpuSlabHash::buildBulkWithUniqu bool* d_success; CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool))); CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); - + */ const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); - while(h_retry) { + //while(h_retry) { // predict whether resizing will be necessary, and take preemptive action. auto numResizes = checkForPreemptiveResize(num_keys); for(auto i = 0; i < numResizes; ++i) { @@ -107,18 +112,20 @@ void GpuSlabHash::buildBulkWithUniqu // calling the kernel for bulk build: build_table_with_unique_keys_kernel - <<>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_); + <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); + //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); // resize the pool here if necessary + /* if(h_retry) { resize(); } CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); - } - CHECK_CUDA_ERROR(cudaFree(d_retry)); - CHECK_CUDA_ERROR(cudaFree(d_success)); + */ + //} + //CHECK_CUDA_ERROR(cudaFree(d_retry)); + //CHECK_CUDA_ERROR(cudaFree(d_success)); // now that the bulk insert has completed successfully, we can // update the total number of keys in the table diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index f24b5cb..de5c68e 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -20,8 +20,8 @@ */ template __global__ void build_table_kernel( - int* d_retry, - bool* d_success, + /*int* d_retry,*/ + /*bool* d_success,*/ KeyT* d_key, ValueT* d_value, uint32_t num_keys, @@ -36,32 +36,34 @@ __global__ void build_table_kernel( AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); - bool mySuccess = true; + //bool mySuccess = true; KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; bool to_insert = false; if (tid < num_keys) { - mySuccess = d_success[tid]; + //mySuccess = d_success[tid]; myKey = d_key[tid]; myValue = d_value[tid]; myBucket = slab_hash.computeBucket(myKey); - to_insert = !mySuccess; + to_insert = true; } - slab_hash.insertPair(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + slab_hash.insertPair(/*mySuccess,*/ to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + /* if (tid < num_keys) { d_success[tid] = mySuccess; atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry } + */ } template __global__ void build_table_with_unique_keys_kernel( - int* d_retry, - bool* d_success, + /*int* d_retry,*/ + /*bool* d_success,*/ KeyT* d_key, ValueT* d_value, uint32_t num_keys, @@ -76,25 +78,27 @@ __global__ void build_table_with_unique_keys_kernel( AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); - bool mySuccess = true; + //bool mySuccess = true; KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; bool to_insert = false; if (tid < num_keys) { - mySuccess = d_success[tid]; + //mySuccess = d_success[tid]; myKey = d_key[tid]; myValue = d_value[tid]; myBucket = slab_hash.computeBucket(myKey); - to_insert = !mySuccess; + to_insert = true; } - slab_hash.insertPairUnique(mySuccess, + slab_hash.insertPairUnique(/*mySuccess,*/ to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); - + + /* if (tid < num_keys) { d_success[tid] = mySuccess; atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry } + */ } \ No newline at end of file diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 4b94d81..c52ed11 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -24,7 +24,7 @@ template __device__ __forceinline__ void GpuSlabHashContext::insertPair( - bool& mySuccess, + /*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, @@ -56,7 +56,7 @@ GpuSlabHashContext::insertPair( // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased - mySuccess = false; // signal that this key needs to be reinserted + //mySuccess = false; // signal that this key needs to be reinserted to_be_inserted = false; continue; } @@ -94,7 +94,7 @@ GpuSlabHashContext::insertPair( *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { - mySuccess = true; + //mySuccess = true; to_be_inserted = false; // successful insertion } } @@ -113,7 +113,7 @@ GpuSlabHashContext::insertPair( template __device__ __forceinline__ bool GpuSlabHashContext::insertPairUnique( - bool& mySuccess, + /*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, @@ -145,7 +145,7 @@ GpuSlabHashContext::insertPairUnique SlabHashT::REGULAR_NODE_KEY_MASK; if (isExisting) { // key exist in the hash table if (laneId == src_lane) { - mySuccess = true; + //mySuccess = true; to_be_inserted = false; } } else { @@ -155,7 +155,7 @@ GpuSlabHashContext::insertPairUnique // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased - mySuccess = false; // signal that this key needs to be reinserted + //mySuccess = false; // signal that this key needs to be reinserted to_be_inserted = false; continue; } @@ -192,7 +192,7 @@ GpuSlabHashContext::insertPairUnique *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { - mySuccess = true; + //mySuccess = true; to_be_inserted = false; // successful insertion new_insertion = true; } From c1f8e7ba9b98de0e29138919b4ccad30c19d9d7b Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Mon, 28 Dec 2020 20:13:21 -0500 Subject: [PATCH 12/61] removed dead code --- src/concurrent_map/cmap_implementation.cuh | 86 +++++----------------- src/concurrent_map/device/build.cuh | 26 +------ 2 files changed, 22 insertions(+), 90 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index af272f1..b87ea77 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -46,40 +46,17 @@ void GpuSlabHash::buildBulk( ValueT* d_value, uint32_t num_keys) { - /* - int h_retry = 1; - int *d_retry; - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); - CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); - bool* d_success; - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool))); - CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); - */ const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); - //while(h_retry) { - // predict whether resizing will be necessary, and take preemptive action. - auto numResizes = checkForPreemptiveResize(num_keys); - for(auto i = 0; i < numResizes; ++i) { - resize(); - } + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } - // calling the kernel for bulk build: - build_table_kernel - <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); - - // resize the pool here if necessary - /* - if(h_retry) { - resize(); - } - CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); - */ - //} - //CHECK_CUDA_ERROR(cudaFree(d_retry)); - //CHECK_CUDA_ERROR(cudaFree(d_success)); + // calling the kernel for bulk build: + build_table_kernel + <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // now that the bulk insert has completed successfully, we can // update the total number of keys in the table @@ -88,45 +65,22 @@ void GpuSlabHash::buildBulk( template void GpuSlabHash::buildBulkWithUniqueKeys( - KeyT* d_key, - ValueT* d_value, - uint32_t num_keys) { + KeyT* d_key, + ValueT* d_value, + uint32_t num_keys) { - /* - int h_retry = 1; - int *d_retry; - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int))); - CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); - bool* d_success; - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool))); - CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool))); - */ const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); - //while(h_retry) { - // predict whether resizing will be necessary, and take preemptive action. - auto numResizes = checkForPreemptiveResize(num_keys); - for(auto i = 0; i < numResizes; ++i) { - resize(); - } - - // calling the kernel for bulk build: - build_table_with_unique_keys_kernel - <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); - CHECK_CUDA_ERROR(cudaDeviceSynchronize()); - //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost)); - - // resize the pool here if necessary - /* - if(h_retry) { - resize(); - } - CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int))); - */ - //} - //CHECK_CUDA_ERROR(cudaFree(d_retry)); - //CHECK_CUDA_ERROR(cudaFree(d_success)); + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + // calling the kernel for bulk build: + build_table_with_unique_keys_kernel + <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + // now that the bulk insert has completed successfully, we can // update the total number of keys in the table gpu_context_.updateTotalNumKeys(num_keys); diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index de5c68e..f48b5e5 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -20,8 +20,6 @@ */ template __global__ void build_table_kernel( - /*int* d_retry,*/ - /*bool* d_success,*/ KeyT* d_key, ValueT* d_value, uint32_t num_keys, @@ -36,34 +34,23 @@ __global__ void build_table_kernel( AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); - //bool mySuccess = true; KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; bool to_insert = false; if (tid < num_keys) { - //mySuccess = d_success[tid]; myKey = d_key[tid]; myValue = d_value[tid]; myBucket = slab_hash.computeBucket(myKey); to_insert = true; } - slab_hash.insertPair(/*mySuccess,*/ to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); - - /* - if (tid < num_keys) { - d_success[tid] = mySuccess; - atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry - } - */ + slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); } template __global__ void build_table_with_unique_keys_kernel( - /*int* d_retry,*/ - /*bool* d_success,*/ KeyT* d_key, ValueT* d_value, uint32_t num_keys, @@ -78,27 +65,18 @@ __global__ void build_table_with_unique_keys_kernel( AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); - //bool mySuccess = true; KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; bool to_insert = false; if (tid < num_keys) { - //mySuccess = d_success[tid]; myKey = d_key[tid]; myValue = d_value[tid]; myBucket = slab_hash.computeBucket(myKey); to_insert = true; } - slab_hash.insertPairUnique(/*mySuccess,*/ + slab_hash.insertPairUnique( to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); - - /* - if (tid < num_keys) { - d_success[tid] = mySuccess; - atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry - } - */ } \ No newline at end of file From a4f6ba0acd0c2db55232a53820e188c1727e474c Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Mon, 28 Dec 2020 23:48:55 -0500 Subject: [PATCH 13/61] resizing parameter tuning --- src/concurrent_map/cmap_implementation.cuh | 5 +---- src/slab_hash_global.cuh | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index b87ea77..7d39ae5 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -30,12 +30,9 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.85) { + if(finalSlabLoadFactor > 0.60) { numResizes = 1; } - if(finalSlabLoadFactor > 1.95) { - numResizes = 2; - } return numResizes; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index e9f21aa..c2b6b2f 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -127,8 +127,8 @@ class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 10; // 128 MB -constexpr uint32_t num_super_blocks = 2; +constexpr uint32_t log_num_mem_blocks = 10; // 128 MB +constexpr uint32_t num_super_blocks = 1; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From b29b9886f9256b0fe47df6f652ef4e56d823c5b7 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Tue, 29 Dec 2020 21:07:54 -0500 Subject: [PATCH 14/61] added cub --- .gitmodules | 3 +++ CMakeLists.txt | 3 ++- SlabAlloc | 2 +- cub | 1 + src/slab_hash_global.cuh | 1 + 5 files changed, 8 insertions(+), 2 deletions(-) create mode 160000 cub diff --git a/.gitmodules b/.gitmodules index 1c94e86..201a205 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "ThirdParty/googletest"] path = ThirdParty/googletest url = https://github.com/google/googletest +[submodule "cub"] + path = cub + url = https://github.com/NVIDIA/cub.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 2310713..72c7a6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ find_package(CUDA 8.0 REQUIRED) option(CMAKE_VERBOSE_MAKEFILE ON) option(DGTEST, "DGTEST" ON) -set(CUDA_NVCC_FLAGS -std=c++11) +set(CUDA_NVCC_FLAGS -std=c++14) set (CMAKE_CXX_STANDARD 11) if (CUDA_VERBOSE_PTXAS) @@ -83,6 +83,7 @@ if(SLABHASH_GENCODE_SM75) endif(SLABHASH_GENCODE_SM75) include_directories(SlabAlloc/src) +include_directories(cub) include_directories(src src/concurrent) include_directories(ThirdParty/rapidjson/include) include_directories(ThirdParty/googletest/googletest) diff --git a/SlabAlloc b/SlabAlloc index 6abbedb..3b98001 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 6abbedb680bca342863589afd152b12ced0c0fd1 +Subproject commit 3b980011e912c32de4ca1f4756258309aee4776c diff --git a/cub b/cub new file mode 160000 index 0000000..c3be9a9 --- /dev/null +++ b/cub @@ -0,0 +1 @@ +Subproject commit c3be9a94273b5049520aacc7db00c738668aaa3f diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index c2b6b2f..15ec263 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -17,6 +17,7 @@ #pragma once #include "slab_alloc.cuh" +#include "cub/cub.cuh" #define CHECK_CUDA_ERROR(call) \ do { \ From e3c64636c9184cc406e03e37a50112e69c894103 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 31 Dec 2020 14:39:41 -0500 Subject: [PATCH 15/61] added block reduce key counting for buildBulkWithUniqueKeys --- src/concurrent_map/cmap_class.cuh | 2 +- src/concurrent_map/cmap_implementation.cuh | 9 ++++++--- src/concurrent_map/device/build.cuh | 13 ++++++++++++- src/concurrent_map/warp/insert.cuh | 9 +++++---- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index a66e98b..c1e8020 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -113,7 +113,7 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash __device__ __forceinline__ bool insertPairUnique( - /*bool& mySuccess,*/ + int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 7d39ae5..420cc9c 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -51,8 +51,9 @@ void GpuSlabHash::buildBulk( } // calling the kernel for bulk build: + //int num_successes = 0; build_table_kernel - <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); + <<>>(d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // now that the bulk insert has completed successfully, we can @@ -74,13 +75,15 @@ void GpuSlabHash::buildBulkWithUniqu } // calling the kernel for bulk build: + int num_successes = 0; build_table_with_unique_keys_kernel - <<>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_); + <<>>(&num_successes, d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // now that the bulk insert has completed successfully, we can // update the total number of keys in the table - gpu_context_.updateTotalNumKeys(num_keys); + gpu_context_.updateTotalNumKeys(num_successes); + std::cout << "num_successes: " << num_successes << std::endl; } template diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index f48b5e5..b391fb6 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -51,10 +51,15 @@ __global__ void build_table_kernel( template __global__ void build_table_with_unique_keys_kernel( + int *num_successes, KeyT* d_key, ValueT* d_value, uint32_t num_keys, GpuSlabHashContext slab_hash) { + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -68,6 +73,7 @@ __global__ void build_table_with_unique_keys_kernel( KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; + int mySuccess = 0; bool to_insert = false; if (tid < num_keys) { @@ -77,6 +83,11 @@ __global__ void build_table_with_unique_keys_kernel( to_insert = true; } - slab_hash.insertPairUnique( + slab_hash.insertPairUnique(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(mySuccess); + if(threadIdx.x == 0) { + atomicAdd(num_successes, block_num_successes); + } } \ No newline at end of file diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index c52ed11..ad8f2c8 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -113,13 +113,14 @@ GpuSlabHashContext::insertPair( template __device__ __forceinline__ bool GpuSlabHashContext::insertPairUnique( - /*bool& mySuccess,*/ + int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, AllocatorContextT& local_allocator_ctx) { + using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -145,7 +146,7 @@ GpuSlabHashContext::insertPairUnique SlabHashT::REGULAR_NODE_KEY_MASK; if (isExisting) { // key exist in the hash table if (laneId == src_lane) { - //mySuccess = true; + mySuccess = 0; to_be_inserted = false; } } else { @@ -155,7 +156,7 @@ GpuSlabHashContext::insertPairUnique // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased - //mySuccess = false; // signal that this key needs to be reinserted + mySuccess = 0; // signal that this key needs to be reinserted to_be_inserted = false; continue; } @@ -192,7 +193,7 @@ GpuSlabHashContext::insertPairUnique *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { - //mySuccess = true; + mySuccess = 1; to_be_inserted = false; // successful insertion new_insertion = true; } From 36ee554d3c38563fbc52551417ba15d8311bef2f Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 31 Dec 2020 15:03:29 -0500 Subject: [PATCH 16/61] bug fixes in insert unique call --- src/concurrent_map/cmap_implementation.cuh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 420cc9c..748958a 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -75,15 +75,18 @@ void GpuSlabHash::buildBulkWithUniqu } // calling the kernel for bulk build: - int num_successes = 0; + int *num_successes; + CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int))); + *num_successes = 0; + build_table_with_unique_keys_kernel - <<>>(&num_successes, d_key, d_value, num_keys, gpu_context_); + <<>>(num_successes, d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // now that the bulk insert has completed successfully, we can // update the total number of keys in the table - gpu_context_.updateTotalNumKeys(num_successes); - std::cout << "num_successes: " << num_successes << std::endl; + gpu_context_.updateTotalNumKeys(*num_successes); + std::cout << "num_successes: " << *num_successes << std::endl; } template From 610af7bb7fa64db5e33229aeac4d1e29a0494408 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 31 Dec 2020 15:19:57 -0500 Subject: [PATCH 17/61] removed print --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 748958a..ad6837d 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -86,7 +86,7 @@ void GpuSlabHash::buildBulkWithUniqu // now that the bulk insert has completed successfully, we can // update the total number of keys in the table gpu_context_.updateTotalNumKeys(*num_successes); - std::cout << "num_successes: " << *num_successes << std::endl; + //std::cout << "num_successes: " << *num_successes << std::endl; } template From 8b006f9a1f4d2047c08aac16ea03d8e6a39c4be7 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 01:11:38 -0500 Subject: [PATCH 18/61] 37.5 split --- SlabAlloc | 2 +- src/slab_hash_global.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index 3b98001..f85aec8 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 3b980011e912c32de4ca1f4756258309aee4776c +Subproject commit f85aec8640315f01d0a8d4f3333f5b851e5f4b1f diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 15ec263..de0de30 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 10; // 128 MB -constexpr uint32_t num_super_blocks = 1; +constexpr uint32_t num_super_blocks = 5; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From c1fb64258b3c2f2caa14707bd63f5efda6d4538e Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 14:48:51 -0500 Subject: [PATCH 19/61] resize at 70% lf --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index ad6837d..43d9c88 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -30,7 +30,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.60) { + if(finalSlabLoadFactor > 0.70) { numResizes = 1; } From d2d2926492ff727c477b27be84433cc0668aec14 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 15:10:18 -0500 Subject: [PATCH 20/61] prints --- src/concurrent_map/cmap_implementation.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 43d9c88..a5fc273 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -27,6 +27,7 @@ uint32_t GpuSlabHash::checkForPreemp auto numSlabs = gpu_context_.getTotalNumSlabs(); auto maxElemCapacity = numSlabs * 15; auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; + std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity; auto numResizes = 0; From de70e25420ede59f8c10f0febd281631513ebd11 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 15:49:54 -0500 Subject: [PATCH 21/61] changes to SlabAlloc --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index f85aec8..99ea8d8 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit f85aec8640315f01d0a8d4f3333f5b851e5f4b1f +Subproject commit 99ea8d883d6adcef043c5029ffc27dbd3319fe2a From b090ecea74be21c0920885f5558259eeee9d47f9 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 16:14:06 -0500 Subject: [PATCH 22/61] prints --- src/concurrent_map/cmap_implementation.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index a5fc273..8a16ea3 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -25,6 +25,7 @@ void GpuSlabHash::resize() { template uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); + std::cout << "numSlabs " << numSlabs << std::endl; auto maxElemCapacity = numSlabs * 15; auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; std::cout << "finalNumKeys " << finalNumKeys << std::endl; From 5769e2030f05945e1f1b9beebc9ec7363122e995 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 16:54:20 -0500 Subject: [PATCH 23/61] capacity calculation correction --- src/concurrent_map/cmap_implementation.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 8a16ea3..acb7df2 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -26,10 +26,11 @@ template uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); std::cout << "numSlabs " << numSlabs << std::endl; - auto maxElemCapacity = numSlabs * 15; + + auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; std::cout << "finalNumKeys " << finalNumKeys << std::endl; - auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity; + auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; if(finalSlabLoadFactor > 0.70) { From 90909ee7f6a2fcce142170a440318d3339377837 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 1 Jan 2021 17:02:17 -0500 Subject: [PATCH 24/61] removed prints --- src/concurrent_map/cmap_implementation.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index acb7df2..0f6c660 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -25,11 +25,11 @@ void GpuSlabHash::resize() { template uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); - std::cout << "numSlabs " << numSlabs << std::endl; + //std::cout << "numSlabs " << numSlabs << std::endl; auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - std::cout << "finalNumKeys " << finalNumKeys << std::endl; + //Ã¥std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; From 855db99e58f6d6ee645de57e4cf1fb29f94c3fe6 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Tue, 5 Jan 2021 12:57:54 -0500 Subject: [PATCH 25/61] 87.5% splut --- src/concurrent_map/cmap_implementation.cuh | 2 +- src/slab_hash_global.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 0f6c660..ef86a43 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -33,7 +33,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.70) { + if(finalSlabLoadFactor > 0.60) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index de0de30..15ec263 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 10; // 128 MB -constexpr uint32_t num_super_blocks = 5; +constexpr uint32_t num_super_blocks = 1; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From b82f0aaf13e9cde49c556028c053b1fabade103f Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Wed, 6 Jan 2021 16:33:27 -0500 Subject: [PATCH 26/61] 37.5% split --- SlabAlloc | 2 +- src/concurrent_map/cmap_implementation.cuh | 4 ++-- src/slab_hash_global.cuh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index 99ea8d8..8677195 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 99ea8d883d6adcef043c5029ffc27dbd3319fe2a +Subproject commit 8677195a86fb73193434385d42c42471e1a4006b diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index ef86a43..ace2ca4 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -29,11 +29,11 @@ uint32_t GpuSlabHash::checkForPreemp auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - //Ã¥std::cout << "finalNumKeys " << finalNumKeys << std::endl; + //std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.60) { + if(finalSlabLoadFactor > 0.80) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 15ec263..de0de30 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 10; // 128 MB -constexpr uint32_t num_super_blocks = 1; +constexpr uint32_t num_super_blocks = 5; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From df66adc0a86a578674ce10f977596e97665a29eb Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Wed, 6 Jan 2021 17:34:43 -0500 Subject: [PATCH 27/61] prints --- src/concurrent_map/cmap_implementation.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index ace2ca4..9fa56f7 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -25,11 +25,11 @@ void GpuSlabHash::resize() { template uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); - //std::cout << "numSlabs " << numSlabs << std::endl; + std::cout << "numSlabs " << numSlabs << std::endl; auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - //std::cout << "finalNumKeys " << finalNumKeys << std::endl; + std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; From 12d4d9cfb0d823f01c8bb20af8ef4b40e1d86b20 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Wed, 6 Jan 2021 18:17:17 -0500 Subject: [PATCH 28/61] bug fix in insertUnique device function --- src/concurrent_map/warp/insert.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index ad8f2c8..b6d85be 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -193,7 +193,7 @@ GpuSlabHashContext::insertPairUnique *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { - mySuccess = 1; + mySuccess += 1; to_be_inserted = false; // successful insertion new_insertion = true; } From 1f0853564c894bed41a1ae7ca9c993f00607c64b Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Wed, 6 Jan 2021 19:07:54 -0500 Subject: [PATCH 29/61] bug fix in build --- src/concurrent_map/device/build.cuh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index b391fb6..9b05f63 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -63,9 +63,9 @@ __global__ void build_table_with_unique_keys_kernel( uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; - if ((tid - laneId) >= num_keys) { - return; - } + //if ((tid - laneId) >= num_keys) { + // return; + //} AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); @@ -83,8 +83,10 @@ __global__ void build_table_with_unique_keys_kernel( to_insert = true; } - slab_hash.insertPairUnique(mySuccess, - to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + if ((tid - laneId) < num_keys) { + slab_hash.insertPairUnique(mySuccess, + to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + } std::size_t block_num_successes = BlockReduce(temp_storage).Sum(mySuccess); if(threadIdx.x == 0) { From 7c511750f4fc3f664417daedd20a870a66133f82 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Wed, 6 Jan 2021 19:31:31 -0500 Subject: [PATCH 30/61] bug fix in insert unique function --- src/concurrent_map/warp/insert.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index b6d85be..124b863 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -146,7 +146,6 @@ GpuSlabHashContext::insertPairUnique SlabHashT::REGULAR_NODE_KEY_MASK; if (isExisting) { // key exist in the hash table if (laneId == src_lane) { - mySuccess = 0; to_be_inserted = false; } } else { @@ -156,7 +155,6 @@ GpuSlabHashContext::insertPairUnique // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased - mySuccess = 0; // signal that this key needs to be reinserted to_be_inserted = false; continue; } From d1a116cfa2661c32c23c2ce7ed1190dedeacf456 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Wed, 6 Jan 2021 19:54:11 -0500 Subject: [PATCH 31/61] changes to SlabAlloc --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index 8677195..ffdc551 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 8677195a86fb73193434385d42c42471e1a4006b +Subproject commit ffdc551ad6d5c6224b909731264b790415790a2b From 854b1f94df4451a5d894363eb22a3ac4c848fe86 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 01:11:56 -0500 Subject: [PATCH 32/61] resize at 70% --- src/concurrent_map/cmap_implementation.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 9fa56f7..780f78c 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -25,15 +25,15 @@ void GpuSlabHash::resize() { template uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); - std::cout << "numSlabs " << numSlabs << std::endl; + //std::cout << "numSlabs " << numSlabs << std::endl; auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - std::cout << "finalNumKeys " << finalNumKeys << std::endl; + //std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.80) { + if(finalSlabLoadFactor > 0.70) { numResizes = 1; } From 34afe70d437f3ebbfc1f8ec4b5e7df2d202f8bfd Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 01:25:49 -0500 Subject: [PATCH 33/61] 87.5 split --- SlabAlloc | 2 +- src/concurrent_map/cmap_implementation.cuh | 2 +- src/slab_hash_global.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index ffdc551..bcfbc76 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit ffdc551ad6d5c6224b909731264b790415790a2b +Subproject commit bcfbc7621d07acf1a637c3b32aad435439fa0f69 diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 780f78c..f8e2e3d 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -33,7 +33,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.70) { + if(finalSlabLoadFactor > 0.60) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index de0de30..15ec263 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 10; // 128 MB -constexpr uint32_t num_super_blocks = 5; +constexpr uint32_t num_super_blocks = 1; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From 57c013367bb696e9173313acb0cc3d32150fbd74 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 01:32:38 -0500 Subject: [PATCH 34/61] resize at 50% --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index f8e2e3d..a0d1ae9 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -33,7 +33,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.60) { + if(finalSlabLoadFactor > 0.50) { numResizes = 1; } From 79eccec25a242a3be7b8840ba859241a6e422ce2 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 01:35:42 -0500 Subject: [PATCH 35/61] prints --- src/concurrent_map/cmap_implementation.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index a0d1ae9..34e827d 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -25,11 +25,10 @@ void GpuSlabHash::resize() { template uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); - //std::cout << "numSlabs " << numSlabs << std::endl; auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - //std::cout << "finalNumKeys " << finalNumKeys << std::endl; + std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; From 8314c45c3c076d8e218036420729cac4d5c308e3 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 01:52:05 -0500 Subject: [PATCH 36/61] resize at 60% occupancy --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 34e827d..36ebb4a 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.50) { + if(finalSlabLoadFactor > 0.60) { numResizes = 1; } From 3a73666fa004d1828ad103723ee3d2f9f738b9d3 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 02:04:01 -0500 Subject: [PATCH 37/61] resize at 65% lf --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 36ebb4a..4442998 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.60) { + if(finalSlabLoadFactor > 0.65) { numResizes = 1; } From 31d7a0fa143ea695ecb56879273c1bfb30a53c1f Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 23:15:12 -0500 Subject: [PATCH 38/61] prints --- src/concurrent_map/cmap_class.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index c1e8020..2860f48 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -256,12 +256,14 @@ class GpuSlabHash { // creating a random number generator: if (!identity_hash) { + std::cout << "chicken" << std::endl; std::mt19937 rng(seed ? seed : time(0)); hf_.x = rng() % PRIME_DIVISOR_; if (hf_.x < 1) hf_.x = 1; hf_.y = rng() % PRIME_DIVISOR_; } else { + std::cout << "cow" << std::endl; hf_ = {0u, 0u}; } From 8f7b436bc1c8304718d30346f1bdbd827fcc567f Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Thu, 7 Jan 2021 23:21:57 -0500 Subject: [PATCH 39/61] prints --- src/concurrent_map/cmap_implementation.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 4442998..7e27814 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -81,6 +81,8 @@ void GpuSlabHash::buildBulkWithUniqu CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int))); *num_successes = 0; + std::cout << "hashes" << gpu_context_.getHashX() << ", " << gpu_context_.getHashY() << std::endl; + build_table_with_unique_keys_kernel <<>>(num_successes, d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); From 42d25689b851a93194a1144f3eb9c1eabc1da5b7 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 00:17:11 -0500 Subject: [PATCH 40/61] prints --- src/concurrent_map/cmap_class.cuh | 2 -- src/concurrent_map/cmap_implementation.cuh | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 2860f48..c1e8020 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -256,14 +256,12 @@ class GpuSlabHash { // creating a random number generator: if (!identity_hash) { - std::cout << "chicken" << std::endl; std::mt19937 rng(seed ? seed : time(0)); hf_.x = rng() % PRIME_DIVISOR_; if (hf_.x < 1) hf_.x = 1; hf_.y = rng() % PRIME_DIVISOR_; } else { - std::cout << "cow" << std::endl; hf_ = {0u, 0u}; } diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 7e27814..2393a05 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -28,11 +28,11 @@ uint32_t GpuSlabHash::checkForPreemp auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - std::cout << "finalNumKeys " << finalNumKeys << std::endl; + //std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.65) { + if(finalSlabLoadFactor > 0.60) { numResizes = 1; } @@ -81,8 +81,6 @@ void GpuSlabHash::buildBulkWithUniqu CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int))); *num_successes = 0; - std::cout << "hashes" << gpu_context_.getHashX() << ", " << gpu_context_.getHashY() << std::endl; - build_table_with_unique_keys_kernel <<>>(num_successes, d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); From 191188975ade8fa95a03a9c31358197730dc05f4 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 14:43:55 -0500 Subject: [PATCH 41/61] prints --- src/concurrent_map/cmap_implementation.cuh | 4 ++-- src/slab_hash_global.cuh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 2393a05..eed8bf2 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -28,11 +28,11 @@ uint32_t GpuSlabHash::checkForPreemp auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - //std::cout << "finalNumKeys " << finalNumKeys << std::endl; + std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 0.60) { + if(finalSlabLoadFactor > 1) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 15ec263..b5fa3b6 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -128,8 +128,8 @@ class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 10; // 128 MB -constexpr uint32_t num_super_blocks = 1; +constexpr uint32_t log_num_mem_blocks = 9; // 64 MB +constexpr uint32_t num_super_blocks = 15; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From c2db909b8873569f69501138d40a9b8b26a9b0f4 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 14:51:43 -0500 Subject: [PATCH 42/61] resize at 90% --- src/concurrent_map/cmap_implementation.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index eed8bf2..3f71747 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -28,11 +28,11 @@ uint32_t GpuSlabHash::checkForPreemp auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - std::cout << "finalNumKeys " << finalNumKeys << std::endl; + //std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor > 1) { + if(finalSlabLoadFactor >= 0.90) { numResizes = 1; } From c1df972af5901b577dc71c3af974cb60104b0200 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 14:54:56 -0500 Subject: [PATCH 43/61] changes to SlabAlloc --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index bcfbc76..794194f 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit bcfbc7621d07acf1a637c3b32aad435439fa0f69 +Subproject commit 794194faea5a184c574c6811426acf476d3c08fa From 32558d2f233cc6704e51b42c2161d88bb1878cfe Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 15:00:32 -0500 Subject: [PATCH 44/61] changes to SlabAlloc --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index 794194f..b4711d1 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 794194faea5a184c574c6811426acf476d3c08fa +Subproject commit b4711d1cf11d805d7eb47530f6a5a93ddd2c2591 From 936ae4586ceb5bee1e7831dad99a5d3a463b4fd1 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 15:03:13 -0500 Subject: [PATCH 45/61] SlabAlloc --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index b4711d1..ee8f226 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit b4711d1cf11d805d7eb47530f6a5a93ddd2c2591 +Subproject commit ee8f226a235ca1c86812876eecbfbae5eeab2b53 From 7053d9ca1e43051f4af6bc5e2cd762c70ce02dc8 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 8 Jan 2021 15:14:30 -0500 Subject: [PATCH 46/61] fixes to SlabAlloc --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index ee8f226..1e9093d 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit ee8f226a235ca1c86812876eecbfbae5eeab2b53 +Subproject commit 1e9093d82e85fce18994c9bb8c4735bad19d0c5b From b6ae38e286e314d149ef339087aedc05c38a53c4 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sat, 9 Jan 2021 00:12:35 -0500 Subject: [PATCH 47/61] reisze at 75 --- SlabAlloc | 2 +- src/concurrent_map/cmap_implementation.cuh | 2 +- src/slab_hash_global.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index 1e9093d..3851e82 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 1e9093d82e85fce18994c9bb8c4735bad19d0c5b +Subproject commit 3851e82d11f68e57fdf10cf9152538513857aac4 diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 3f71747..ca599b5 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor >= 0.90) { + if(finalSlabLoadFactor >= 0.75) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index b5fa3b6..99dd521 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 9; // 64 MB -constexpr uint32_t num_super_blocks = 15; +constexpr uint32_t num_super_blocks = 10; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From ba0b0c3b8fd96f493cc55a961e8973432bd72b9f Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sat, 9 Jan 2021 00:24:52 -0500 Subject: [PATCH 48/61] prints --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index ca599b5..e0be948 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -28,7 +28,7 @@ uint32_t GpuSlabHash::checkForPreemp auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - //std::cout << "finalNumKeys " << finalNumKeys << std::endl; + std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; From 946aa03bbb241c0da730ea6770b981a0edf13a69 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sat, 9 Jan 2021 00:29:54 -0500 Subject: [PATCH 49/61] changes to SlabAlloc --- SlabAlloc | 2 +- src/concurrent_map/cmap_implementation.cuh | 2 +- src/slab_hash_global.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index 3851e82..c165af5 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 3851e82d11f68e57fdf10cf9152538513857aac4 +Subproject commit c165af51d2cea7bfed9659eb2f6fc0f0665cb8dc diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index e0be948..ca599b5 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -28,7 +28,7 @@ uint32_t GpuSlabHash::checkForPreemp auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - std::cout << "finalNumKeys " << finalNumKeys << std::endl; + //std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 99dd521..9586188 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 9; // 64 MB -constexpr uint32_t num_super_blocks = 10; +constexpr uint32_t num_super_blocks = 11; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From a217079bf41fd8e5c8b48deaebcc2d5c31afd8a1 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sat, 9 Jan 2021 01:23:13 -0500 Subject: [PATCH 50/61] resize 60 --- SlabAlloc | 2 +- src/concurrent_map/cmap_implementation.cuh | 2 +- src/slab_hash_global.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SlabAlloc b/SlabAlloc index c165af5..bf15666 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit c165af51d2cea7bfed9659eb2f6fc0f0665cb8dc +Subproject commit bf15666694570184b7289e2d3939315349c92bcc diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index ca599b5..5867c7e 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor >= 0.75) { + if(finalSlabLoadFactor >= 0.60) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 9586188..30d174e 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 9; // 64 MB -constexpr uint32_t num_super_blocks = 11; +constexpr uint32_t num_super_blocks = 2; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From f5f6f00ff4167394d351966c8f050fa849ed7942 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 15 Jan 2021 23:02:51 -0500 Subject: [PATCH 51/61] resize at 55% --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 5867c7e..7f0951d 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor >= 0.60) { + if(finalSlabLoadFactor >= 0.55) { numResizes = 1; } From d9f39f92c29d7e0539e90f36b6d43eec8b01fbd9 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 2 Apr 2021 16:34:28 -0400 Subject: [PATCH 52/61] resize 60 --- src/concurrent_map/cmap_implementation.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 7f0951d..5867c7e 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor >= 0.55) { + if(finalSlabLoadFactor >= 0.60) { numResizes = 1; } From 6a57ca86e3503f8d8f44607fdfedb63a79aa89e6 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 2 Apr 2021 16:44:00 -0400 Subject: [PATCH 53/61] resize 90 --- src/concurrent_map/cmap_implementation.cuh | 2 +- src/slab_hash_global.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 5867c7e..3f71747 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash::checkForPreemp auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; - if(finalSlabLoadFactor >= 0.60) { + if(finalSlabLoadFactor >= 0.90) { numResizes = 1; } diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 30d174e..b5fa3b6 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -129,7 +129,7 @@ class GpuSlabHashContext; // this might need to be a template paramater itself namespace slab_alloc_par { constexpr uint32_t log_num_mem_blocks = 9; // 64 MB -constexpr uint32_t num_super_blocks = 2; +constexpr uint32_t num_super_blocks = 15; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par From 38b5a03d26b7de31c2acdeeb478d5619025887cc Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Fri, 2 Apr 2021 17:27:00 -0400 Subject: [PATCH 54/61] SlabAlloc changes --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index bf15666..95cb758 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit bf15666694570184b7289e2d3939315349c92bcc +Subproject commit 95cb7581b33ba13b29db4429a1f2213241bb1d47 From b24b99850cc645cc1663549e420d8cfd3ffe2d86 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sat, 17 Apr 2021 21:56:41 -0400 Subject: [PATCH 55/61] number of mem blocks and super blocks configurable from user code --- src/concurrent_map/cmap_class.cuh | 33 +++++++------- src/concurrent_map/cmap_implementation.cuh | 44 +++++++++---------- src/concurrent_map/device/build.cuh | 12 ++--- .../device/concurrent_kernel.cuh | 6 +-- src/concurrent_map/device/count_kernel.cuh | 4 +- src/concurrent_map/device/delete_kernel.cuh | 4 +- src/concurrent_map/device/misc_kernels.cuh | 4 +- src/concurrent_map/device/search_kernel.cuh | 8 ++-- src/concurrent_map/warp/count.cuh | 4 +- src/concurrent_map/warp/delete.cuh | 4 +- src/concurrent_map/warp/insert.cuh | 12 ++--- src/concurrent_map/warp/search.cuh | 8 ++-- src/concurrent_set/cset_class.cuh | 30 ++++++------- src/concurrent_set/cset_helper_kernels.cuh | 10 ++--- src/concurrent_set/cset_implementation.cuh | 12 ++--- src/concurrent_set/cset_warp_operations.cuh | 14 +++--- src/gpu_hash_table.cuh | 15 ++++--- src/slab_hash_global.cuh | 4 +- src/slab_iterator.cuh | 6 +-- test/iterator_test.cu | 6 +-- 20 files changed, 123 insertions(+), 117 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index c1e8020..d89c5ef 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -22,8 +22,8 @@ * used at runtime. This class does not own the allocated memory on the gpu * (i.e., d_table_) */ -template -class GpuSlabHashContext { +template +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -36,7 +36,7 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( - GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); total_num_slabs_ = rhs.getTotalNumSlabs(); total_num_keys_ = rhs.getTotalNumKeys(); @@ -61,7 +61,7 @@ class GpuSlabHashContext { const uint32_t hash_x, const uint32_t hash_y, int8_t* d_table, - AllocatorContextT* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; @@ -72,7 +72,7 @@ class GpuSlabHashContext { total_num_keys_ = 0; } - __host__ void updateAllocatorContext(AllocatorContextT* allocator_ctx) { + __host__ void updateAllocatorContext(SlabAllocLightContext* allocator_ctx) { global_allocator_ctx_ = *allocator_ctx; total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool(); } @@ -81,7 +81,7 @@ class GpuSlabHashContext { total_num_keys_ += keysAdded; } - __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -108,7 +108,7 @@ class GpuSlabHashContext { const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash @@ -119,7 +119,7 @@ class GpuSlabHashContext { const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to search for keys // if found, it returns the corresponding value, else SEARCH_NOT_FOUND @@ -172,7 +172,7 @@ class GpuSlabHashContext { } __device__ __forceinline__ SlabAllocAddressT - allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -189,14 +189,14 @@ class GpuSlabHashContext { uint32_t hash_y_; typename ConcurrentMapT::SlabTypeT* d_table_; // a copy of dynamic allocator's context to be used on the GPU - AllocatorContextT global_allocator_ctx_; + SlabAllocLightContext global_allocator_ctx_; }; /* * This class owns the allocated memory for the hash table */ -template -class GpuSlabHash { +template +class GpuSlabHash { private: // fixed known parameters: static constexpr uint32_t BLOCKSIZE_ = 128; @@ -218,16 +218,17 @@ class GpuSlabHash { // slab hash context, contains everything that a GPU application needs to be // able to use this data structure - GpuSlabHashContext gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. The allocator itself is not owned by this class - DynamicAllocatorT* dynamic_allocator_; + //DynamicAllocatorT* dynamic_allocator_; + SlabAllocLight *dynamic_allocator_; uint32_t device_idx_; public: GpuSlabHash(const uint32_t num_buckets, - DynamicAllocatorT* dynamic_allocator, + SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, const bool identity_hash = false) @@ -247,7 +248,7 @@ class GpuSlabHash { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); slab_unit_size_ = - GpuSlabHashContext::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 3f71747..a76ec7d 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -16,14 +16,14 @@ #pragma once -template -void GpuSlabHash::resize() { +template +void GpuSlabHash::resize() { dynamic_allocator_->growPool(); gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); } -template -uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { +template +uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); auto capacity = numSlabs * 16; // capacity in key-value size multiples @@ -39,8 +39,8 @@ uint32_t GpuSlabHash::checkForPreemp return numResizes; } -template -void GpuSlabHash::buildBulk( +template +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -63,8 +63,8 @@ void GpuSlabHash::buildBulk( gpu_context_.updateTotalNumKeys(num_keys); } -template -void GpuSlabHash::buildBulkWithUniqueKeys( +template +void GpuSlabHash::buildBulkWithUniqueKeys( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -91,8 +91,8 @@ void GpuSlabHash::buildBulkWithUniqu //std::cout << "num_successes: " << *num_successes << std::endl; } -template -void GpuSlabHash::searchIndividual( +template +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -102,8 +102,8 @@ void GpuSlabHash::searchIndividual( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -void GpuSlabHash::searchBulk( +template +void GpuSlabHash::searchBulk( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -113,8 +113,8 @@ void GpuSlabHash::searchBulk( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -void GpuSlabHash::countIndividual( +template +void GpuSlabHash::countIndividual( KeyT* d_query, uint32_t* d_count, uint32_t num_queries) { @@ -124,8 +124,8 @@ void GpuSlabHash::countIndividual( <<>>(d_query, d_count, num_queries, gpu_context_); } -template -void GpuSlabHash::deleteIndividual( +template +void GpuSlabHash::deleteIndividual( KeyT* d_key, uint32_t num_keys) { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); @@ -135,8 +135,8 @@ void GpuSlabHash::deleteIndividual( } // perform a batch of (a mixture of) updates/searches -template -void GpuSlabHash::batchedOperation( +template +void GpuSlabHash::batchedOperation( KeyT* d_key, ValueT* d_result, uint32_t num_ops) { @@ -146,8 +146,8 @@ void GpuSlabHash::batchedOperation( <<>>(d_key, d_result, num_ops, gpu_context_); } -template -std::string GpuSlabHash::to_string() { +template +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; @@ -161,8 +161,8 @@ std::string GpuSlabHash::to_string() return result; } -template -double GpuSlabHash::computeLoadFactor( +template +double GpuSlabHash::computeLoadFactor( int flag = 0) { uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_]; uint32_t* d_bucket_pairs_count; diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index 9b05f63..3f00207 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -18,12 +18,12 @@ /* * */ -template +template __global__ void build_table_kernel( KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -31,7 +31,7 @@ __global__ void build_table_kernel( return; } - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -49,13 +49,13 @@ __global__ void build_table_kernel( slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); } -template +template __global__ void build_table_with_unique_keys_kernel( int *num_successes, KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -67,7 +67,7 @@ __global__ void build_table_with_unique_keys_kernel( // return; //} - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; diff --git a/src/concurrent_map/device/concurrent_kernel.cuh b/src/concurrent_map/device/concurrent_kernel.cuh index d72c546..8cd7626 100644 --- a/src/concurrent_map/device/concurrent_kernel.cuh +++ b/src/concurrent_map/device/concurrent_kernel.cuh @@ -16,12 +16,12 @@ #pragma once -template +template __global__ void batched_operations( uint32_t* d_operations, uint32_t* d_results, uint32_t num_operations, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -29,7 +29,7 @@ __global__ void batched_operations( return; // initializing the memory allocator on each warp: - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); uint32_t myOperation = 0; diff --git a/src/concurrent_map/device/count_kernel.cuh b/src/concurrent_map/device/count_kernel.cuh index 99fffba..d71da3d 100644 --- a/src/concurrent_map/device/count_kernel.cuh +++ b/src/concurrent_map/device/count_kernel.cuh @@ -16,12 +16,12 @@ #pragma once -template +template __global__ void count_key( KeyT* d_queries, uint32_t* d_counts, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/device/delete_kernel.cuh b/src/concurrent_map/device/delete_kernel.cuh index 9542623..edc47cc 100644 --- a/src/concurrent_map/device/delete_kernel.cuh +++ b/src/concurrent_map/device/delete_kernel.cuh @@ -16,11 +16,11 @@ #pragma once -template +template __global__ void delete_table_keys( KeyT* d_key_deleted, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/device/misc_kernels.cuh b/src/concurrent_map/device/misc_kernels.cuh index 5f9e5b8..0dfef23 100644 --- a/src/concurrent_map/device/misc_kernels.cuh +++ b/src/concurrent_map/device/misc_kernels.cuh @@ -21,9 +21,9 @@ * slabs per bucket. The final results per bucket is stored in d_pairs_count_result and * d_slabs_count_result arrays respectively */ -template +template __global__ void bucket_count_kernel( - GpuSlabHashContext slab_hash, + GpuSlabHashContext slab_hash, uint32_t* d_pairs_count_result, uint32_t* d_slabs_count_result, uint32_t num_buckets) { diff --git a/src/concurrent_map/device/search_kernel.cuh b/src/concurrent_map/device/search_kernel.cuh index c5f38e6..e01ec56 100644 --- a/src/concurrent_map/device/search_kernel.cuh +++ b/src/concurrent_map/device/search_kernel.cuh @@ -17,12 +17,12 @@ #pragma once //=== Individual search kernel: -template +template __global__ void search_table( KeyT* d_queries, ValueT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -49,12 +49,12 @@ __global__ void search_table( } //=== Bulk search kernel: -template +template __global__ void search_table_bulk( KeyT* d_queries, ValueT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/warp/count.cuh b/src/concurrent_map/warp/count.cuh index 4e9a70f..e3d0ede 100644 --- a/src/concurrent_map/warp/count.cuh +++ b/src/concurrent_map/warp/count.cuh @@ -19,9 +19,9 @@ //================================================ // Individual Count Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::countKey( +GpuSlabHashContext::countKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/warp/delete.cuh b/src/concurrent_map/warp/delete.cuh index d8d6685..dc65ced 100644 --- a/src/concurrent_map/warp/delete.cuh +++ b/src/concurrent_map/warp/delete.cuh @@ -16,9 +16,9 @@ #pragma once -template +template __device__ __forceinline__ bool -GpuSlabHashContext::deleteKey( +GpuSlabHashContext::deleteKey( bool& to_be_deleted, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 124b863..2a88bd7 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -21,16 +21,16 @@ * it is assumed all threads within a warp are present and collaborating with * each other with a warp-cooperative work sharing (WCWS) strategy. */ -template +template __device__ __forceinline__ void -GpuSlabHashContext::insertPair( +GpuSlabHashContext::insertPair( /*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -110,16 +110,16 @@ GpuSlabHashContext::insertPair( * each other with a warp-cooperative work sharing (WCWS) strategy. * returns true only if a new key was inserted into the hash table */ -template +template __device__ __forceinline__ bool -GpuSlabHashContext::insertPairUnique( +GpuSlabHashContext::insertPairUnique( int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; diff --git a/src/concurrent_map/warp/search.cuh b/src/concurrent_map/warp/search.cuh index de490de..0bb39be 100644 --- a/src/concurrent_map/warp/search.cuh +++ b/src/concurrent_map/warp/search.cuh @@ -19,9 +19,9 @@ //================================================ // Individual Search Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::searchKey( +GpuSlabHashContext::searchKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, @@ -73,9 +73,9 @@ GpuSlabHashContext::searchKey( //================================================ // Bulk Search Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::searchKeyBulk( +GpuSlabHashContext::searchKeyBulk( const uint32_t& laneId, const KeyT& myKey, ValueT& myValue, diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh index 3a0fd6a..9510ef7 100644 --- a/src/concurrent_set/cset_class.cuh +++ b/src/concurrent_set/cset_class.cuh @@ -21,8 +21,8 @@ * used at runtime. This class does not own the allocated memory on the gpu * (i.e., d_table_) */ -template -class GpuSlabHashContext { +template +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -36,7 +36,7 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( - GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); @@ -57,7 +57,7 @@ class GpuSlabHashContext { const uint32_t hash_x, const uint32_t hash_y, int8_t* d_table, - AllocatorContextT* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; @@ -65,7 +65,7 @@ class GpuSlabHashContext { global_allocator_ctx_ = *allocator_ctx; } - __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -88,7 +88,7 @@ class GpuSlabHashContext { const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooeparte with each other to search for keys // if found, it returns the true, else false @@ -124,7 +124,7 @@ class GpuSlabHashContext { } __device__ __forceinline__ SlabAllocAddressT - allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -139,14 +139,14 @@ class GpuSlabHashContext { uint32_t hash_y_; typename ConcurrentSetT::SlabTypeT* d_table_; // a copy of dynamic allocator's context to be used on the GPU - AllocatorContextT global_allocator_ctx_; + SlabAllocLightContext global_allocator_ctx_; }; /* * This class owns the allocated memory for the hash table */ -template -class GpuSlabHash { +template +class GpuSlabHash { private: // fixed known parameters: static constexpr uint32_t BLOCKSIZE_ = 128; @@ -168,16 +168,16 @@ class GpuSlabHash { // slab hash context, contains everything that a GPU application needs to be // able to use this data structure - GpuSlabHashContext gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. The allocator itself is not owned by this class - DynamicAllocatorT* dynamic_allocator_; + SlabAllocLight* dynamic_allocator_; uint32_t device_idx_; public: GpuSlabHash(const uint32_t num_buckets, - DynamicAllocatorT* dynamic_allocator, + SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, const bool identity_hash = false) @@ -197,7 +197,7 @@ class GpuSlabHash { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); slab_unit_size_ = - GpuSlabHashContext::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); @@ -228,7 +228,7 @@ class GpuSlabHash { // returns some debug information about the slab hash std::string to_string(); double computeLoadFactor(int flag) {} - GpuSlabHashContext& getSlabHashContext() { + GpuSlabHashContext& getSlabHashContext() { return gpu_context_; } diff --git a/src/concurrent_set/cset_helper_kernels.cuh b/src/concurrent_set/cset_helper_kernels.cuh index 211f7f7..3c4c3a2 100644 --- a/src/concurrent_set/cset_helper_kernels.cuh +++ b/src/concurrent_set/cset_helper_kernels.cuh @@ -16,11 +16,11 @@ #pragma once namespace cset { -template +template __global__ void build_table_kernel( KeyT* d_key, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -29,7 +29,7 @@ __global__ void build_table_kernel( } // initializing the memory allocator on each warp: - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -46,12 +46,12 @@ __global__ void build_table_kernel( } //=== Individual search kernel: -template +template __global__ void search_table( KeyT* d_queries, KeyT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_set/cset_implementation.cuh b/src/concurrent_set/cset_implementation.cuh index d189054..dfa4375 100644 --- a/src/concurrent_set/cset_implementation.cuh +++ b/src/concurrent_set/cset_implementation.cuh @@ -16,8 +16,8 @@ #pragma once -template -void GpuSlabHash::buildBulk( +template +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -28,8 +28,8 @@ void GpuSlabHash::buildBulk( <<>>(d_key, num_keys, gpu_context_); } -template -void GpuSlabHash::searchIndividual( +template +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -39,8 +39,8 @@ void GpuSlabHash::searchIndividual( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -std::string GpuSlabHash::to_string() { +template +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; diff --git a/src/concurrent_set/cset_warp_operations.cuh b/src/concurrent_set/cset_warp_operations.cuh index 5211c75..3b1240a 100644 --- a/src/concurrent_set/cset_warp_operations.cuh +++ b/src/concurrent_set/cset_warp_operations.cuh @@ -16,14 +16,14 @@ #pragma once -template +template __device__ __forceinline__ bool -GpuSlabHashContext::insertKey( +GpuSlabHashContext::insertKey( bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentSetT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -91,9 +91,9 @@ GpuSlabHashContext::insertKey( } // ======== -template +template __device__ __forceinline__ bool -GpuSlabHashContext::searchKey( +GpuSlabHashContext::searchKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, @@ -138,9 +138,9 @@ GpuSlabHashContext::searchKey( return myResult; } -template +template __device__ __forceinline__ bool -GpuSlabHashContext::searchKeyBulk( +GpuSlabHashContext::searchKeyBulk( const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id) {} \ No newline at end of file diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index 03930b0..665088c 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -21,7 +21,8 @@ * This class acts as a helper class to simplify simulations around different * kinds of slab hash implementations */ -template +template class gpu_hash_table { private: uint32_t max_keys_; @@ -32,10 +33,13 @@ class gpu_hash_table { public: // Slab hash invariant - GpuSlabHash* slab_hash_; + GpuSlabHash* slab_hash_; + + SlabAllocLight* dynamic_allocator_; // the dynamic allocator that is being used for slab hash - DynamicAllocatorT* dynamic_allocator_; + //DynamicAllocatorT* dynamic_allocator_; + uint32_t device_idx_; @@ -77,10 +81,11 @@ class gpu_hash_table { //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); // allocate an initialize the allocator: - dynamic_allocator_ = new DynamicAllocatorT(); + //dynamic_allocator_ = new DynamicAllocatorT(); + dynamic_allocator_ = new SlabAllocLight; // slab hash: - slab_hash_ = new GpuSlabHash( + slab_hash_ = new GpuSlabHash( num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_); if (verbose) { std::cout << slab_hash_->to_string() << std::endl; diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index b5fa3b6..5ee1ea1 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -119,10 +119,10 @@ class PhaseConcurrentMapT { }; // the main class to be specialized for different types of hash tables -template +template class GpuSlabHash; -template +template class GpuSlabHashContext; // The custom allocator that is being used for this code: diff --git a/src/slab_iterator.cuh b/src/slab_iterator.cuh index 3cd0586..35199e6 100644 --- a/src/slab_iterator.cuh +++ b/src/slab_iterator.cuh @@ -19,12 +19,12 @@ // a forward iterator for the slab hash data structure: // currently just specialized for concurrent set // TODO implement for other types -template +template class SlabIterator { public: using SlabHashT = ConcurrentSetT; - GpuSlabHashContext& slab_hash_; + GpuSlabHashContext& slab_hash_; // current position of the iterator KeyT* cur_ptr_; @@ -35,7 +35,7 @@ class SlabIterator { // initialize the iterator with the first bucket's pointer address of the slab // hash __host__ __device__ - SlabIterator(GpuSlabHashContext& slab_hash) + SlabIterator(GpuSlabHashContext& slab_hash) : slab_hash_(slab_hash) , cur_ptr_(reinterpret_cast(slab_hash_.getDeviceTablePointer())) , cur_size_(slab_hash_.getNumBuckets() * SlabHashT::BASE_UNIT_SIZE) diff --git a/test/iterator_test.cu b/test/iterator_test.cu index bb0461f..7b5b9ee 100644 --- a/test/iterator_test.cu +++ b/test/iterator_test.cu @@ -28,9 +28,9 @@ #define DEVICE_ID 0 //======================================= -template +template __global__ void print_table( - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t wid = tid >> 5; uint32_t laneId = threadIdx.x & 0x1F; @@ -44,7 +44,7 @@ __global__ void print_table( if (tid == 0) { printf(" == Printing the base array\n"); - SlabIterator iter(slab_hash); + SlabIterator iter(slab_hash); for (int i = 0; i < iter.cur_size_; i++) { if ((i & 0x1F) == 0) printf(" == bucket %d:\n", i >> 5); From 10c00747ab97e689074e956dc1648ef7d14c9690 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sun, 18 Apr 2021 17:11:31 -0400 Subject: [PATCH 56/61] user configurable threshold resizing lf --- src/concurrent_map/cmap_class.cuh | 9 ++++++--- src/concurrent_map/cmap_implementation.cuh | 2 +- src/concurrent_set/cset_class.cuh | 3 ++- src/gpu_hash_table.cuh | 9 ++++++--- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index d89c5ef..542f3b0 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -222,21 +222,24 @@ class GpuSlabHash *dynamic_allocator_; uint32_t device_idx_; + float thresh_lf_; + public: GpuSlabHash(const uint32_t num_buckets, SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, - const bool identity_hash = false) + const bool identity_hash = false, + float thresh_lf = 0.60) : num_buckets_(num_buckets) , d_table_(nullptr) , slab_unit_size_(0) , dynamic_allocator_(dynamic_allocator) - , device_idx_(device_idx) { + , device_idx_(device_idx) + , thresh_lf_(thresh_lf) { assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); assert(sizeof(typename ConcurrentMapT::SlabTypeT) == (WARP_WIDTH_ * sizeof(uint32_t)) && diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index a76ec7d..0016e54 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -32,7 +32,7 @@ uint32_t GpuSlabHash= 0.90) { + if(finalSlabLoadFactor >= thresh_lf_) { numResizes = 1; } diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh index 9510ef7..8c86734 100644 --- a/src/concurrent_set/cset_class.cuh +++ b/src/concurrent_set/cset_class.cuh @@ -180,7 +180,8 @@ class GpuSlabHash* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, - const bool identity_hash = false) + const bool identity_hash = false, + float thresh_lf = 0.60) : num_buckets_(num_buckets) , d_table_(nullptr) , slab_unit_size_(0) diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index 665088c..0300b2d 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -50,16 +50,20 @@ class gpu_hash_table { ValueT* d_result_; uint32_t* d_count_; + float thresh_lf_; + gpu_hash_table(uint32_t max_keys, uint32_t num_buckets, const uint32_t device_idx, const int64_t seed, const bool req_values = true, const bool identity_hash = false, - const bool verbose = false) + const bool verbose = false, + float thresh_lf = 0.60) : max_keys_(max_keys) , num_buckets_(num_buckets) , seed_(seed) + , thresh_lf_(thresh_lf) , req_values_(req_values) , slab_hash_(nullptr) , identity_hash_(identity_hash) @@ -81,12 +85,11 @@ class gpu_hash_table { //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); // allocate an initialize the allocator: - //dynamic_allocator_ = new DynamicAllocatorT(); dynamic_allocator_ = new SlabAllocLight; // slab hash: slab_hash_ = new GpuSlabHash( - num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_); + num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_, thresh_lf_); if (verbose) { std::cout << slab_hash_->to_string() << std::endl; } From 148f1b8da3c6cccc33da148bcd11b6c76dd0791e Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sun, 18 Apr 2021 18:31:22 -0400 Subject: [PATCH 57/61] minor changes --- src/gpu_hash_table.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index 0300b2d..14de391 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -85,7 +85,7 @@ class gpu_hash_table { //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); // allocate an initialize the allocator: - dynamic_allocator_ = new SlabAllocLight; + dynamic_allocator_ = new SlabAllocLight(num_buckets); // slab hash: slab_hash_ = new GpuSlabHash( From e701dbf6a5bd4770c970d90da5683e6a2b4069bb Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sun, 18 Apr 2021 18:52:01 -0400 Subject: [PATCH 58/61] SlabAlloc changes --- SlabAlloc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SlabAlloc b/SlabAlloc index 95cb758..13983dd 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit 95cb7581b33ba13b29db4429a1f2213241bb1d47 +Subproject commit 13983dd0f92d2ef9b9079651d9a28ffe6da74a43 From b44ccde7b122d0c20f6a0b49e201c97f746760b9 Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sun, 18 Apr 2021 23:22:45 -0400 Subject: [PATCH 59/61] style fixes --- src/concurrent_map/cmap_implementation.cuh | 33 +++++++++++++--------- src/concurrent_map/device/build.cuh | 16 +++++------ 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 0016e54..80890a0 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -23,12 +23,12 @@ void GpuSlabHash -uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { +uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { auto numSlabs = gpu_context_.getTotalNumSlabs(); auto capacity = numSlabs * 16; // capacity in key-value size multiples auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; - //std::cout << "finalNumKeys " << finalNumKeys << std::endl; auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; auto numResizes = 0; @@ -40,7 +40,8 @@ uint32_t GpuSlabHash -void GpuSlabHash::buildBulk( +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -53,7 +54,6 @@ void GpuSlabHash <<>>(d_key, d_value, num_keys, gpu_context_); CHECK_CUDA_ERROR(cudaDeviceSynchronize()); @@ -64,7 +64,8 @@ void GpuSlabHash -void GpuSlabHash::buildBulkWithUniqueKeys( +void GpuSlabHash::buildBulkWithUniqueKeys( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -88,11 +89,11 @@ void GpuSlabHash -void GpuSlabHash::searchIndividual( +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -103,7 +104,8 @@ void GpuSlabHash -void GpuSlabHash::searchBulk( +void GpuSlabHash::searchBulk( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -114,7 +116,8 @@ void GpuSlabHash -void GpuSlabHash::countIndividual( +void GpuSlabHash::countIndividual( KeyT* d_query, uint32_t* d_count, uint32_t num_queries) { @@ -125,7 +128,8 @@ void GpuSlabHash -void GpuSlabHash::deleteIndividual( +void GpuSlabHash::deleteIndividual( KeyT* d_key, uint32_t num_keys) { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); @@ -136,7 +140,8 @@ void GpuSlabHash -void GpuSlabHash::batchedOperation( +void GpuSlabHash::batchedOperation( KeyT* d_key, ValueT* d_result, uint32_t num_ops) { @@ -147,7 +152,8 @@ void GpuSlabHash -std::string GpuSlabHash::to_string() { +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; @@ -162,7 +168,8 @@ std::string GpuSlabHash -double GpuSlabHash::computeLoadFactor( +double GpuSlabHash::computeLoadFactor( int flag = 0) { uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_]; uint32_t* d_bucket_pairs_count; diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index 3f00207..3530d21 100644 --- a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -23,7 +23,8 @@ __global__ void build_table_kernel( KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -31,7 +32,8 @@ __global__ void build_table_kernel( return; } - SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext + local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -55,7 +57,8 @@ __global__ void build_table_with_unique_keys_kernel( KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -63,11 +66,8 @@ __global__ void build_table_with_unique_keys_kernel( uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; - //if ((tid - laneId) >= num_keys) { - // return; - //} - - SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext + local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; From c956e08ee147bdcce4801a9c82ee7030d0130dcf Mon Sep 17 00:00:00 2001 From: Nicolas-Iskos Date: Sun, 18 Apr 2021 23:28:57 -0400 Subject: [PATCH 60/61] style fixes --- src/concurrent_map/cmap_class.cuh | 27 ++++++++++++++++++--------- src/concurrent_set/cset_class.cuh | 27 ++++++++++++++++++--------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 542f3b0..bbd74b1 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -23,7 +23,8 @@ * (i.e., d_table_) */ template -class GpuSlabHashContext { +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -36,7 +37,8 @@ class GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); total_num_slabs_ = rhs.getTotalNumSlabs(); total_num_keys_ = rhs.getTotalNumKeys(); @@ -61,7 +63,8 @@ class GpuSlabHashContext* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; @@ -72,7 +75,8 @@ class GpuSlabHashContext* allocator_ctx) { + __host__ void updateAllocatorContext(SlabAllocLightContext* allocator_ctx) { global_allocator_ctx_ = *allocator_ctx; total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool(); } @@ -81,7 +85,8 @@ class GpuSlabHashContext& getAllocatorContext() { + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -108,7 +113,8 @@ class GpuSlabHashContext& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash @@ -172,7 +178,8 @@ class GpuSlabHashContext& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& + local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -218,7 +225,8 @@ class GpuSlabHash gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. The allocator itself is not owned by this class @@ -251,7 +259,8 @@ class GpuSlabHash::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh index 8c86734..518d556 100644 --- a/src/concurrent_set/cset_class.cuh +++ b/src/concurrent_set/cset_class.cuh @@ -22,7 +22,8 @@ * (i.e., d_table_) */ template -class GpuSlabHashContext { +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -36,7 +37,8 @@ class GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); @@ -57,7 +59,8 @@ class GpuSlabHashContext* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; @@ -65,7 +68,8 @@ class GpuSlabHashContext& getAllocatorContext() { + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -88,7 +92,8 @@ class GpuSlabHashContext& local_allocator_context); + SlabAllocLightContext& + local_allocator_context); // threads in a warp cooeparte with each other to search for keys // if found, it returns the true, else false @@ -124,7 +129,8 @@ class GpuSlabHashContext& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& + local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -168,7 +174,8 @@ class GpuSlabHash gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. The allocator itself is not owned by this class @@ -198,7 +205,8 @@ class GpuSlabHash::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); @@ -229,7 +237,8 @@ class GpuSlabHash& getSlabHashContext() { + GpuSlabHashContext& getSlabHashContext() { return gpu_context_; } From cd2319d39c37b74c514abba2d469a1e3b33872f4 Mon Sep 17 00:00:00 2001 From: Nicolas Iskos <46684817+Nicolas-Iskos@users.noreply.github.com> Date: Tue, 20 Apr 2021 08:53:42 -0400 Subject: [PATCH 61/61] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2c8c171..8a20910 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # SlabHash -A warp-oriented dynamic hash table for GPUs +A warp-oriented dynamic hash table for GPUs. + +This fork of SlabHash is modified to allow for dynamic growth of the slab pool, allowing the slab hash to increase its total memory footprint as keys are inserted during run time. ## Publication: This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper: @@ -489,4 +491,4 @@ init lf final lf num buckets init build rate(M/s) concurre 0.59 0.58 80660 501.725 639.850 0.64 0.62 69906 493.702 601.822 -``` \ No newline at end of file +```