diff --git a/.gitmodules b/.gitmodules index fcfa184..201a205 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,12 @@ [submodule "SlabAlloc"] path = SlabAlloc - url = https://github.com/owensgroup/SlabAlloc + url = https://github.com/Nicolas-Iskos/SlabAlloc [submodule "ThirdParty/rapidjson"] path = ThirdParty/rapidjson url = https://github.com/Tencent/rapidjson [submodule "ThirdParty/googletest"] path = ThirdParty/googletest url = https://github.com/google/googletest +[submodule "cub"] + path = cub + url = https://github.com/NVIDIA/cub.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 2310713..72c7a6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ find_package(CUDA 8.0 REQUIRED) option(CMAKE_VERBOSE_MAKEFILE ON) option(DGTEST, "DGTEST" ON) -set(CUDA_NVCC_FLAGS -std=c++11) +set(CUDA_NVCC_FLAGS -std=c++14) set (CMAKE_CXX_STANDARD 11) if (CUDA_VERBOSE_PTXAS) @@ -83,6 +83,7 @@ if(SLABHASH_GENCODE_SM75) endif(SLABHASH_GENCODE_SM75) include_directories(SlabAlloc/src) +include_directories(cub) include_directories(src src/concurrent) include_directories(ThirdParty/rapidjson/include) include_directories(ThirdParty/googletest/googletest) diff --git a/README.md b/README.md index 2c8c171..8a20910 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # SlabHash -A warp-oriented dynamic hash table for GPUs +A warp-oriented dynamic hash table for GPUs. + +This fork of SlabHash is modified to allow for dynamic growth of the slab pool, allowing the slab hash to increase its total memory footprint as keys are inserted during run time. ## Publication: This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper: @@ -489,4 +491,4 @@ init lf final lf num buckets init build rate(M/s) concurre 0.59 0.58 80660 501.725 639.850 0.64 0.62 69906 493.702 601.822 -``` \ No newline at end of file +``` diff --git a/SlabAlloc b/SlabAlloc index d655028..13983dd 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit d6550289344cd4474d237e959bfd6bc962981af7 +Subproject commit 13983dd0f92d2ef9b9079651d9a28ffe6da74a43 diff --git a/cub b/cub new file mode 160000 index 0000000..c3be9a9 --- /dev/null +++ b/cub @@ -0,0 +1 @@ +Subproject commit c3be9a94273b5049520aacc7db00c738668aaa3f diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 1329740..bbd74b1 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -22,8 +22,9 @@ * used at runtime. 
This class does not own the allocated memory on the gpu * (i.e., d_table_) */ -template -class GpuSlabHashContext { +template +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -31,12 +32,16 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext() - : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) {} + : num_buckets_(0), total_num_slabs_(0), total_num_keys_(0), + hash_x_(0), hash_y_(0), d_table_(nullptr) {} #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( - GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); + total_num_slabs_ = rhs.getTotalNumSlabs(); + total_num_keys_ = rhs.getTotalNumKeys(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); d_table_ = rhs.getDeviceTablePointer(); @@ -58,16 +63,30 @@ class GpuSlabHashContext { const uint32_t hash_x, const uint32_t hash_y, int8_t* d_table, - AllocatorContextT* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; d_table_ = reinterpret_cast::SlabTypeT*>(d_table); global_allocator_ctx_ = *allocator_ctx; + total_num_slabs_ = num_buckets + global_allocator_ctx_.getNumSlabsInPool(); + total_num_keys_ = 0; } - __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { + __host__ void updateAllocatorContext(SlabAllocLightContext* allocator_ctx) { + global_allocator_ctx_ = *allocator_ctx; + total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool(); + } + + __host__ void updateTotalNumKeys(uint32_t keysAdded) { + total_num_keys_ += keysAdded; + } + + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -76,6 +95,8 @@ class GpuSlabHashContext { return d_table_; } + __device__ __host__ __forceinline__ uint32_t getTotalNumSlabs() {return total_num_slabs_; } + __device__ __host__ __forceinline__ uint32_t getTotalNumKeys() {return total_num_keys_; } __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } @@ -86,22 +107,25 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert key-value pairs // into the slab hash - __device__ __forceinline__ void insertPair(bool& to_be_inserted, + __device__ __forceinline__ void insertPair(/*bool& mySuccess,*/ + bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash __device__ __forceinline__ bool insertPairUnique( + int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to search for keys // if found, it returns the corresponding value, else SEARCH_NOT_FOUND @@ -154,7 +178,8 @@ class GpuSlabHashContext { } __device__ __forceinline__ SlabAllocAddressT - allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& + 
local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -165,18 +190,20 @@ class GpuSlabHashContext { // === members: uint32_t num_buckets_; + uint32_t total_num_slabs_; + uint32_t total_num_keys_; uint32_t hash_x_; uint32_t hash_y_; typename ConcurrentMapT::SlabTypeT* d_table_; // a copy of dynamic allocator's context to be used on the GPU - AllocatorContextT global_allocator_ctx_; + SlabAllocLightContext global_allocator_ctx_; }; /* * This class owns the allocated memory for the hash table */ -template -class GpuSlabHash { +template +class GpuSlabHash { private: // fixed known parameters: static constexpr uint32_t BLOCKSIZE_ = 128; @@ -198,24 +225,29 @@ class GpuSlabHash { // slab hash context, contains everything that a GPU application needs to be // able to use this data structure - GpuSlabHashContext gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. The allocator itself is not owned by this class - DynamicAllocatorT* dynamic_allocator_; + SlabAllocLight *dynamic_allocator_; uint32_t device_idx_; + float thresh_lf_; + public: GpuSlabHash(const uint32_t num_buckets, - DynamicAllocatorT* dynamic_allocator, + SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, - const bool identity_hash = false) + const bool identity_hash = false, + float thresh_lf = 0.60) : num_buckets_(num_buckets) , d_table_(nullptr) , slab_unit_size_(0) , dynamic_allocator_(dynamic_allocator) - , device_idx_(device_idx) { + , device_idx_(device_idx) + , thresh_lf_(thresh_lf) { assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); assert(sizeof(typename ConcurrentMapT::SlabTypeT) == (WARP_WIDTH_ * sizeof(uint32_t)) && @@ -227,7 +259,8 @@ class GpuSlabHash { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); slab_unit_size_ = - GpuSlabHashContext::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); @@ -258,7 +291,9 @@ class GpuSlabHash { // returns some debug information about the slab hash std::string to_string(); double computeLoadFactor(int flag); - + + void resize(); + uint32_t checkForPreemptiveResize(uint32_t keysAdded); void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); void buildBulkWithUniqueKeys(KeyT* d_key, ValueT* d_value, uint32_t num_keys); void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 99880a8..80890a0 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -16,30 +16,84 @@ #pragma once -template -void GpuSlabHash::buildBulk( +template +void GpuSlabHash::resize() { + dynamic_allocator_->growPool(); + gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); +} + +template +uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { + auto numSlabs = gpu_context_.getTotalNumSlabs(); + + auto capacity = numSlabs * 16; // capacity in key-value size multiples + auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; + auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; + auto numResizes = 0; + + if(finalSlabLoadFactor >= thresh_lf_) { + numResizes = 1; + } + + return numResizes; +} + +template +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t 
num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - // calling the kernel for bulk build: CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + + // calling the kernel for bulk build: build_table_kernel <<>>(d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // now that the bulk insert has completed successfully, we can + // update the total number of keys in the table + gpu_context_.updateTotalNumKeys(num_keys); } -template -void GpuSlabHash::buildBulkWithUniqueKeys( - KeyT* d_key, - ValueT* d_value, - uint32_t num_keys) { + +template +void GpuSlabHash::buildBulkWithUniqueKeys( + KeyT* d_key, + ValueT* d_value, + uint32_t num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - // calling the kernel for bulk build: CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + + // calling the kernel for bulk build: + int *num_successes; + CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int))); + *num_successes = 0; + build_table_with_unique_keys_kernel - <<>>(d_key, d_value, num_keys, gpu_context_); + <<>>(num_successes, d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // now that the bulk insert has completed successfully, we can + // update the total number of keys in the table + gpu_context_.updateTotalNumKeys(*num_successes); } -template -void GpuSlabHash::searchIndividual( + +template +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -49,8 +103,9 @@ void GpuSlabHash::searchIndividual( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -void GpuSlabHash::searchBulk( +template +void GpuSlabHash::searchBulk( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -60,8 +115,9 @@ void GpuSlabHash::searchBulk( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -void GpuSlabHash::countIndividual( +template +void GpuSlabHash::countIndividual( KeyT* d_query, uint32_t* d_count, uint32_t num_queries) { @@ -71,8 +127,9 @@ void GpuSlabHash::countIndividual( <<>>(d_query, d_count, num_queries, gpu_context_); } -template -void GpuSlabHash::deleteIndividual( +template +void GpuSlabHash::deleteIndividual( KeyT* d_key, uint32_t num_keys) { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); @@ -82,8 +139,9 @@ void GpuSlabHash::deleteIndividual( } // perform a batch of (a mixture of) updates/searches -template -void GpuSlabHash::batchedOperation( +template +void GpuSlabHash::batchedOperation( KeyT* d_key, ValueT* d_result, uint32_t num_ops) { @@ -93,8 +151,9 @@ void GpuSlabHash::batchedOperation( <<>>(d_key, d_result, num_ops, gpu_context_); } -template -std::string GpuSlabHash::to_string() { +template +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; @@ -108,8 +167,9 @@ std::string GpuSlabHash::to_string() return result; } -template -double GpuSlabHash::computeLoadFactor( +template +double GpuSlabHash::computeLoadFactor( int flag = 0) { uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_]; uint32_t* d_bucket_pairs_count; diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index f48b5e5..3530d21 100644 --- 
a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -18,12 +18,13 @@ /* * */ -template +template __global__ void build_table_kernel( KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -31,7 +32,8 @@ __global__ void build_table_kernel( return; } - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext + local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -49,25 +51,29 @@ __global__ void build_table_kernel( slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); } -template +template __global__ void build_table_with_unique_keys_kernel( + int *num_successes, KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; - if ((tid - laneId) >= num_keys) { - return; - } - - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext + local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; + int mySuccess = 0; bool to_insert = false; if (tid < num_keys) { @@ -77,6 +83,13 @@ __global__ void build_table_with_unique_keys_kernel( to_insert = true; } - slab_hash.insertPairUnique( - to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + if ((tid - laneId) < num_keys) { + slab_hash.insertPairUnique(mySuccess, + to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + } + + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(mySuccess); + if(threadIdx.x == 0) { + atomicAdd(num_successes, block_num_successes); + } } \ No newline at end of file diff --git a/src/concurrent_map/device/concurrent_kernel.cuh b/src/concurrent_map/device/concurrent_kernel.cuh index d72c546..8cd7626 100644 --- a/src/concurrent_map/device/concurrent_kernel.cuh +++ b/src/concurrent_map/device/concurrent_kernel.cuh @@ -16,12 +16,12 @@ #pragma once -template +template __global__ void batched_operations( uint32_t* d_operations, uint32_t* d_results, uint32_t num_operations, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -29,7 +29,7 @@ __global__ void batched_operations( return; // initializing the memory allocator on each warp: - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); uint32_t myOperation = 0; diff --git a/src/concurrent_map/device/count_kernel.cuh b/src/concurrent_map/device/count_kernel.cuh index 99fffba..d71da3d 100644 --- a/src/concurrent_map/device/count_kernel.cuh +++ b/src/concurrent_map/device/count_kernel.cuh @@ -16,12 +16,12 @@ #pragma once -template +template __global__ void count_key( KeyT* d_queries, uint32_t* d_counts, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; 
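The resize machinery added above in cmap_implementation.cuh reduces to a projected-load-factor check: total_num_slabs_ counts the base bucket slabs plus the slabs currently in the allocator pool, capacity is approximated as 16 key-value-sized slots per 128-byte slab, and the pool is grown once via the allocator's growPool() whenever the load factor projected after a bulk insert would reach thresh_lf_ (0.60 by default). The host-side sketch below restates that heuristic with a hypothetical free function; it illustrates the policy and is not the fork's exact API.

// Sketch of the decision made by checkForPreemptiveResize(). Assumption: each
// 128-byte slab is counted as 16 key-value slots, matching the "capacity in
// key-value size multiples" comment in the diff above.
#include <cstdint>

inline uint32_t check_preemptive_resize(uint32_t total_num_slabs,  // buckets + pool slabs
                                        uint32_t total_num_keys,   // keys already stored
                                        uint32_t keys_added,       // size of incoming batch
                                        float thresh_lf) {         // e.g. 0.60
  const float capacity = static_cast<float>(total_num_slabs) * 16.0f;
  const float projected_lf =
      static_cast<float>(total_num_keys + keys_added) / capacity;
  // One growPool() round is requested if the projected load factor crosses the
  // threshold; otherwise no resize happens before the bulk insert.
  return projected_lf >= thresh_lf ? 1u : 0u;
}

buildBulk() and buildBulkWithUniqueKeys() run this check before launching their kernels, call resize() the indicated number of times, and resize() in turn calls dynamic_allocator_->growPool() and refreshes the device-visible context with updateAllocatorContext(), so that subsequent warpAllocate() calls can draw from the enlarged pool.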
diff --git a/src/concurrent_map/device/delete_kernel.cuh b/src/concurrent_map/device/delete_kernel.cuh index 9542623..edc47cc 100644 --- a/src/concurrent_map/device/delete_kernel.cuh +++ b/src/concurrent_map/device/delete_kernel.cuh @@ -16,11 +16,11 @@ #pragma once -template +template __global__ void delete_table_keys( KeyT* d_key_deleted, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/device/misc_kernels.cuh b/src/concurrent_map/device/misc_kernels.cuh index 5f9e5b8..0dfef23 100644 --- a/src/concurrent_map/device/misc_kernels.cuh +++ b/src/concurrent_map/device/misc_kernels.cuh @@ -21,9 +21,9 @@ * slabs per bucket. The final results per bucket is stored in d_pairs_count_result and * d_slabs_count_result arrays respectively */ -template +template __global__ void bucket_count_kernel( - GpuSlabHashContext slab_hash, + GpuSlabHashContext slab_hash, uint32_t* d_pairs_count_result, uint32_t* d_slabs_count_result, uint32_t num_buckets) { diff --git a/src/concurrent_map/device/search_kernel.cuh b/src/concurrent_map/device/search_kernel.cuh index c5f38e6..e01ec56 100644 --- a/src/concurrent_map/device/search_kernel.cuh +++ b/src/concurrent_map/device/search_kernel.cuh @@ -17,12 +17,12 @@ #pragma once //=== Individual search kernel: -template +template __global__ void search_table( KeyT* d_queries, ValueT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -49,12 +49,12 @@ __global__ void search_table( } //=== Bulk search kernel: -template +template __global__ void search_table_bulk( KeyT* d_queries, ValueT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/warp/count.cuh b/src/concurrent_map/warp/count.cuh index 4e9a70f..e3d0ede 100644 --- a/src/concurrent_map/warp/count.cuh +++ b/src/concurrent_map/warp/count.cuh @@ -19,9 +19,9 @@ //================================================ // Individual Count Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::countKey( +GpuSlabHashContext::countKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/warp/delete.cuh b/src/concurrent_map/warp/delete.cuh index d8d6685..dc65ced 100644 --- a/src/concurrent_map/warp/delete.cuh +++ b/src/concurrent_map/warp/delete.cuh @@ -16,9 +16,9 @@ #pragma once -template +template __device__ __forceinline__ bool -GpuSlabHashContext::deleteKey( +GpuSlabHashContext::deleteKey( bool& to_be_deleted, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 8653ddf..2a88bd7 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -21,15 +21,16 @@ * it is assumed all threads within a warp are present and collaborating with * each other with a warp-cooperative work sharing (WCWS) strategy. 
*/ -template +template __device__ __forceinline__ void -GpuSlabHashContext::insertPair( +GpuSlabHashContext::insertPair( + /*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -54,6 +55,11 @@ GpuSlabHashContext::insertPair( if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); + if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased + //mySuccess = false; // signal that this key needs to be reinserted + to_be_inserted = false; + continue; + } // TODO: experiment if it's better to use lane 0 instead if (laneId == 31) { @@ -87,8 +93,10 @@ GpuSlabHashContext::insertPair( << 32) | *reinterpret_cast( reinterpret_cast(&myKey))); - if (old_key_value_pair == EMPTY_PAIR_64) - to_be_inserted = false; // succesfful insertion + if (old_key_value_pair == EMPTY_PAIR_64) { + //mySuccess = true; + to_be_inserted = false; // successful insertion + } } } last_work_queue = work_queue; @@ -102,15 +110,17 @@ GpuSlabHashContext::insertPair( * each other with a warp-cooperative work sharing (WCWS) strategy. * returns true only if a new key was inserted into the hash table */ -template +template __device__ __forceinline__ bool -GpuSlabHashContext::insertPairUnique( +GpuSlabHashContext::insertPairUnique( + int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { + using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -135,14 +145,19 @@ GpuSlabHashContext::insertPairUnique uint32_t isExisting = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & SlabHashT::REGULAR_NODE_KEY_MASK; if (isExisting) { // key exist in the hash table - if (laneId == src_lane) + if (laneId == src_lane) { to_be_inserted = false; + } } else { if (isEmpty == 0) { // no empty slot available: uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); + if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased + to_be_inserted = false; + continue; + } if (laneId == 31) { const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) @@ -176,6 +191,7 @@ GpuSlabHashContext::insertPairUnique *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { + mySuccess += 1; to_be_inserted = false; // successful insertion new_insertion = true; } diff --git a/src/concurrent_map/warp/search.cuh b/src/concurrent_map/warp/search.cuh index de490de..0bb39be 100644 --- a/src/concurrent_map/warp/search.cuh +++ b/src/concurrent_map/warp/search.cuh @@ -19,9 +19,9 @@ //================================================ // Individual Search Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::searchKey( +GpuSlabHashContext::searchKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, @@ -73,9 +73,9 @@ GpuSlabHashContext::searchKey( 
//================================================ // Bulk Search Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::searchKeyBulk( +GpuSlabHashContext::searchKeyBulk( const uint32_t& laneId, const KeyT& myKey, ValueT& myValue, diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh index 3a0fd6a..518d556 100644 --- a/src/concurrent_set/cset_class.cuh +++ b/src/concurrent_set/cset_class.cuh @@ -21,8 +21,9 @@ * used at runtime. This class does not own the allocated memory on the gpu * (i.e., d_table_) */ -template -class GpuSlabHashContext { +template +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -36,7 +37,8 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( - GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); @@ -57,7 +59,8 @@ class GpuSlabHashContext { const uint32_t hash_x, const uint32_t hash_y, int8_t* d_table, - AllocatorContextT* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; @@ -65,7 +68,8 @@ class GpuSlabHashContext { global_allocator_ctx_ = *allocator_ctx; } - __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -88,7 +92,8 @@ class GpuSlabHashContext { const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& + local_allocator_context); // threads in a warp cooeparte with each other to search for keys // if found, it returns the true, else false @@ -124,7 +129,8 @@ class GpuSlabHashContext { } __device__ __forceinline__ SlabAllocAddressT - allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& + local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -139,14 +145,14 @@ class GpuSlabHashContext { uint32_t hash_y_; typename ConcurrentSetT::SlabTypeT* d_table_; // a copy of dynamic allocator's context to be used on the GPU - AllocatorContextT global_allocator_ctx_; + SlabAllocLightContext global_allocator_ctx_; }; /* * This class owns the allocated memory for the hash table */ -template -class GpuSlabHash { +template +class GpuSlabHash { private: // fixed known parameters: static constexpr uint32_t BLOCKSIZE_ = 128; @@ -168,19 +174,21 @@ class GpuSlabHash { // slab hash context, contains everything that a GPU application needs to be // able to use this data structure - GpuSlabHashContext gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. 
The allocator itself is not owned by this class - DynamicAllocatorT* dynamic_allocator_; + SlabAllocLight* dynamic_allocator_; uint32_t device_idx_; public: GpuSlabHash(const uint32_t num_buckets, - DynamicAllocatorT* dynamic_allocator, + SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, - const bool identity_hash = false) + const bool identity_hash = false, + float thresh_lf = 0.60) : num_buckets_(num_buckets) , d_table_(nullptr) , slab_unit_size_(0) @@ -197,7 +205,8 @@ class GpuSlabHash { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); slab_unit_size_ = - GpuSlabHashContext::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); @@ -228,7 +237,8 @@ class GpuSlabHash { // returns some debug information about the slab hash std::string to_string(); double computeLoadFactor(int flag) {} - GpuSlabHashContext& getSlabHashContext() { + GpuSlabHashContext& getSlabHashContext() { return gpu_context_; } diff --git a/src/concurrent_set/cset_helper_kernels.cuh b/src/concurrent_set/cset_helper_kernels.cuh index 211f7f7..3c4c3a2 100644 --- a/src/concurrent_set/cset_helper_kernels.cuh +++ b/src/concurrent_set/cset_helper_kernels.cuh @@ -16,11 +16,11 @@ #pragma once namespace cset { -template +template __global__ void build_table_kernel( KeyT* d_key, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -29,7 +29,7 @@ __global__ void build_table_kernel( } // initializing the memory allocator on each warp: - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -46,12 +46,12 @@ __global__ void build_table_kernel( } //=== Individual search kernel: -template +template __global__ void search_table( KeyT* d_queries, KeyT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_set/cset_implementation.cuh b/src/concurrent_set/cset_implementation.cuh index d189054..dfa4375 100644 --- a/src/concurrent_set/cset_implementation.cuh +++ b/src/concurrent_set/cset_implementation.cuh @@ -16,8 +16,8 @@ #pragma once -template -void GpuSlabHash::buildBulk( +template +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -28,8 +28,8 @@ void GpuSlabHash::buildBulk( <<>>(d_key, num_keys, gpu_context_); } -template -void GpuSlabHash::searchIndividual( +template +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -39,8 +39,8 @@ void GpuSlabHash::searchIndividual( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -std::string GpuSlabHash::to_string() { +template +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; diff --git a/src/concurrent_set/cset_warp_operations.cuh b/src/concurrent_set/cset_warp_operations.cuh index 5211c75..3b1240a 100644 --- a/src/concurrent_set/cset_warp_operations.cuh +++ b/src/concurrent_set/cset_warp_operations.cuh @@ -16,14 +16,14 @@ #pragma once -template +template __device__ __forceinline__ bool 
-GpuSlabHashContext::insertKey( +GpuSlabHashContext::insertKey( bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentSetT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -91,9 +91,9 @@ GpuSlabHashContext::insertKey( } // ======== -template +template __device__ __forceinline__ bool -GpuSlabHashContext::searchKey( +GpuSlabHashContext::searchKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, @@ -138,9 +138,9 @@ GpuSlabHashContext::searchKey( return myResult; } -template +template __device__ __forceinline__ bool -GpuSlabHashContext::searchKeyBulk( +GpuSlabHashContext::searchKeyBulk( const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id) {} \ No newline at end of file diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index 167245e..14de391 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -21,7 +21,8 @@ * This class acts as a helper class to simplify simulations around different * kinds of slab hash implementations */ -template +template class gpu_hash_table { private: uint32_t max_keys_; @@ -32,10 +33,13 @@ class gpu_hash_table { public: // Slab hash invariant - GpuSlabHash* slab_hash_; + GpuSlabHash* slab_hash_; + + SlabAllocLight* dynamic_allocator_; // the dynamic allocator that is being used for slab hash - DynamicAllocatorT* dynamic_allocator_; + //DynamicAllocatorT* dynamic_allocator_; + uint32_t device_idx_; @@ -46,16 +50,20 @@ class gpu_hash_table { ValueT* d_result_; uint32_t* d_count_; + float thresh_lf_; + gpu_hash_table(uint32_t max_keys, uint32_t num_buckets, const uint32_t device_idx, const int64_t seed, const bool req_values = true, const bool identity_hash = false, - const bool verbose = false) + const bool verbose = false, + float thresh_lf = 0.60) : max_keys_(max_keys) , num_buckets_(num_buckets) , seed_(seed) + , thresh_lf_(thresh_lf) , req_values_(req_values) , slab_hash_(nullptr) , identity_hash_(identity_hash) @@ -74,14 +82,14 @@ class gpu_hash_table { } CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); - + //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); + // allocate an initialize the allocator: - dynamic_allocator_ = new DynamicAllocatorT(); + dynamic_allocator_ = new SlabAllocLight(num_buckets); // slab hash: - slab_hash_ = new GpuSlabHash( - num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_); + slab_hash_ = new GpuSlabHash( + num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_, thresh_lf_); if (verbose) { std::cout << slab_hash_->to_string() << std::endl; } @@ -95,7 +103,7 @@ class gpu_hash_table { } CHECK_CUDA_ERROR(cudaFree(d_query_)); CHECK_CUDA_ERROR(cudaFree(d_result_)); - CHECK_CUDA_ERROR(cudaFree(d_count_)); + //CHECK_CUDA_ERROR(cudaFree(d_count_)); // delete the dynamic allocator: delete dynamic_allocator_; diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 81c9d17..5ee1ea1 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -17,6 +17,7 @@ #pragma once #include "slab_alloc.cuh" +#include "cub/cub.cuh" #define CHECK_CUDA_ERROR(call) \ do { \ @@ -118,17 +119,17 @@ class PhaseConcurrentMapT { }; // the main class to be specialized for different 
types of hash tables -template +template class GpuSlabHash; -template +template class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 8; -constexpr uint32_t num_super_blocks = 32; +constexpr uint32_t log_num_mem_blocks = 9; // 64 MB +constexpr uint32_t num_super_blocks = 15; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par diff --git a/src/slab_iterator.cuh b/src/slab_iterator.cuh index 3cd0586..35199e6 100644 --- a/src/slab_iterator.cuh +++ b/src/slab_iterator.cuh @@ -19,12 +19,12 @@ // a forward iterator for the slab hash data structure: // currently just specialized for concurrent set // TODO implement for other types -template +template class SlabIterator { public: using SlabHashT = ConcurrentSetT; - GpuSlabHashContext& slab_hash_; + GpuSlabHashContext& slab_hash_; // current position of the iterator KeyT* cur_ptr_; @@ -35,7 +35,7 @@ class SlabIterator { // initialize the iterator with the first bucket's pointer address of the slab // hash __host__ __device__ - SlabIterator(GpuSlabHashContext& slab_hash) + SlabIterator(GpuSlabHashContext& slab_hash) : slab_hash_(slab_hash) , cur_ptr_(reinterpret_cast(slab_hash_.getDeviceTablePointer())) , cur_size_(slab_hash_.getNumBuckets() * SlabHashT::BASE_UNIT_SIZE) diff --git a/test/iterator_test.cu b/test/iterator_test.cu index bb0461f..7b5b9ee 100644 --- a/test/iterator_test.cu +++ b/test/iterator_test.cu @@ -28,9 +28,9 @@ #define DEVICE_ID 0 //======================================= -template +template __global__ void print_table( - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t wid = tid >> 5; uint32_t laneId = threadIdx.x & 0x1F; @@ -44,7 +44,7 @@ __global__ void print_table( if (tid == 0) { printf(" == Printing the base array\n"); - SlabIterator iter(slab_hash); + SlabIterator iter(slab_hash); for (int i = 0; i < iter.cur_size_; i++) { if ((i & 0x1F) == 0) printf(" == bucket %d:\n", i >> 5);
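Two behavioral changes in the hunks above are worth spelling out. First, build_table_with_unique_keys_kernel now counts successful insertions: the early return is replaced by a (tid - laneId) < num_keys guard so that every thread in the block reaches the cub::BlockReduce collective, each thread's mySuccess flag is summed per block, and thread 0 atomically adds the block total to a managed num_successes counter that buildBulkWithUniqueKeys() reads after cudaDeviceSynchronize() and passes to updateTotalNumKeys(). Second, both insertPair() and insertPairUnique() now treat a warpAllocate() return value of 0xFFFFFFFF as an exhausted pool and clear to_be_inserted for the affected key; the commented-out mySuccess lines in insertPair() ("signal that this key needs to be reinserted") suggest re-insertion signalling is intended as follow-up work. The standalone CUDA sketch below isolates the block-level counting pattern; the kernel, buffer names, and block size are illustrative stand-ins, not part of the SlabHash sources.

// Block-wide success counting with cub::BlockReduce, mirroring the pattern the
// diff adds to build_table_with_unique_keys_kernel. Hypothetical names throughout.
#include <cub/cub.cuh>
#include <cstdio>

constexpr int kBlockSize = 128;  // matches GpuSlabHash::BLOCKSIZE_

__global__ void count_successes(const int* d_flags, int n, int* d_num_successes) {
  typedef cub::BlockReduce<int, kBlockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Stand-in for the per-thread mySuccess flag produced by insertPairUnique().
  const int my_success = (tid < n) ? d_flags[tid] : 0;

  // Collective operation: every thread of the block must participate, which is
  // why the kernel above guards the insert call instead of returning early.
  const int block_sum = BlockReduce(temp_storage).Sum(my_success);
  if (threadIdx.x == 0) {
    atomicAdd(d_num_successes, block_sum);  // one atomic per block
  }
}

int main() {
  const int n = 1000;
  int *d_flags = nullptr, *d_total = nullptr;
  cudaMallocManaged(&d_flags, n * sizeof(int));
  cudaMallocManaged(&d_total, sizeof(int));
  for (int i = 0; i < n; ++i) d_flags[i] = 1;  // pretend every insertion succeeded
  *d_total = 0;

  count_successes<<<(n + kBlockSize - 1) / kBlockSize, kBlockSize>>>(d_flags, n, d_total);
  cudaDeviceSynchronize();
  printf("successes: %d (expected %d)\n", *d_total, n);

  cudaFree(d_flags);
  cudaFree(d_total);
  return 0;
}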