diff --git a/include/gpu_engine.cuh b/include/gpu_engine.cuh index b033035..effb7b2 100644 --- a/include/gpu_engine.cuh +++ b/include/gpu_engine.cuh @@ -369,6 +369,53 @@ void size( } } +// for Core = MultiBucketHashTable +template +GLOBALQUALIFIER +void num_values( + index_t * const num_out, + const Core core) +{ + __shared__ index_t smem; + + const index_t tid = helpers::global_thread_id(); + const auto block = cg::this_thread_block(); + + if(tid < core.capacity()) + { + const bool empty = !core.is_valid_key(core.table_[tid].key); + + if(block.thread_rank() == 0) + { + smem = 0; + } + + block.sync(); + + index_t value_count = 0; + if(!empty) + { + const auto bucket = core.table_[tid].value; + #pragma unroll + for(int b = 0; b < core.bucket_size(); ++b) { + const auto& value = bucket[b]; + if(value != core.empty_value()) + ++value_count; + } + + // TODO warp reduce + atomicAdd(&smem, value_count); + } + + block.sync(); + + if(block.thread_rank() == 0 && smem != 0) + { + atomicAdd(num_out, smem); + } + } +} + template GLOBALQUALIFIER void num_values( diff --git a/include/multi_bucket_hash_table.cuh b/include/multi_bucket_hash_table.cuh index 0f1cf0e..3f27ac3 100644 --- a/include/multi_bucket_hash_table.cuh +++ b/include/multi_bucket_hash_table.cuh @@ -996,6 +996,36 @@ public: } } + /*! \brief number of occupied slots in the hash table + * \param[in] stream CUDA stream in which this operation is executed in + * \return the number of occupied slots + */ + HOSTQUALIFIER INLINEQUALIFIER + index_type num_occupied(const cudaStream_t stream = 0) const noexcept + { + if(!is_initialized_) return 0; + + index_type out; + index_type * tmp = temp_.get(); + + cudaMemsetAsync(tmp, 0, sizeof(index_t), stream); + + kernels::size + <<>> + (tmp, *this); + + cudaMemcpyAsync( + &out, + tmp, + sizeof(index_type), + D2H, + stream); + + cudaStreamSynchronize(stream); + + return out; + } + /*! \brief number of values stored inside the hash table * \info alias for \c size() * \param[in] stream CUDA stream in which this operation is executed in @@ -1021,49 +1051,9 @@ public: cudaMemsetAsync(tmp, 0, sizeof(index_t), stream); - helpers::lambda_kernel + kernels::num_values <<>> - ([=, *this] DEVICEQUALIFIER - { - __shared__ index_t smem; - - const index_t tid = helpers::global_thread_id(); - const auto block = cg::this_thread_block(); - - if(tid < capacity()) - { - const bool empty = !is_valid_key(table_[tid].key); - - if(block.thread_rank() == 0) - { - smem = 0; - } - - block.sync(); - - index_t value_count = 0; - if(!empty) - { - const auto bucket = table_[tid].value; - #pragma unroll - for(int b = 0; b < bucket_size(); ++b) { - const auto& value = bucket[b]; - if(value != empty_value()) - ++value_count; - } - - // TODO warp reduce - atomicAdd(&smem, value_count); - } - - block.sync(); - - if(block.thread_rank() == 0 && smem != 0) - { - atomicAdd(tmp, smem); - } - } - }); + (tmp, *this); cudaMemcpyAsync( &out, @@ -1084,7 +1074,7 @@ public: HOSTQUALIFIER INLINEQUALIFIER float key_load_factor(const cudaStream_t stream = 0) const noexcept { - return float(num_keys(stream)) / float(capacity()); + return float(num_occupied(stream)) / float(capacity()); } /*! \brief current load factor of the hash table @@ -1336,6 +1326,10 @@ private: GLOBALQUALIFIER friend void kernels::size(index_type * const, const Core); + template + GLOBALQUALIFIER + friend void kernels::num_values(index_type * const, const Core); + template GLOBALQUALIFIER friend void kernels::for_each(Func, const Core);