diff --git a/.gitmodules b/.gitmodules index fcfa184..201a205 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,12 @@ [submodule "SlabAlloc"] path = SlabAlloc - url = https://github.com/owensgroup/SlabAlloc + url = https://github.com/Nicolas-Iskos/SlabAlloc [submodule "ThirdParty/rapidjson"] path = ThirdParty/rapidjson url = https://github.com/Tencent/rapidjson [submodule "ThirdParty/googletest"] path = ThirdParty/googletest url = https://github.com/google/googletest +[submodule "cub"] + path = cub + url = https://github.com/NVIDIA/cub.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 2310713..72c7a6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ find_package(CUDA 8.0 REQUIRED) option(CMAKE_VERBOSE_MAKEFILE ON) option(DGTEST, "DGTEST" ON) -set(CUDA_NVCC_FLAGS -std=c++11) +set(CUDA_NVCC_FLAGS -std=c++14) set (CMAKE_CXX_STANDARD 11) if (CUDA_VERBOSE_PTXAS) @@ -83,6 +83,7 @@ if(SLABHASH_GENCODE_SM75) endif(SLABHASH_GENCODE_SM75) include_directories(SlabAlloc/src) +include_directories(cub) include_directories(src src/concurrent) include_directories(ThirdParty/rapidjson/include) include_directories(ThirdParty/googletest/googletest) diff --git a/README.md b/README.md index 2c8c171..8a20910 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # SlabHash -A warp-oriented dynamic hash table for GPUs +A warp-oriented dynamic hash table for GPUs. + +This fork of SlabHash is modified to allow for dynamic growth of the slab pool, allowing the slab hash to increase its total memory footprint as keys are inserted during run time. ## Publication: This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper: @@ -489,4 +491,4 @@ init lf final lf num buckets init build rate(M/s) concurre 0.59 0.58 80660 501.725 639.850 0.64 0.62 69906 493.702 601.822 -``` \ No newline at end of file +``` diff --git a/SlabAlloc b/SlabAlloc index d655028..13983dd 160000 --- a/SlabAlloc +++ b/SlabAlloc @@ -1 +1 @@ -Subproject commit d6550289344cd4474d237e959bfd6bc962981af7 +Subproject commit 13983dd0f92d2ef9b9079651d9a28ffe6da74a43 diff --git a/cub b/cub new file mode 160000 index 0000000..c3be9a9 --- /dev/null +++ b/cub @@ -0,0 +1 @@ +Subproject commit c3be9a94273b5049520aacc7db00c738668aaa3f diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh index 1329740..bbd74b1 100644 --- a/src/concurrent_map/cmap_class.cuh +++ b/src/concurrent_map/cmap_class.cuh @@ -22,8 +22,9 @@ * used at runtime. 
This class does not own the allocated memory on the gpu * (i.e., d_table_) */ -template -class GpuSlabHashContext { +template +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -31,12 +32,16 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext() - : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) {} + : num_buckets_(0), total_num_slabs_(0), total_num_keys_(0), + hash_x_(0), hash_y_(0), d_table_(nullptr) {} #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( - GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); + total_num_slabs_ = rhs.getTotalNumSlabs(); + total_num_keys_ = rhs.getTotalNumKeys(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); d_table_ = rhs.getDeviceTablePointer(); @@ -58,16 +63,30 @@ class GpuSlabHashContext { const uint32_t hash_x, const uint32_t hash_y, int8_t* d_table, - AllocatorContextT* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; d_table_ = reinterpret_cast::SlabTypeT*>(d_table); global_allocator_ctx_ = *allocator_ctx; + total_num_slabs_ = num_buckets + global_allocator_ctx_.getNumSlabsInPool(); + total_num_keys_ = 0; } - __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { + __host__ void updateAllocatorContext(SlabAllocLightContext* allocator_ctx) { + global_allocator_ctx_ = *allocator_ctx; + total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool(); + } + + __host__ void updateTotalNumKeys(uint32_t keysAdded) { + total_num_keys_ += keysAdded; + } + + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -76,6 +95,8 @@ class GpuSlabHashContext { return d_table_; } + __device__ __host__ __forceinline__ uint32_t getTotalNumSlabs() {return total_num_slabs_; } + __device__ __host__ __forceinline__ uint32_t getTotalNumKeys() {return total_num_keys_; } __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; } __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; } __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; } @@ -86,22 +107,25 @@ class GpuSlabHashContext { // threads in a warp cooperate with each other to insert key-value pairs // into the slab hash - __device__ __forceinline__ void insertPair(bool& to_be_inserted, + __device__ __forceinline__ void insertPair(/*bool& mySuccess,*/ + bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to insert a unique key (and its value) // into the slab hash __device__ __forceinline__ bool insertPairUnique( + int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& local_allocator_context); // threads in a warp cooperate with each other to search for keys // if found, it returns the corresponding value, else SEARCH_NOT_FOUND @@ -154,7 +178,8 @@ class GpuSlabHashContext { } __device__ __forceinline__ SlabAllocAddressT - allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& + 
local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -165,18 +190,20 @@ class GpuSlabHashContext { // === members: uint32_t num_buckets_; + uint32_t total_num_slabs_; + uint32_t total_num_keys_; uint32_t hash_x_; uint32_t hash_y_; typename ConcurrentMapT::SlabTypeT* d_table_; // a copy of dynamic allocator's context to be used on the GPU - AllocatorContextT global_allocator_ctx_; + SlabAllocLightContext global_allocator_ctx_; }; /* * This class owns the allocated memory for the hash table */ -template -class GpuSlabHash { +template +class GpuSlabHash { private: // fixed known parameters: static constexpr uint32_t BLOCKSIZE_ = 128; @@ -198,24 +225,29 @@ class GpuSlabHash { // slab hash context, contains everything that a GPU application needs to be // able to use this data structure - GpuSlabHashContext gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. The allocator itself is not owned by this class - DynamicAllocatorT* dynamic_allocator_; + SlabAllocLight *dynamic_allocator_; uint32_t device_idx_; + float thresh_lf_; + public: GpuSlabHash(const uint32_t num_buckets, - DynamicAllocatorT* dynamic_allocator, + SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, - const bool identity_hash = false) + const bool identity_hash = false, + float thresh_lf = 0.60) : num_buckets_(num_buckets) , d_table_(nullptr) , slab_unit_size_(0) , dynamic_allocator_(dynamic_allocator) - , device_idx_(device_idx) { + , device_idx_(device_idx) + , thresh_lf_(thresh_lf) { assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash."); assert(sizeof(typename ConcurrentMapT::SlabTypeT) == (WARP_WIDTH_ * sizeof(uint32_t)) && @@ -227,7 +259,8 @@ class GpuSlabHash { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); slab_unit_size_ = - GpuSlabHashContext::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); @@ -258,7 +291,9 @@ class GpuSlabHash { // returns some debug information about the slab hash std::string to_string(); double computeLoadFactor(int flag); - + + void resize(); + uint32_t checkForPreemptiveResize(uint32_t keysAdded); void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys); void buildBulkWithUniqueKeys(KeyT* d_key, ValueT* d_value, uint32_t num_keys); void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries); diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh index 99880a8..80890a0 100644 --- a/src/concurrent_map/cmap_implementation.cuh +++ b/src/concurrent_map/cmap_implementation.cuh @@ -16,30 +16,84 @@ #pragma once -template -void GpuSlabHash::buildBulk( +template +void GpuSlabHash::resize() { + dynamic_allocator_->growPool(); + gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr()); +} + +template +uint32_t GpuSlabHash::checkForPreemptiveResize(uint32_t keysAdded) { + auto numSlabs = gpu_context_.getTotalNumSlabs(); + + auto capacity = numSlabs * 16; // capacity in key-value size multiples + auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded; + auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity; + auto numResizes = 0; + + if(finalSlabLoadFactor >= thresh_lf_) { + numResizes = 1; + } + + return numResizes; +} + +template +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t 
num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - // calling the kernel for bulk build: CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + + // calling the kernel for bulk build: build_table_kernel <<>>(d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // now that the bulk insert has completed successfully, we can + // update the total number of keys in the table + gpu_context_.updateTotalNumKeys(num_keys); } -template -void GpuSlabHash::buildBulkWithUniqueKeys( - KeyT* d_key, - ValueT* d_value, - uint32_t num_keys) { + +template +void GpuSlabHash::buildBulkWithUniqueKeys( + KeyT* d_key, + ValueT* d_value, + uint32_t num_keys) { + const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_; - // calling the kernel for bulk build: CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); + auto numResizes = checkForPreemptiveResize(num_keys); + for(auto i = 0; i < numResizes; ++i) { + resize(); + } + + // calling the kernel for bulk build: + int *num_successes; + CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int))); + *num_successes = 0; + build_table_with_unique_keys_kernel - <<>>(d_key, d_value, num_keys, gpu_context_); + <<>>(num_successes, d_key, d_value, num_keys, gpu_context_); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // now that the bulk insert has completed successfully, we can + // update the total number of keys in the table + gpu_context_.updateTotalNumKeys(*num_successes); } -template -void GpuSlabHash::searchIndividual( + +template +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -49,8 +103,9 @@ void GpuSlabHash::searchIndividual( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -void GpuSlabHash::searchBulk( +template +void GpuSlabHash::searchBulk( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -60,8 +115,9 @@ void GpuSlabHash::searchBulk( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -void GpuSlabHash::countIndividual( +template +void GpuSlabHash::countIndividual( KeyT* d_query, uint32_t* d_count, uint32_t num_queries) { @@ -71,8 +127,9 @@ void GpuSlabHash::countIndividual( <<>>(d_query, d_count, num_queries, gpu_context_); } -template -void GpuSlabHash::deleteIndividual( +template +void GpuSlabHash::deleteIndividual( KeyT* d_key, uint32_t num_keys) { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); @@ -82,8 +139,9 @@ void GpuSlabHash::deleteIndividual( } // perform a batch of (a mixture of) updates/searches -template -void GpuSlabHash::batchedOperation( +template +void GpuSlabHash::batchedOperation( KeyT* d_key, ValueT* d_result, uint32_t num_ops) { @@ -93,8 +151,9 @@ void GpuSlabHash::batchedOperation( <<>>(d_key, d_result, num_ops, gpu_context_); } -template -std::string GpuSlabHash::to_string() { +template +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; @@ -108,8 +167,9 @@ std::string GpuSlabHash::to_string() return result; } -template -double GpuSlabHash::computeLoadFactor( +template +double GpuSlabHash::computeLoadFactor( int flag = 0) { uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_]; uint32_t* d_bucket_pairs_count; diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh index f48b5e5..3530d21 100644 --- 
a/src/concurrent_map/device/build.cuh +++ b/src/concurrent_map/device/build.cuh @@ -18,12 +18,13 @@ /* * */ -template +template __global__ void build_table_kernel( KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -31,7 +32,8 @@ __global__ void build_table_kernel( return; } - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext + local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -49,25 +51,29 @@ __global__ void build_table_kernel( slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); } -template +template __global__ void build_table_with_unique_keys_kernel( + int *num_successes, KeyT* d_key, ValueT* d_value, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; - if ((tid - laneId) >= num_keys) { - return; - } - - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext + local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; ValueT myValue = 0; uint32_t myBucket = 0; + int mySuccess = 0; bool to_insert = false; if (tid < num_keys) { @@ -77,6 +83,13 @@ __global__ void build_table_with_unique_keys_kernel( to_insert = true; } - slab_hash.insertPairUnique( - to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + if ((tid - laneId) < num_keys) { + slab_hash.insertPairUnique(mySuccess, + to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx); + } + + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(mySuccess); + if(threadIdx.x == 0) { + atomicAdd(num_successes, block_num_successes); + } } \ No newline at end of file diff --git a/src/concurrent_map/device/concurrent_kernel.cuh b/src/concurrent_map/device/concurrent_kernel.cuh index d72c546..8cd7626 100644 --- a/src/concurrent_map/device/concurrent_kernel.cuh +++ b/src/concurrent_map/device/concurrent_kernel.cuh @@ -16,12 +16,12 @@ #pragma once -template +template __global__ void batched_operations( uint32_t* d_operations, uint32_t* d_results, uint32_t num_operations, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -29,7 +29,7 @@ __global__ void batched_operations( return; // initializing the memory allocator on each warp: - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); uint32_t myOperation = 0; diff --git a/src/concurrent_map/device/count_kernel.cuh b/src/concurrent_map/device/count_kernel.cuh index 99fffba..d71da3d 100644 --- a/src/concurrent_map/device/count_kernel.cuh +++ b/src/concurrent_map/device/count_kernel.cuh @@ -16,12 +16,12 @@ #pragma once -template +template __global__ void count_key( KeyT* d_queries, uint32_t* d_counts, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; 
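The resize machinery added above in cmap_implementation.cuh reduces to a projected-load-factor check: total_num_slabs_ counts the base bucket slabs plus the slabs currently in the allocator pool, capacity is approximated as 16 key-value-sized slots per 128-byte slab, and the pool is grown once via the allocator's growPool() whenever the load factor projected after a bulk insert would reach thresh_lf_ (0.60 by default). The host-side sketch below restates that heuristic with a hypothetical free function; it illustrates the policy and is not the fork's exact API.

// Sketch of the decision made by checkForPreemptiveResize(). Assumption: each
// 128-byte slab is counted as 16 key-value slots, matching the "capacity in
// key-value size multiples" comment in the diff above.
#include <cstdint>

inline uint32_t check_preemptive_resize(uint32_t total_num_slabs,  // buckets + pool slabs
                                        uint32_t total_num_keys,   // keys already stored
                                        uint32_t keys_added,       // size of incoming batch
                                        float thresh_lf) {         // e.g. 0.60
  const float capacity = static_cast<float>(total_num_slabs) * 16.0f;
  const float projected_lf =
      static_cast<float>(total_num_keys + keys_added) / capacity;
  // One growPool() round is requested if the projected load factor crosses the
  // threshold; otherwise no resize happens before the bulk insert.
  return projected_lf >= thresh_lf ? 1u : 0u;
}

buildBulk() and buildBulkWithUniqueKeys() run this check before launching their kernels, call resize() the indicated number of times, and resize() in turn calls dynamic_allocator_->growPool() and refreshes the device-visible context with updateAllocatorContext(), so that subsequent warpAllocate() calls can draw from the enlarged pool.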
diff --git a/src/concurrent_map/device/delete_kernel.cuh b/src/concurrent_map/device/delete_kernel.cuh index 9542623..edc47cc 100644 --- a/src/concurrent_map/device/delete_kernel.cuh +++ b/src/concurrent_map/device/delete_kernel.cuh @@ -16,11 +16,11 @@ #pragma once -template +template __global__ void delete_table_keys( KeyT* d_key_deleted, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/device/misc_kernels.cuh b/src/concurrent_map/device/misc_kernels.cuh index 5f9e5b8..0dfef23 100644 --- a/src/concurrent_map/device/misc_kernels.cuh +++ b/src/concurrent_map/device/misc_kernels.cuh @@ -21,9 +21,9 @@ * slabs per bucket. The final results per bucket is stored in d_pairs_count_result and * d_slabs_count_result arrays respectively */ -template +template __global__ void bucket_count_kernel( - GpuSlabHashContext slab_hash, + GpuSlabHashContext slab_hash, uint32_t* d_pairs_count_result, uint32_t* d_slabs_count_result, uint32_t num_buckets) { diff --git a/src/concurrent_map/device/search_kernel.cuh b/src/concurrent_map/device/search_kernel.cuh index c5f38e6..e01ec56 100644 --- a/src/concurrent_map/device/search_kernel.cuh +++ b/src/concurrent_map/device/search_kernel.cuh @@ -17,12 +17,12 @@ #pragma once //=== Individual search kernel: -template +template __global__ void search_table( KeyT* d_queries, ValueT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -49,12 +49,12 @@ __global__ void search_table( } //=== Bulk search kernel: -template +template __global__ void search_table_bulk( KeyT* d_queries, ValueT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_map/warp/count.cuh b/src/concurrent_map/warp/count.cuh index 4e9a70f..e3d0ede 100644 --- a/src/concurrent_map/warp/count.cuh +++ b/src/concurrent_map/warp/count.cuh @@ -19,9 +19,9 @@ //================================================ // Individual Count Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::countKey( +GpuSlabHashContext::countKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/warp/delete.cuh b/src/concurrent_map/warp/delete.cuh index d8d6685..dc65ced 100644 --- a/src/concurrent_map/warp/delete.cuh +++ b/src/concurrent_map/warp/delete.cuh @@ -16,9 +16,9 @@ #pragma once -template +template __device__ __forceinline__ bool -GpuSlabHashContext::deleteKey( +GpuSlabHashContext::deleteKey( bool& to_be_deleted, const uint32_t& laneId, const KeyT& myKey, diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh index 8653ddf..2a88bd7 100644 --- a/src/concurrent_map/warp/insert.cuh +++ b/src/concurrent_map/warp/insert.cuh @@ -21,15 +21,16 @@ * it is assumed all threads within a warp are present and collaborating with * each other with a warp-cooperative work sharing (WCWS) strategy. 
*/ -template +template __device__ __forceinline__ void -GpuSlabHashContext::insertPair( +GpuSlabHashContext::insertPair( + /*bool& mySuccess,*/ bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -54,6 +55,11 @@ GpuSlabHashContext::insertPair( if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); + if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased + //mySuccess = false; // signal that this key needs to be reinserted + to_be_inserted = false; + continue; + } // TODO: experiment if it's better to use lane 0 instead if (laneId == 31) { @@ -87,8 +93,10 @@ GpuSlabHashContext::insertPair( << 32) | *reinterpret_cast( reinterpret_cast(&myKey))); - if (old_key_value_pair == EMPTY_PAIR_64) - to_be_inserted = false; // succesfful insertion + if (old_key_value_pair == EMPTY_PAIR_64) { + //mySuccess = true; + to_be_inserted = false; // successful insertion + } } } last_work_queue = work_queue; @@ -102,15 +110,17 @@ GpuSlabHashContext::insertPair( * each other with a warp-cooperative work sharing (WCWS) strategy. * returns true only if a new key was inserted into the hash table */ -template +template __device__ __forceinline__ bool -GpuSlabHashContext::insertPairUnique( +GpuSlabHashContext::insertPairUnique( + int& mySuccess, bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const ValueT& myValue, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { + using SlabHashT = ConcurrentMapT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -135,14 +145,19 @@ GpuSlabHashContext::insertPairUnique uint32_t isExisting = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) & SlabHashT::REGULAR_NODE_KEY_MASK; if (isExisting) { // key exist in the hash table - if (laneId == src_lane) + if (laneId == src_lane) { to_be_inserted = false; + } } else { if (isEmpty == 0) { // no empty slot available: uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32); if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) { // allocate a new node: uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId); + if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased + to_be_inserted = false; + continue; + } if (laneId == 31) { const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER) @@ -176,6 +191,7 @@ GpuSlabHashContext::insertPairUnique *reinterpret_cast( reinterpret_cast(&myKey))); if (old_key_value_pair == EMPTY_PAIR_64) { + mySuccess += 1; to_be_inserted = false; // successful insertion new_insertion = true; } diff --git a/src/concurrent_map/warp/search.cuh b/src/concurrent_map/warp/search.cuh index de490de..0bb39be 100644 --- a/src/concurrent_map/warp/search.cuh +++ b/src/concurrent_map/warp/search.cuh @@ -19,9 +19,9 @@ //================================================ // Individual Search Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::searchKey( +GpuSlabHashContext::searchKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, @@ -73,9 +73,9 @@ GpuSlabHashContext::searchKey( 
//================================================ // Bulk Search Unit: //================================================ -template +template __device__ __forceinline__ void -GpuSlabHashContext::searchKeyBulk( +GpuSlabHashContext::searchKeyBulk( const uint32_t& laneId, const KeyT& myKey, ValueT& myValue, diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh index 3a0fd6a..518d556 100644 --- a/src/concurrent_set/cset_class.cuh +++ b/src/concurrent_set/cset_class.cuh @@ -21,8 +21,9 @@ * used at runtime. This class does not own the allocated memory on the gpu * (i.e., d_table_) */ -template -class GpuSlabHashContext { +template +class GpuSlabHashContext { public: // fixed known parameters: static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u; @@ -36,7 +37,8 @@ class GpuSlabHashContext { #pragma hd_warning_disable __host__ __device__ GpuSlabHashContext( - GpuSlabHashContext& rhs) { + GpuSlabHashContext& rhs) { num_buckets_ = rhs.getNumBuckets(); hash_x_ = rhs.getHashX(); hash_y_ = rhs.getHashY(); @@ -57,7 +59,8 @@ class GpuSlabHashContext { const uint32_t hash_x, const uint32_t hash_y, int8_t* d_table, - AllocatorContextT* allocator_ctx) { + SlabAllocLightContext* allocator_ctx) { num_buckets_ = num_buckets; hash_x_ = hash_x; hash_y_ = hash_y; @@ -65,7 +68,8 @@ class GpuSlabHashContext { global_allocator_ctx_ = *allocator_ctx; } - __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() { + __device__ __host__ __forceinline__ SlabAllocLightContext& getAllocatorContext() { return global_allocator_ctx_; } @@ -88,7 +92,8 @@ class GpuSlabHashContext { const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id, - AllocatorContextT& local_allocator_context); + SlabAllocLightContext& + local_allocator_context); // threads in a warp cooeparte with each other to search for keys // if found, it returns the true, else false @@ -124,7 +129,8 @@ class GpuSlabHashContext { } __device__ __forceinline__ SlabAllocAddressT - allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) { + allocateSlab(SlabAllocLightContext& + local_allocator_ctx, const uint32_t& laneId) { return local_allocator_ctx.warpAllocate(laneId); } @@ -139,14 +145,14 @@ class GpuSlabHashContext { uint32_t hash_y_; typename ConcurrentSetT::SlabTypeT* d_table_; // a copy of dynamic allocator's context to be used on the GPU - AllocatorContextT global_allocator_ctx_; + SlabAllocLightContext global_allocator_ctx_; }; /* * This class owns the allocated memory for the hash table */ -template -class GpuSlabHash { +template +class GpuSlabHash { private: // fixed known parameters: static constexpr uint32_t BLOCKSIZE_ = 128; @@ -168,19 +174,21 @@ class GpuSlabHash { // slab hash context, contains everything that a GPU application needs to be // able to use this data structure - GpuSlabHashContext gpu_context_; + GpuSlabHashContext gpu_context_; // const pointer to an allocator that all instances of slab hash are going to // use. 
The allocator itself is not owned by this class - DynamicAllocatorT* dynamic_allocator_; + SlabAllocLight* dynamic_allocator_; uint32_t device_idx_; public: GpuSlabHash(const uint32_t num_buckets, - DynamicAllocatorT* dynamic_allocator, + SlabAllocLight* dynamic_allocator, uint32_t device_idx, const time_t seed = 0, - const bool identity_hash = false) + const bool identity_hash = false, + float thresh_lf = 0.60) : num_buckets_(num_buckets) , d_table_(nullptr) , slab_unit_size_(0) @@ -197,7 +205,8 @@ class GpuSlabHash { CHECK_CUDA_ERROR(cudaSetDevice(device_idx_)); slab_unit_size_ = - GpuSlabHashContext::getSlabUnitSize(); + GpuSlabHashContext::getSlabUnitSize(); // allocating initial buckets: CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_)); @@ -228,7 +237,8 @@ class GpuSlabHash { // returns some debug information about the slab hash std::string to_string(); double computeLoadFactor(int flag) {} - GpuSlabHashContext& getSlabHashContext() { + GpuSlabHashContext& getSlabHashContext() { return gpu_context_; } diff --git a/src/concurrent_set/cset_helper_kernels.cuh b/src/concurrent_set/cset_helper_kernels.cuh index 211f7f7..3c4c3a2 100644 --- a/src/concurrent_set/cset_helper_kernels.cuh +++ b/src/concurrent_set/cset_helper_kernels.cuh @@ -16,11 +16,11 @@ #pragma once namespace cset { -template +template __global__ void build_table_kernel( KeyT* d_key, uint32_t num_keys, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; @@ -29,7 +29,7 @@ __global__ void build_table_kernel( } // initializing the memory allocator on each warp: - AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext()); + SlabAllocLightContext local_allocator_ctx(slab_hash.getAllocatorContext()); local_allocator_ctx.initAllocator(tid, laneId); KeyT myKey = 0; @@ -46,12 +46,12 @@ __global__ void build_table_kernel( } //=== Individual search kernel: -template +template __global__ void search_table( KeyT* d_queries, KeyT* d_results, uint32_t num_queries, - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t laneId = threadIdx.x & 0x1F; diff --git a/src/concurrent_set/cset_implementation.cuh b/src/concurrent_set/cset_implementation.cuh index d189054..dfa4375 100644 --- a/src/concurrent_set/cset_implementation.cuh +++ b/src/concurrent_set/cset_implementation.cuh @@ -16,8 +16,8 @@ #pragma once -template -void GpuSlabHash::buildBulk( +template +void GpuSlabHash::buildBulk( KeyT* d_key, ValueT* d_value, uint32_t num_keys) { @@ -28,8 +28,8 @@ void GpuSlabHash::buildBulk( <<>>(d_key, num_keys, gpu_context_); } -template -void GpuSlabHash::searchIndividual( +template +void GpuSlabHash::searchIndividual( KeyT* d_query, ValueT* d_result, uint32_t num_queries) { @@ -39,8 +39,8 @@ void GpuSlabHash::searchIndividual( <<>>(d_query, d_result, num_queries, gpu_context_); } -template -std::string GpuSlabHash::to_string() { +template +std::string GpuSlabHash::to_string() { std::string result; result += " ==== GpuSlabHash: \n"; result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n"; diff --git a/src/concurrent_set/cset_warp_operations.cuh b/src/concurrent_set/cset_warp_operations.cuh index 5211c75..3b1240a 100644 --- a/src/concurrent_set/cset_warp_operations.cuh +++ b/src/concurrent_set/cset_warp_operations.cuh @@ -16,14 +16,14 @@ #pragma once -template +template __device__ __forceinline__ bool 
-GpuSlabHashContext::insertKey( +GpuSlabHashContext::insertKey( bool& to_be_inserted, const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id, - AllocatorContextT& local_allocator_ctx) { + SlabAllocLightContext& local_allocator_ctx) { using SlabHashT = ConcurrentSetT; uint32_t work_queue = 0; uint32_t last_work_queue = 0; @@ -91,9 +91,9 @@ GpuSlabHashContext::insertKey( } // ======== -template +template __device__ __forceinline__ bool -GpuSlabHashContext::searchKey( +GpuSlabHashContext::searchKey( bool& to_be_searched, const uint32_t& laneId, const KeyT& myKey, @@ -138,9 +138,9 @@ GpuSlabHashContext::searchKey( return myResult; } -template +template __device__ __forceinline__ bool -GpuSlabHashContext::searchKeyBulk( +GpuSlabHashContext::searchKeyBulk( const uint32_t& laneId, const KeyT& myKey, const uint32_t bucket_id) {} \ No newline at end of file diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh index 167245e..14de391 100644 --- a/src/gpu_hash_table.cuh +++ b/src/gpu_hash_table.cuh @@ -21,7 +21,8 @@ * This class acts as a helper class to simplify simulations around different * kinds of slab hash implementations */ -template +template class gpu_hash_table { private: uint32_t max_keys_; @@ -32,10 +33,13 @@ class gpu_hash_table { public: // Slab hash invariant - GpuSlabHash* slab_hash_; + GpuSlabHash* slab_hash_; + + SlabAllocLight* dynamic_allocator_; // the dynamic allocator that is being used for slab hash - DynamicAllocatorT* dynamic_allocator_; + //DynamicAllocatorT* dynamic_allocator_; + uint32_t device_idx_; @@ -46,16 +50,20 @@ class gpu_hash_table { ValueT* d_result_; uint32_t* d_count_; + float thresh_lf_; + gpu_hash_table(uint32_t max_keys, uint32_t num_buckets, const uint32_t device_idx, const int64_t seed, const bool req_values = true, const bool identity_hash = false, - const bool verbose = false) + const bool verbose = false, + float thresh_lf = 0.60) : max_keys_(max_keys) , num_buckets_(num_buckets) , seed_(seed) + , thresh_lf_(thresh_lf) , req_values_(req_values) , slab_hash_(nullptr) , identity_hash_(identity_hash) @@ -74,14 +82,14 @@ class gpu_hash_table { } CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_)); CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_)); - CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); - + //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_)); + // allocate an initialize the allocator: - dynamic_allocator_ = new DynamicAllocatorT(); + dynamic_allocator_ = new SlabAllocLight(num_buckets); // slab hash: - slab_hash_ = new GpuSlabHash( - num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_); + slab_hash_ = new GpuSlabHash( + num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_, thresh_lf_); if (verbose) { std::cout << slab_hash_->to_string() << std::endl; } @@ -95,7 +103,7 @@ class gpu_hash_table { } CHECK_CUDA_ERROR(cudaFree(d_query_)); CHECK_CUDA_ERROR(cudaFree(d_result_)); - CHECK_CUDA_ERROR(cudaFree(d_count_)); + //CHECK_CUDA_ERROR(cudaFree(d_count_)); // delete the dynamic allocator: delete dynamic_allocator_; diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh index 81c9d17..5ee1ea1 100644 --- a/src/slab_hash_global.cuh +++ b/src/slab_hash_global.cuh @@ -17,6 +17,7 @@ #pragma once #include "slab_alloc.cuh" +#include "cub/cub.cuh" #define CHECK_CUDA_ERROR(call) \ do { \ @@ -118,17 +119,17 @@ class PhaseConcurrentMapT { }; // the main class to be specialized for different 
types of hash tables -template +template class GpuSlabHash; -template +template class GpuSlabHashContext; // The custom allocator that is being used for this code: // this might need to be a template paramater itself namespace slab_alloc_par { -constexpr uint32_t log_num_mem_blocks = 8; -constexpr uint32_t num_super_blocks = 32; +constexpr uint32_t log_num_mem_blocks = 9; // 64 MB +constexpr uint32_t num_super_blocks = 15; constexpr uint32_t num_replicas = 1; } // namespace slab_alloc_par diff --git a/src/slab_iterator.cuh b/src/slab_iterator.cuh index 3cd0586..35199e6 100644 --- a/src/slab_iterator.cuh +++ b/src/slab_iterator.cuh @@ -19,12 +19,12 @@ // a forward iterator for the slab hash data structure: // currently just specialized for concurrent set // TODO implement for other types -template +template class SlabIterator { public: using SlabHashT = ConcurrentSetT; - GpuSlabHashContext& slab_hash_; + GpuSlabHashContext& slab_hash_; // current position of the iterator KeyT* cur_ptr_; @@ -35,7 +35,7 @@ class SlabIterator { // initialize the iterator with the first bucket's pointer address of the slab // hash __host__ __device__ - SlabIterator(GpuSlabHashContext& slab_hash) + SlabIterator(GpuSlabHashContext& slab_hash) : slab_hash_(slab_hash) , cur_ptr_(reinterpret_cast(slab_hash_.getDeviceTablePointer())) , cur_size_(slab_hash_.getNumBuckets() * SlabHashT::BASE_UNIT_SIZE) diff --git a/test/iterator_test.cu b/test/iterator_test.cu index bb0461f..7b5b9ee 100644 --- a/test/iterator_test.cu +++ b/test/iterator_test.cu @@ -28,9 +28,9 @@ #define DEVICE_ID 0 //======================================= -template +template __global__ void print_table( - GpuSlabHashContext slab_hash) { + GpuSlabHashContext slab_hash) { uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t wid = tid >> 5; uint32_t laneId = threadIdx.x & 0x1F; @@ -44,7 +44,7 @@ __global__ void print_table( if (tid == 0) { printf(" == Printing the base array\n"); - SlabIterator iter(slab_hash); + SlabIterator iter(slab_hash); for (int i = 0; i < iter.cur_size_; i++) { if ((i & 0x1F) == 0) printf(" == bucket %d:\n", i >> 5);
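Two behavioral changes in the hunks above are worth spelling out. First, build_table_with_unique_keys_kernel now counts successful insertions: the early return is replaced by a (tid - laneId) < num_keys guard so that every thread in the block reaches the cub::BlockReduce collective, each thread's mySuccess flag is summed per block, and thread 0 atomically adds the block total to a managed num_successes counter that buildBulkWithUniqueKeys() reads after cudaDeviceSynchronize() and passes to updateTotalNumKeys(). Second, both insertPair() and insertPairUnique() now treat a warpAllocate() return value of 0xFFFFFFFF as an exhausted pool and clear to_be_inserted for the affected key; the commented-out mySuccess lines in insertPair() ("signal that this key needs to be reinserted") suggest re-insertion signalling is intended as follow-up work. The standalone CUDA sketch below isolates the block-level counting pattern; the kernel, buffer names, and block size are illustrative stand-ins, not part of the SlabHash sources.

// Block-wide success counting with cub::BlockReduce, mirroring the pattern the
// diff adds to build_table_with_unique_keys_kernel. Hypothetical names throughout.
#include <cub/cub.cuh>
#include <cstdio>

constexpr int kBlockSize = 128;  // matches GpuSlabHash::BLOCKSIZE_

__global__ void count_successes(const int* d_flags, int n, int* d_num_successes) {
  typedef cub::BlockReduce<int, kBlockSize> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Stand-in for the per-thread mySuccess flag produced by insertPairUnique().
  const int my_success = (tid < n) ? d_flags[tid] : 0;

  // Collective operation: every thread of the block must participate, which is
  // why the kernel above guards the insert call instead of returning early.
  const int block_sum = BlockReduce(temp_storage).Sum(my_success);
  if (threadIdx.x == 0) {
    atomicAdd(d_num_successes, block_sum);  // one atomic per block
  }
}

int main() {
  const int n = 1000;
  int *d_flags = nullptr, *d_total = nullptr;
  cudaMallocManaged(&d_flags, n * sizeof(int));
  cudaMallocManaged(&d_total, sizeof(int));
  for (int i = 0; i < n; ++i) d_flags[i] = 1;  // pretend every insertion succeeded
  *d_total = 0;

  count_successes<<<(n + kBlockSize - 1) / kBlockSize, kBlockSize>>>(d_flags, n, d_total);
  cudaDeviceSynchronize();
  printf("successes: %d (expected %d)\n", *d_total, n);

  cudaFree(d_flags);
  cudaFree(d_total);
  return 0;
}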