From 5b20e9776ef559d59f35857c4da08005f1f5cf9e Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Thu, 11 Jun 2020 09:28:43 -0700
Subject: [PATCH 01/61] updated SlabAlloc to place super blocks on separate
 arrays

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index d655028..4e27bf1 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit d6550289344cd4474d237e959bfd6bc962981af7
+Subproject commit 4e27bf1c0678a2bd60d5a05e0c020936ebf6c9cb

From 7da2ae7964c8996b6d2b718de7fcee6a91dcf23e Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Thu, 11 Jun 2020 16:33:20 -0700
Subject: [PATCH 02/61] buildBulk now reports which keys were successfully
 inserted

---
 SlabAlloc                                  |  2 +-
 src/concurrent_map/cmap_class.cuh          |  7 +++-
 src/concurrent_map/cmap_implementation.cuh | 44 ++++++++++++++++++++--
 src/concurrent_map/device/build.cuh        | 10 ++++-
 src/concurrent_map/warp/insert.cuh         |  8 +++-
 5 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index 4e27bf1..e9464bb 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 4e27bf1c0678a2bd60d5a05e0c020936ebf6c9cb
+Subproject commit e9464bb6d019c1239f608c743fae67b407471bb8
diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index 1329740..8967d66 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -86,7 +86,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
   // threads in a warp cooperate with each other to insert key-value pairs
   // into the slab hash
-  __device__ __forceinline__ void insertPair(bool& to_be_inserted,
+  __device__ __forceinline__ void insertPair(bool& mySuccess,
+                                             bool& to_be_inserted,
                                              const uint32_t& laneId,
                                              const KeyT& myKey,
                                              const ValueT& myValue,
@@ -245,9 +246,13 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
       hf_ = {0u, 0u};
     }
 
+    std::cout << "Initializing context params " << std::endl;
+
     // initializing the gpu_context_:
     gpu_context_.initParameters(
         num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr());
+
+    std::cout << "Done initializing context params" << std::endl;
   }
 
   ~GpuSlabHash() {
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 99880a8..e4fa4ad 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -21,11 +21,47 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
+  
+  int h_retry = 1;
+  int *d_retry;
+  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
+  CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
+  bool* d_success;
+  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool)));
+  CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
+
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
-  // calling the kernel for bulk build:
-  CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
-  build_table_kernel<KeyT, ValueT>
-      <<<num_blocks, BLOCKSIZE_>>>(d_key, d_value, num_keys, gpu_context_);
+
+
+
+
+  bool *h_success = (bool*) malloc(num_keys * sizeof(bool));
+
+
+
+
+
+  while(h_retry) {
+    // calling the kernel for bulk build:
+    CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
+    build_table_kernel<KeyT, ValueT>
+        <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
+    CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
+
+    //
+    CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost));
+    for(auto i = 0; i < num_keys; ++i) {
+      if(h_success[i] == false) {
+        std::cout << "Key " << i << " evaluated to false" << std::endl;
+        break;
+      }
+    }
+
+    std::cout << "Evaluating need to resize" << std::endl;
+    // resize the pool here if necessary
+
+    CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
+  }
 }
 template <typename KeyT, typename ValueT>
 void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqueKeys(
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index f48b5e5..3e8895a 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -20,6 +20,8 @@
  */
 template <typename KeyT, typename ValueT>
 __global__ void build_table_kernel(
+    int* d_retry,
+    bool* d_success,
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
@@ -34,19 +36,23 @@ __global__ void build_table_kernel(
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
+  bool mySuccess = true;
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
+    mySuccess = d_success[tid];
     myKey = d_key[tid];
     myValue = d_value[tid];
     myBucket = slab_hash.computeBucket(myKey);
-    to_insert = true;
+    to_insert = !mySuccess;
   }
 
-  slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+  slab_hash.insertPair(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+  d_success[tid] = mySuccess;
+  atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
 }
 
 template <typename KeyT, typename ValueT>
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index 8653ddf..9b7fe0e 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -24,6 +24,7 @@
 template <typename KeyT, typename ValueT>
 __device__ __forceinline__ void
 GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
+    bool& mySuccess,
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
@@ -54,6 +55,10 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
       if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) {
         // allocate a new node:
         uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
+        if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
+          mySuccess = true; // signal that this key needs to be reinserted 
+          to_be_inserted = false;
+        }
 
         // TODO: experiment if it's better to use lane 0 instead
         if (laneId == 31) {
@@ -88,7 +93,8 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
                           *reinterpret_cast<const uint32_t*>(
                               reinterpret_cast<const unsigned char*>(&myKey)));
         if (old_key_value_pair == EMPTY_PAIR_64)
-          to_be_inserted = false;  // succesfful insertion
+          mySuccess = true;
+          to_be_inserted = false;  // successful insertion
       }
     }
     last_work_queue = work_queue;

From e5257c03e1661914a147a8981cbd72de2050172a Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Fri, 12 Jun 2020 08:19:29 -0700
Subject: [PATCH 03/61] buildBulk function can detect if allocation pool needs
 to be resized

---
 src/concurrent_map/cmap_implementation.cuh | 22 +++-------------------
 src/concurrent_map/device/build.cuh        |  7 +++++--
 src/concurrent_map/warp/insert.cuh         |  5 +++--
 3 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index e4fa4ad..cb68beb 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -31,38 +31,21 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
 
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
-
-
-
-
-  bool *h_success = (bool*) malloc(num_keys * sizeof(bool));
-
-
-
-
-
   while(h_retry) {
     // calling the kernel for bulk build:
     CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
     build_table_kernel<KeyT, ValueT>
         <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
 
-    //
-    CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost));
-    for(auto i = 0; i < num_keys; ++i) {
-      if(h_success[i] == false) {
-        std::cout << "Key " << i << " evaluated to false" << std::endl;
-        break;
-      }
-    }
-
     std::cout << "Evaluating need to resize" << std::endl;
     // resize the pool here if necessary
 
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
   }
 }
+
 template <typename KeyT, typename ValueT>
 void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqueKeys(
     KeyT* d_key,
@@ -74,6 +57,7 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   build_table_with_unique_keys_kernel<KeyT, ValueT>
       <<<num_blocks, BLOCKSIZE_>>>(d_key, d_value, num_keys, gpu_context_);
 }
+
 template <typename KeyT, typename ValueT>
 void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchIndividual(
     KeyT* d_query,
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index 3e8895a..50c236d 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -51,8 +51,11 @@ __global__ void build_table_kernel(
   }
 
   slab_hash.insertPair(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
-  d_success[tid] = mySuccess;
-  atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
+  
+  if (tid < num_keys) {
+    d_success[tid] = mySuccess;
+    atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
+  }
 }
 
 template <typename KeyT, typename ValueT>
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index 9b7fe0e..dfb127b 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -56,7 +56,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
         // allocate a new node:
         uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
         if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
-          mySuccess = true; // signal that this key needs to be reinserted 
+          mySuccess = false; // signal that this key needs to be reinserted 
           to_be_inserted = false;
         }
 
@@ -92,9 +92,10 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
                        << 32) |
                           *reinterpret_cast<const uint32_t*>(
                               reinterpret_cast<const unsigned char*>(&myKey)));
-        if (old_key_value_pair == EMPTY_PAIR_64)
+        if (old_key_value_pair == EMPTY_PAIR_64) {
           mySuccess = true;
           to_be_inserted = false;  // successful insertion
+        }
       }
     }
     last_work_queue = work_queue;

From a314c19c053cb4c8dfa97158773b07de094b87e0 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Fri, 12 Jun 2020 12:04:03 -0700
Subject: [PATCH 04/61] Added support for resizing dynamic allocation pool

---
 SlabAlloc                                  |  2 +-
 src/concurrent_map/cmap_class.cuh          |  4 ++++
 src/concurrent_map/cmap_implementation.cuh | 26 +++++++++++++++++++---
 src/concurrent_map/warp/insert.cuh         |  1 +
 src/slab_hash_global.cuh                   |  4 ++--
 5 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index e9464bb..ff0696a 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit e9464bb6d019c1239f608c743fae67b407471bb8
+Subproject commit ff0696a8d0b6829076495263dd2e1dd4b7866d28
diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index 8967d66..8bf3836 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -67,6 +67,10 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
     global_allocator_ctx_ = *allocator_ctx;
   }
 
+  __host__ void updateAllocatorContext(AllocatorContextT* allocator_ctx) {
+    global_allocator_ctx_ = *allocator_ctx;
+  }
+
   __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() {
     return global_allocator_ctx_;
   }
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index cb68beb..5844c38 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -21,7 +21,10 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
-  
+
+  bool *h_success = (bool*) malloc(sizeof(bool) * num_keys);     
+
+
   int h_retry = 1;
   int *d_retry;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
@@ -31,17 +34,34 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
 
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
+  CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
   while(h_retry) {
+    std::cout << "Calling build kernel" << std::endl;
     // calling the kernel for bulk build:
-    CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
     build_table_kernel<KeyT, ValueT>
         <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
+    
+    /*
+    CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost));
+
+    int num_false = 0;
+    for(auto i = 0; i < num_keys; ++i) {
+      if(h_success[i] == false) num_false++;
+    }
+
+    std::cout << "numfalse " << num_false << std::endl;
+    */
 
     std::cout << "Evaluating need to resize" << std::endl;
     // resize the pool here if necessary
-
+    if(h_retry) {
+      std::cout << "Resizing pool" << std::endl;
+      dynamic_allocator_->growPool();
+      gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
+      std::cout << "Done resizing pool" << std::endl;
+    }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
   }
 }
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index dfb127b..9955b42 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -58,6 +58,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
         if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
           mySuccess = false; // signal that this key needs to be reinserted 
           to_be_inserted = false;
+          continue;
         }
 
         // TODO: experiment if it's better to use lane 0 instead
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 81c9d17..ace129e 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -127,8 +127,8 @@ class GpuSlabHashContext;
 // The custom allocator that is being used for this code:
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
-constexpr uint32_t log_num_mem_blocks = 8;
-constexpr uint32_t num_super_blocks = 32;
+constexpr uint32_t log_num_mem_blocks = 12;
+constexpr uint32_t num_super_blocks = 2;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From 49161a2c282e3e8222716c30a4f9f127bfc8fbd4 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Fri, 12 Jun 2020 14:29:20 -0700
Subject: [PATCH 05/61] removed prints

---
 src/concurrent_map/cmap_class.cuh          | 4 ----
 src/concurrent_map/cmap_implementation.cuh | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index 8bf3836..c601c30 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -250,13 +250,9 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
       hf_ = {0u, 0u};
     }
 
-    std::cout << "Initializing context params " << std::endl;
-
     // initializing the gpu_context_:
     gpu_context_.initParameters(
         num_buckets_, hf_.x, hf_.y, d_table_, dynamic_allocator_->getContextPtr());
-
-    std::cout << "Done initializing context params" << std::endl;
   }
 
   ~GpuSlabHash() {
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 5844c38..d46f832 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -36,7 +36,6 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
   while(h_retry) {
-    std::cout << "Calling build kernel" << std::endl;
     // calling the kernel for bulk build:
     build_table_kernel<KeyT, ValueT>
         <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
@@ -54,13 +53,10 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     std::cout << "numfalse " << num_false << std::endl;
     */
 
-    std::cout << "Evaluating need to resize" << std::endl;
     // resize the pool here if necessary
     if(h_retry) {
-      std::cout << "Resizing pool" << std::endl;
       dynamic_allocator_->growPool();
       gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
-      std::cout << "Done resizing pool" << std::endl;
     }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
   }

From ff5ba413b625efc85d8d1c29c4d0ede0600352ee Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Mon, 15 Jun 2020 16:17:32 -0700
Subject: [PATCH 06/61] added pool growth support for unique bulk insert and
 temporarily removed some device allocations in gpu_hash_table for testing

---
 SlabAlloc                                  |  2 +-
 src/concurrent_map/cmap_class.cuh          |  1 +
 src/concurrent_map/cmap_implementation.cuh | 45 ++++++++++++++--------
 src/concurrent_map/device/build.cuh        | 13 ++++++-
 src/concurrent_map/warp/insert.cuh         | 11 +++++-
 src/gpu_hash_table.cuh                     | 14 +++----
 src/slab_hash_global.cuh                   |  2 +-
 7 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index ff0696a..36cfa6f 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit ff0696a8d0b6829076495263dd2e1dd4b7866d28
+Subproject commit 36cfa6f95cd47fd1825e59f51a5ca7d7192a4713
diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index c601c30..c0dae4b 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -101,6 +101,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
   // threads in a warp cooperate with each other to insert a unique key (and its value)
   // into the slab hash
   __device__ __forceinline__ bool insertPairUnique(
+      bool& mySuccess,
       bool& to_be_inserted,
       const uint32_t& laneId,
       const KeyT& myKey,
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index d46f832..0a9d9e1 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -21,10 +21,6 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
-
-  bool *h_success = (bool*) malloc(sizeof(bool) * num_keys);     
-
-
   int h_retry = 1;
   int *d_retry;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
@@ -42,17 +38,6 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
     
-    /*
-    CHECK_CUDA_ERROR(cudaMemcpy(h_success, d_success, num_keys * sizeof(bool), cudaMemcpyDeviceToHost));
-
-    int num_false = 0;
-    for(auto i = 0; i < num_keys; ++i) {
-      if(h_success[i] == false) num_false++;
-    }
-
-    std::cout << "numfalse " << num_false << std::endl;
-    */
-
     // resize the pool here if necessary
     if(h_retry) {
       dynamic_allocator_->growPool();
@@ -60,6 +45,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
   }
+  CHECK_CUDA_ERROR(cudaFree(d_retry));
+  CHECK_CUDA_ERROR(cudaFree(d_success));
 }
 
 template <typename KeyT, typename ValueT>
@@ -67,11 +54,39 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
+  int h_retry = 1;
+  int *d_retry;
+  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
+  CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
+  bool* d_success;
+  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool)));
+  CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
+
+  const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
+  CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
+  while(h_retry) {
+    // calling the kernel for bulk build:
+    build_table_with_unique_keys_kernel<KeyT, ValueT>
+        <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
+    CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
+    
+    // resize the pool here if necessary
+    if(h_retry) {
+      dynamic_allocator_->growPool();
+      gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
+    }
+    CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
+  }
+  CHECK_CUDA_ERROR(cudaFree(d_retry));
+  CHECK_CUDA_ERROR(cudaFree(d_success));
+  /*
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   // calling the kernel for bulk build:
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
   build_table_with_unique_keys_kernel<KeyT, ValueT>
       <<<num_blocks, BLOCKSIZE_>>>(d_key, d_value, num_keys, gpu_context_);
+  */
 }
 
 template <typename KeyT, typename ValueT>
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index 50c236d..f24b5cb 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -60,6 +60,8 @@ __global__ void build_table_kernel(
 
 template <typename KeyT, typename ValueT>
 __global__ void build_table_with_unique_keys_kernel(
+    int* d_retry,
+    bool* d_success,
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
@@ -74,18 +76,25 @@ __global__ void build_table_with_unique_keys_kernel(
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
+  bool mySuccess = true;
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
+    mySuccess = d_success[tid];
     myKey = d_key[tid];
     myValue = d_value[tid];
     myBucket = slab_hash.computeBucket(myKey);
-    to_insert = true;
+    to_insert = !mySuccess;
   }
 
-  slab_hash.insertPairUnique(
+  slab_hash.insertPairUnique(mySuccess,
       to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+
+  if (tid < num_keys) {
+    d_success[tid] = mySuccess;
+    atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
+  }
 }
\ No newline at end of file
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index 9955b42..4b94d81 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -113,6 +113,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
 template <typename KeyT, typename ValueT>
 __device__ __forceinline__ bool
 GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique(
+    bool& mySuccess,
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
@@ -143,14 +144,21 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
     uint32_t isExisting = (__ballot_sync(0xFFFFFFFF, src_unit_data == src_key)) &
                           SlabHashT::REGULAR_NODE_KEY_MASK;
     if (isExisting) {  // key exist in the hash table
-      if (laneId == src_lane)
+      if (laneId == src_lane) {
+        mySuccess = true;
         to_be_inserted = false;
+      }
     } else {
       if (isEmpty == 0) {  // no empty slot available:
         uint32_t next_ptr = __shfl_sync(0xFFFFFFFF, src_unit_data, 31, 32);
         if (next_ptr == SlabHashT::EMPTY_INDEX_POINTER) {
           // allocate a new node:
           uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
+          if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
+            mySuccess = false; // signal that this key needs to be reinserted 
+            to_be_inserted = false;
+            continue;
+          }
 
           if (laneId == 31) {
             const uint32_t* p = (next == SlabHashT::A_INDEX_POINTER)
@@ -184,6 +192,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                             *reinterpret_cast<const uint32_t*>(
                                 reinterpret_cast<const unsigned char*>(&myKey)));
           if (old_key_value_pair == EMPTY_PAIR_64) {
+            mySuccess = true;
             to_be_inserted = false;  // successful insertion
             new_insertion = true;
           }
diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh
index 167245e..b41a969 100644
--- a/src/gpu_hash_table.cuh
+++ b/src/gpu_hash_table.cuh
@@ -72,10 +72,10 @@ class gpu_hash_table {
     if (req_values_) {
       CHECK_CUDA_ERROR(cudaMalloc((void**)&d_value_, sizeof(ValueT) * max_keys_));
     }
-    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_));
-    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_));
-    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_));
-
+    //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_));
+    //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_));
+    //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_));
+    
     // allocate an initialize the allocator:
     dynamic_allocator_ = new DynamicAllocatorT();
 
@@ -93,9 +93,9 @@ class gpu_hash_table {
     if (req_values_) {
       CHECK_CUDA_ERROR(cudaFree(d_value_));
     }
-    CHECK_CUDA_ERROR(cudaFree(d_query_));
-    CHECK_CUDA_ERROR(cudaFree(d_result_));
-    CHECK_CUDA_ERROR(cudaFree(d_count_));
+    //CHECK_CUDA_ERROR(cudaFree(d_query_));
+    //CHECK_CUDA_ERROR(cudaFree(d_result_));
+    //CHECK_CUDA_ERROR(cudaFree(d_count_));
 
     // delete the dynamic allocator:
     delete dynamic_allocator_;
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index ace129e..38437a0 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -127,7 +127,7 @@ class GpuSlabHashContext;
 // The custom allocator that is being used for this code:
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
-constexpr uint32_t log_num_mem_blocks = 12;
+constexpr uint32_t log_num_mem_blocks = 11;
 constexpr uint32_t num_super_blocks = 2;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par

From 44ab2f5e57df9734f1fa2c806c550927c77ead84 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Tue, 16 Jun 2020 11:52:40 -0700
Subject: [PATCH 07/61] added support for preemptive pool growth

---
 SlabAlloc                                  |  2 +-
 src/concurrent_map/cmap_class.cuh          | 20 ++++++++-
 src/concurrent_map/cmap_implementation.cuh | 48 +++++++++++++++++-----
 src/gpu_hash_table.cuh                     |  8 ++--
 4 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index 36cfa6f..6abbedb 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 36cfa6f95cd47fd1825e59f51a5ca7d7192a4713
+Subproject commit 6abbedb680bca342863589afd152b12ced0c0fd1
diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index c0dae4b..6b4a9cc 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -31,12 +31,15 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
 #pragma hd_warning_disable
   __host__ __device__ GpuSlabHashContext()
-      : num_buckets_(0), hash_x_(0), hash_y_(0), d_table_(nullptr) {}
+      : num_buckets_(0), total_num_slabs_(0), total_num_keys_(0), 
+        hash_x_(0), hash_y_(0), d_table_(nullptr) {}
 
 #pragma hd_warning_disable
   __host__ __device__ GpuSlabHashContext(
       GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>& rhs) {
     num_buckets_ = rhs.getNumBuckets();
+    total_num_slabs_ = rhs.getTotalNumSlabs();
+    total_num_keys_ = rhs.getTotalNumKeys();
     hash_x_ = rhs.getHashX();
     hash_y_ = rhs.getHashY();
     d_table_ = rhs.getDeviceTablePointer();
@@ -65,10 +68,17 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
     d_table_ =
         reinterpret_cast<typename ConcurrentMapT<KeyT, ValueT>::SlabTypeT*>(d_table);
     global_allocator_ctx_ = *allocator_ctx;
+    total_num_slabs_ = num_buckets + global_allocator_ctx_.getNumSlabsInPool();
+    total_num_keys_ = 0;
   }
 
   __host__ void updateAllocatorContext(AllocatorContextT* allocator_ctx) {
     global_allocator_ctx_ = *allocator_ctx;
+    total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool();
+  }
+
+  __host__ void updateTotalNumKeys(uint32_t keysAdded) {
+    total_num_keys_ += keysAdded;
   }
 
   __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() {
@@ -80,6 +90,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
     return d_table_;
   }
 
+  __device__ __host__ __forceinline__ uint32_t getTotalNumSlabs() {return total_num_slabs_; }
+  __device__ __host__ __forceinline__ uint32_t getTotalNumKeys() {return total_num_keys_; }
   __device__ __host__ __forceinline__ uint32_t getNumBuckets() { return num_buckets_; }
   __device__ __host__ __forceinline__ uint32_t getHashX() { return hash_x_; }
   __device__ __host__ __forceinline__ uint32_t getHashY() { return hash_y_; }
@@ -171,6 +183,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
   // === members:
   uint32_t num_buckets_;
+  uint32_t total_num_slabs_;
+  uint32_t total_num_keys_;
   uint32_t hash_x_;
   uint32_t hash_y_;
   typename ConcurrentMapT<KeyT, ValueT>::SlabTypeT* d_table_;
@@ -264,7 +278,9 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
   // returns some debug information about the slab hash
   std::string to_string();
   double computeLoadFactor(int flag);
-
+ 
+  void resize();
+  uint32_t checkForPreemptiveResize(uint32_t keysAdded);
   void buildBulk(KeyT* d_key, ValueT* d_value, uint32_t num_keys);
   void buildBulkWithUniqueKeys(KeyT* d_key, ValueT* d_value, uint32_t num_keys);
   void searchIndividual(KeyT* d_query, ValueT* d_result, uint32_t num_queries);
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 0a9d9e1..30e155d 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -16,6 +16,30 @@
 
 #pragma once
 
+template <typename KeyT, typename ValueT>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
+  dynamic_allocator_->growPool();
+  gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
+}
+
+template <typename KeyT, typename ValueT>
+uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
+  auto numSlabs = gpu_context_.getTotalNumSlabs();
+  auto maxElemCapacity = numSlabs * 15;
+  auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
+  auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity;
+  auto numResizes = 0;
+
+  if(finalSlabLoadFactor > 0.85) {
+    numResizes = 1;
+  }
+  if(finalSlabLoadFactor > 1.95) {
+    numResizes = 2;
+  }
+
+  return numResizes;
+}
+
 template <typename KeyT, typename ValueT>
 void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     KeyT* d_key,
@@ -40,8 +64,7 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     
     // resize the pool here if necessary
     if(h_retry) {
-      dynamic_allocator_->growPool();
-      gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
+      resize();
     }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
   }
@@ -65,6 +88,13 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
   while(h_retry) {
+
+    // predict whether resizing will be necessary, and take preemptive action.
+    auto numResizes = checkForPreemptiveResize(num_keys);
+    for(auto i = 0; i < numResizes; ++i) {
+      resize();
+    }
+
     // calling the kernel for bulk build:
     build_table_with_unique_keys_kernel<KeyT, ValueT>
         <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
@@ -73,20 +103,16 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
     
     // resize the pool here if necessary
     if(h_retry) {
-      dynamic_allocator_->growPool();
-      gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
+      resize();
     }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
   }
   CHECK_CUDA_ERROR(cudaFree(d_retry));
   CHECK_CUDA_ERROR(cudaFree(d_success));
-  /*
-  const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
-  // calling the kernel for bulk build:
-  CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
-  build_table_with_unique_keys_kernel<KeyT, ValueT>
-      <<<num_blocks, BLOCKSIZE_>>>(d_key, d_value, num_keys, gpu_context_);
-  */
+
+  // now that the bulk insert has completed successfully, we can
+  // update the total number of keys in the table
+  gpu_context_.updateTotalNumKeys(num_keys);
 }
 
 template <typename KeyT, typename ValueT>
diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh
index b41a969..03930b0 100644
--- a/src/gpu_hash_table.cuh
+++ b/src/gpu_hash_table.cuh
@@ -72,8 +72,8 @@ class gpu_hash_table {
     if (req_values_) {
       CHECK_CUDA_ERROR(cudaMalloc((void**)&d_value_, sizeof(ValueT) * max_keys_));
     }
-    //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_));
-    //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_));
+    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_query_, sizeof(KeyT) * max_keys_));
+    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_result_, sizeof(ValueT) * max_keys_));
     //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_));
     
     // allocate an initialize the allocator:
@@ -93,8 +93,8 @@ class gpu_hash_table {
     if (req_values_) {
       CHECK_CUDA_ERROR(cudaFree(d_value_));
     }
-    //CHECK_CUDA_ERROR(cudaFree(d_query_));
-    //CHECK_CUDA_ERROR(cudaFree(d_result_));
+    CHECK_CUDA_ERROR(cudaFree(d_query_));
+    CHECK_CUDA_ERROR(cudaFree(d_result_));
     //CHECK_CUDA_ERROR(cudaFree(d_count_));
 
     // delete the dynamic allocator:

From 48b21dfa76d30bb18d4bf6a51308492e443e1253 Mon Sep 17 00:00:00 2001
From: Nico Iskos <niskos@niskos-dt.nvidia.com>
Date: Wed, 17 Jun 2020 09:13:13 -0700
Subject: [PATCH 08/61] added preemptive resizing for non unique build kernel

---
 src/concurrent_map/cmap_implementation.cuh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 30e155d..fc31fe7 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -56,6 +56,12 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
   while(h_retry) {
+    // predict whether resizing will be necessary, and take preemptive action.
+    auto numResizes = checkForPreemptiveResize(num_keys);
+    for(auto i = 0; i < numResizes; ++i) {
+      resize();
+    }
+
     // calling the kernel for bulk build:
     build_table_kernel<KeyT, ValueT>
         <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
@@ -70,6 +76,10 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   }
   CHECK_CUDA_ERROR(cudaFree(d_retry));
   CHECK_CUDA_ERROR(cudaFree(d_success));
+  
+  // now that the bulk insert has completed successfully, we can
+  // update the total number of keys in the table
+  gpu_context_.updateTotalNumKeys(num_keys);
 }
 
 template <typename KeyT, typename ValueT>
@@ -77,6 +87,7 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
+
   int h_retry = 1;
   int *d_retry;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
@@ -88,7 +99,6 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
   while(h_retry) {
-
     // predict whether resizing will be necessary, and take preemptive action.
     auto numResizes = checkForPreemptiveResize(num_keys);
     for(auto i = 0; i < numResizes; ++i) {

From 1978da6630771dde9693d7fd7d0f379ed68083cd Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sun, 27 Sep 2020 00:14:02 -0400
Subject: [PATCH 09/61] decreased suberblock size

---
 src/slab_hash_global.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 38437a0..e9f21aa 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -127,7 +127,7 @@ class GpuSlabHashContext;
 // The custom allocator that is being used for this code:
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
-constexpr uint32_t log_num_mem_blocks = 11;
+constexpr uint32_t log_num_mem_blocks = 10; // 128 MB
 constexpr uint32_t num_super_blocks = 2;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par

From 01c972b5f6a5c7b4e8baa5871d4157ba2d1cbf49 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Mon, 28 Dec 2020 10:24:15 -0500
Subject: [PATCH 10/61] updated gitmodules

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index fcfa184..1c94e86 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "SlabAlloc"]
 	path = SlabAlloc
-	url = https://github.com/owensgroup/SlabAlloc
+	url = https://github.com/Nicolas-Iskos/SlabAlloc
 [submodule "ThirdParty/rapidjson"]
 	path = ThirdParty/rapidjson
 	url = https://github.com/Tencent/rapidjson

From c3c9bccf834b5636b6d96b3b483d567d8cf4bda7 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Mon, 28 Dec 2020 14:07:49 -0500
Subject: [PATCH 11/61] removed explicit checks for need to resize

---
 src/concurrent_map/cmap_class.cuh          |  4 +--
 src/concurrent_map/cmap_implementation.cuh | 37 +++++++++++++---------
 src/concurrent_map/device/build.cuh        | 30 ++++++++++--------
 src/concurrent_map/warp/insert.cuh         | 14 ++++----
 4 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index 6b4a9cc..a66e98b 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -102,7 +102,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
   // threads in a warp cooperate with each other to insert key-value pairs
   // into the slab hash
-  __device__ __forceinline__ void insertPair(bool& mySuccess,
+  __device__ __forceinline__ void insertPair(/*bool& mySuccess,*/
                                              bool& to_be_inserted,
                                              const uint32_t& laneId,
                                              const KeyT& myKey,
@@ -113,7 +113,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
   // threads in a warp cooperate with each other to insert a unique key (and its value)
   // into the slab hash
   __device__ __forceinline__ bool insertPairUnique(
-      bool& mySuccess,
+      /*bool& mySuccess,*/
       bool& to_be_inserted,
       const uint32_t& laneId,
       const KeyT& myKey,
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index fc31fe7..af272f1 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -45,6 +45,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
+  
+  /*
   int h_retry = 1;
   int *d_retry;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
@@ -52,10 +54,10 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   bool* d_success;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool)));
   CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
-
+  */
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
-  while(h_retry) {
+  //while(h_retry) {
     // predict whether resizing will be necessary, and take preemptive action.
     auto numResizes = checkForPreemptiveResize(num_keys);
     for(auto i = 0; i < numResizes; ++i) {
@@ -64,18 +66,20 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
 
     // calling the kernel for bulk build:
     build_table_kernel<KeyT, ValueT>
-        <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
+        <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-    CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
+    //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
     
     // resize the pool here if necessary
+    /*
     if(h_retry) {
       resize();
     }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
-  }
-  CHECK_CUDA_ERROR(cudaFree(d_retry));
-  CHECK_CUDA_ERROR(cudaFree(d_success));
+    */
+  //}
+  //CHECK_CUDA_ERROR(cudaFree(d_retry));
+  //CHECK_CUDA_ERROR(cudaFree(d_success));
   
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
@@ -87,7 +91,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
-
+  
+  /*
   int h_retry = 1;
   int *d_retry;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
@@ -95,10 +100,10 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   bool* d_success;
   CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool)));
   CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
-
+  */
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
-  while(h_retry) {
+  //while(h_retry) {
     // predict whether resizing will be necessary, and take preemptive action.
     auto numResizes = checkForPreemptiveResize(num_keys);
     for(auto i = 0; i < numResizes; ++i) {
@@ -107,18 +112,20 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
 
     // calling the kernel for bulk build:
     build_table_with_unique_keys_kernel<KeyT, ValueT>
-        <<<num_blocks, BLOCKSIZE_>>>(d_retry, d_success, d_key, d_value, num_keys, gpu_context_);
+        <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-    CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
+    //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
     
     // resize the pool here if necessary
+    /*
     if(h_retry) {
       resize();
     }
     CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
-  }
-  CHECK_CUDA_ERROR(cudaFree(d_retry));
-  CHECK_CUDA_ERROR(cudaFree(d_success));
+    */
+  //}
+  //CHECK_CUDA_ERROR(cudaFree(d_retry));
+  //CHECK_CUDA_ERROR(cudaFree(d_success));
 
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index f24b5cb..de5c68e 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -20,8 +20,8 @@
  */
 template <typename KeyT, typename ValueT>
 __global__ void build_table_kernel(
-    int* d_retry,
-    bool* d_success,
+    /*int* d_retry,*/
+    /*bool* d_success,*/
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
@@ -36,32 +36,34 @@ __global__ void build_table_kernel(
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
-  bool mySuccess = true;
+  //bool mySuccess = true;
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
-    mySuccess = d_success[tid];
+    //mySuccess = d_success[tid];
     myKey = d_key[tid];
     myValue = d_value[tid];
     myBucket = slab_hash.computeBucket(myKey);
-    to_insert = !mySuccess;
+    to_insert = true;
   }
 
-  slab_hash.insertPair(mySuccess, to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+  slab_hash.insertPair(/*mySuccess,*/ to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
   
+  /*
   if (tid < num_keys) {
     d_success[tid] = mySuccess;
     atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
   }
+  */
 }
 
 template <typename KeyT, typename ValueT>
 __global__ void build_table_with_unique_keys_kernel(
-    int* d_retry,
-    bool* d_success,
+    /*int* d_retry,*/
+    /*bool* d_success,*/
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
@@ -76,25 +78,27 @@ __global__ void build_table_with_unique_keys_kernel(
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
-  bool mySuccess = true;
+  //bool mySuccess = true;
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
-    mySuccess = d_success[tid];
+    //mySuccess = d_success[tid];
     myKey = d_key[tid];
     myValue = d_value[tid];
     myBucket = slab_hash.computeBucket(myKey);
-    to_insert = !mySuccess;
+    to_insert = true;
   }
 
-  slab_hash.insertPairUnique(mySuccess,
+  slab_hash.insertPairUnique(/*mySuccess,*/
       to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
-
+  
+  /*
   if (tid < num_keys) {
     d_success[tid] = mySuccess;
     atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
   }
+  */
 }
\ No newline at end of file
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index 4b94d81..c52ed11 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -24,7 +24,7 @@
 template <typename KeyT, typename ValueT>
 __device__ __forceinline__ void
 GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
-    bool& mySuccess,
+    /*bool& mySuccess,*/
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
@@ -56,7 +56,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
         // allocate a new node:
         uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
         if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
-          mySuccess = false; // signal that this key needs to be reinserted 
+          //mySuccess = false; // signal that this key needs to be reinserted 
           to_be_inserted = false;
           continue;
         }
@@ -94,7 +94,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
                           *reinterpret_cast<const uint32_t*>(
                               reinterpret_cast<const unsigned char*>(&myKey)));
         if (old_key_value_pair == EMPTY_PAIR_64) {
-          mySuccess = true;
+          //mySuccess = true;
           to_be_inserted = false;  // successful insertion
         }
       }
@@ -113,7 +113,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
 template <typename KeyT, typename ValueT>
 __device__ __forceinline__ bool
 GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique(
-    bool& mySuccess,
+    /*bool& mySuccess,*/
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
@@ -145,7 +145,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                           SlabHashT::REGULAR_NODE_KEY_MASK;
     if (isExisting) {  // key exist in the hash table
       if (laneId == src_lane) {
-        mySuccess = true;
+        //mySuccess = true;
         to_be_inserted = false;
       }
     } else {
@@ -155,7 +155,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
           // allocate a new node:
           uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
           if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
-            mySuccess = false; // signal that this key needs to be reinserted 
+            //mySuccess = false; // signal that this key needs to be reinserted 
             to_be_inserted = false;
             continue;
           }
@@ -192,7 +192,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                             *reinterpret_cast<const uint32_t*>(
                                 reinterpret_cast<const unsigned char*>(&myKey)));
           if (old_key_value_pair == EMPTY_PAIR_64) {
-            mySuccess = true;
+            //mySuccess = true;
             to_be_inserted = false;  // successful insertion
             new_insertion = true;
           }

From c1f8e7ba9b98de0e29138919b4ccad30c19d9d7b Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Mon, 28 Dec 2020 20:13:21 -0500
Subject: [PATCH 12/61] removed dead code

---
 src/concurrent_map/cmap_implementation.cuh | 86 +++++-----------------
 src/concurrent_map/device/build.cuh        | 26 +------
 2 files changed, 22 insertions(+), 90 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index af272f1..b87ea77 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -46,40 +46,17 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
     ValueT* d_value,
     uint32_t num_keys) {
   
-  /*
-  int h_retry = 1;
-  int *d_retry;
-  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
-  CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
-  bool* d_success;
-  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool)));
-  CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
-  */
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
-  //while(h_retry) {
-    // predict whether resizing will be necessary, and take preemptive action.
-    auto numResizes = checkForPreemptiveResize(num_keys);
-    for(auto i = 0; i < numResizes; ++i) {
-      resize();
-    }
+  auto numResizes = checkForPreemptiveResize(num_keys);
+  for(auto i = 0; i < numResizes; ++i) {
+    resize();
+  }
 
-    // calling the kernel for bulk build:
-    build_table_kernel<KeyT, ValueT>
-        <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-    //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
-    
-    // resize the pool here if necessary
-    /*
-    if(h_retry) {
-      resize();
-    }
-    CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
-    */
-  //}
-  //CHECK_CUDA_ERROR(cudaFree(d_retry));
-  //CHECK_CUDA_ERROR(cudaFree(d_success));
+  // calling the kernel for bulk build:
+  build_table_kernel<KeyT, ValueT>
+      <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
+  CHECK_CUDA_ERROR(cudaDeviceSynchronize());
   
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
@@ -88,45 +65,22 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
 
 template <typename KeyT, typename ValueT>
 void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqueKeys(
-    KeyT* d_key,
-    ValueT* d_value,
-    uint32_t num_keys) {
+  KeyT* d_key,
+  ValueT* d_value,
+  uint32_t num_keys) {
   
-  /*
-  int h_retry = 1;
-  int *d_retry;
-  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_retry, sizeof(int)));
-  CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
-  bool* d_success;
-  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_success, num_keys * sizeof(bool)));
-  CHECK_CUDA_ERROR(cudaMemset((void*)d_success, 0x00, num_keys * sizeof(bool)));
-  */
   const uint32_t num_blocks = (num_keys + BLOCKSIZE_ - 1) / BLOCKSIZE_;
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
-  //while(h_retry) {
-    // predict whether resizing will be necessary, and take preemptive action.
-    auto numResizes = checkForPreemptiveResize(num_keys);
-    for(auto i = 0; i < numResizes; ++i) {
-      resize();
-    }
-
-    // calling the kernel for bulk build:
-    build_table_with_unique_keys_kernel<KeyT, ValueT>
-        <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-    //CHECK_CUDA_ERROR(cudaMemcpy(&h_retry, d_retry, sizeof(int), cudaMemcpyDeviceToHost));
-    
-    // resize the pool here if necessary
-    /*
-    if(h_retry) {
-      resize();
-    }
-    CHECK_CUDA_ERROR(cudaMemset((void*)d_retry, 0x00, sizeof(int)));
-    */
-  //}
-  //CHECK_CUDA_ERROR(cudaFree(d_retry));
-  //CHECK_CUDA_ERROR(cudaFree(d_success));
+  auto numResizes = checkForPreemptiveResize(num_keys);
+  for(auto i = 0; i < numResizes; ++i) {
+    resize();
+  }
 
+  // calling the kernel for bulk build:
+  build_table_with_unique_keys_kernel<KeyT, ValueT>
+      <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
+  CHECK_CUDA_ERROR(cudaDeviceSynchronize());
+  
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
   gpu_context_.updateTotalNumKeys(num_keys);
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index de5c68e..f48b5e5 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -20,8 +20,6 @@
  */
 template <typename KeyT, typename ValueT>
 __global__ void build_table_kernel(
-    /*int* d_retry,*/
-    /*bool* d_success,*/
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
@@ -36,34 +34,23 @@ __global__ void build_table_kernel(
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
-  //bool mySuccess = true;
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
-    //mySuccess = d_success[tid];
     myKey = d_key[tid];
     myValue = d_value[tid];
     myBucket = slab_hash.computeBucket(myKey);
     to_insert = true;
   }
 
-  slab_hash.insertPair(/*mySuccess,*/ to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
-  
-  /*
-  if (tid < num_keys) {
-    d_success[tid] = mySuccess;
-    atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
-  }
-  */
+  slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
 }
 
 template <typename KeyT, typename ValueT>
 __global__ void build_table_with_unique_keys_kernel(
-    /*int* d_retry,*/
-    /*bool* d_success,*/
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
@@ -78,27 +65,18 @@ __global__ void build_table_with_unique_keys_kernel(
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
-  //bool mySuccess = true;
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
-    //mySuccess = d_success[tid];
     myKey = d_key[tid];
     myValue = d_value[tid];
     myBucket = slab_hash.computeBucket(myKey);
     to_insert = true;
   }
 
-  slab_hash.insertPairUnique(/*mySuccess,*/
+  slab_hash.insertPairUnique(
       to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
-  
-  /*
-  if (tid < num_keys) {
-    d_success[tid] = mySuccess;
-    atomicCAS(d_retry, 0, (int)!mySuccess); // if any key was not successful, we need to resize and retry
-  }
-  */
 }
\ No newline at end of file

From a4f6ba0acd0c2db55232a53820e188c1727e474c Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Mon, 28 Dec 2020 23:48:55 -0500
Subject: [PATCH 13/61] resizing parameter tuning

---
 src/concurrent_map/cmap_implementation.cuh | 5 +----
 src/slab_hash_global.cuh                   | 4 ++--
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index b87ea77..7d39ae5 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -30,12 +30,9 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.85) {
+  if(finalSlabLoadFactor > 0.60) {
     numResizes = 1;
   }
-  if(finalSlabLoadFactor > 1.95) {
-    numResizes = 2;
-  }
 
   return numResizes;
 }
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index e9f21aa..c2b6b2f 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -127,8 +127,8 @@ class GpuSlabHashContext;
 // The custom allocator that is being used for this code:
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
-constexpr uint32_t log_num_mem_blocks = 10; // 128 MB
-constexpr uint32_t num_super_blocks = 2;
+constexpr uint32_t log_num_mem_blocks = 10; // 128 MB 
+constexpr uint32_t num_super_blocks = 1;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From b29b9886f9256b0fe47df6f652ef4e56d823c5b7 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Tue, 29 Dec 2020 21:07:54 -0500
Subject: [PATCH 14/61] added cub

---
 .gitmodules              | 3 +++
 CMakeLists.txt           | 3 ++-
 SlabAlloc                | 2 +-
 cub                      | 1 +
 src/slab_hash_global.cuh | 1 +
 5 files changed, 8 insertions(+), 2 deletions(-)
 create mode 160000 cub

diff --git a/.gitmodules b/.gitmodules
index 1c94e86..201a205 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "ThirdParty/googletest"]
 	path = ThirdParty/googletest
 	url = https://github.com/google/googletest
+[submodule "cub"]
+	path = cub
+	url = https://github.com/NVIDIA/cub.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2310713..72c7a6c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ find_package(CUDA 8.0 REQUIRED)
 option(CMAKE_VERBOSE_MAKEFILE ON)
 option(DGTEST,  "DGTEST"  ON)
 
-set(CUDA_NVCC_FLAGS -std=c++11)
+set(CUDA_NVCC_FLAGS -std=c++14)
 set (CMAKE_CXX_STANDARD 11)
 
 if (CUDA_VERBOSE_PTXAS)
@@ -83,6 +83,7 @@ if(SLABHASH_GENCODE_SM75)
 endif(SLABHASH_GENCODE_SM75)
 
 include_directories(SlabAlloc/src)
+include_directories(cub)
 include_directories(src src/concurrent)
 include_directories(ThirdParty/rapidjson/include)
 include_directories(ThirdParty/googletest/googletest)
diff --git a/SlabAlloc b/SlabAlloc
index 6abbedb..3b98001 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 6abbedb680bca342863589afd152b12ced0c0fd1
+Subproject commit 3b980011e912c32de4ca1f4756258309aee4776c
diff --git a/cub b/cub
new file mode 160000
index 0000000..c3be9a9
--- /dev/null
+++ b/cub
@@ -0,0 +1 @@
+Subproject commit c3be9a94273b5049520aacc7db00c738668aaa3f
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index c2b6b2f..15ec263 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "slab_alloc.cuh"
+#include "cub/cub.cuh"
 
 #define CHECK_CUDA_ERROR(call)                                                          \
   do {                                                                                  \

From e3c64636c9184cc406e03e37a50112e69c894103 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 31 Dec 2020 14:39:41 -0500
Subject: [PATCH 15/61] added block reduce key counting for
 buildBulkWithUniqueKeys

---
 src/concurrent_map/cmap_class.cuh          |  2 +-
 src/concurrent_map/cmap_implementation.cuh |  9 ++++++---
 src/concurrent_map/device/build.cuh        | 13 ++++++++++++-
 src/concurrent_map/warp/insert.cuh         |  9 +++++----
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index a66e98b..c1e8020 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -113,7 +113,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
   // threads in a warp cooperate with each other to insert a unique key (and its value)
   // into the slab hash
   __device__ __forceinline__ bool insertPairUnique(
-      /*bool& mySuccess,*/
+      int& mySuccess,
       bool& to_be_inserted,
       const uint32_t& laneId,
       const KeyT& myKey,
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 7d39ae5..420cc9c 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -51,8 +51,9 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   }
 
   // calling the kernel for bulk build:
+  //int num_successes = 0;
   build_table_kernel<KeyT, ValueT>
-      <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
+      <<<num_blocks, BLOCKSIZE_>>>(d_key, d_value, num_keys, gpu_context_);
   CHECK_CUDA_ERROR(cudaDeviceSynchronize());
   
   // now that the bulk insert has completed successfully, we can
@@ -74,13 +75,15 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   }
 
   // calling the kernel for bulk build:
+  int num_successes = 0;
   build_table_with_unique_keys_kernel<KeyT, ValueT>
-      <<<num_blocks, BLOCKSIZE_>>>(/*d_retry, d_success,*/ d_key, d_value, num_keys, gpu_context_);
+      <<<num_blocks, BLOCKSIZE_>>>(&num_successes, d_key, d_value, num_keys, gpu_context_);
   CHECK_CUDA_ERROR(cudaDeviceSynchronize());
   
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
-  gpu_context_.updateTotalNumKeys(num_keys);
+  gpu_context_.updateTotalNumKeys(num_successes);
+  std::cout << "num_successes: " << num_successes << std::endl;
 }
 
 template <typename KeyT, typename ValueT>
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index f48b5e5..b391fb6 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -51,10 +51,15 @@ __global__ void build_table_kernel(
 
 template <typename KeyT, typename ValueT>
 __global__ void build_table_with_unique_keys_kernel(
+    int *num_successes,
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
     GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+  
+  typedef cub::BlockReduce<std::size_t, 128> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
@@ -68,6 +73,7 @@ __global__ void build_table_with_unique_keys_kernel(
   KeyT myKey = 0;
   ValueT myValue = 0;
   uint32_t myBucket = 0;
+  int mySuccess = 0;
   bool to_insert = false;
 
   if (tid < num_keys) {
@@ -77,6 +83,11 @@ __global__ void build_table_with_unique_keys_kernel(
     to_insert = true;
   }
 
-  slab_hash.insertPairUnique(
+  slab_hash.insertPairUnique(mySuccess,
       to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+      
+  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(mySuccess);
+  if(threadIdx.x == 0) {
+    atomicAdd(num_successes, block_num_successes);
+  }
 }
\ No newline at end of file
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index c52ed11..ad8f2c8 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -113,13 +113,14 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
 template <typename KeyT, typename ValueT>
 __device__ __forceinline__ bool
 GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique(
-    /*bool& mySuccess,*/
+    int& mySuccess,
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
     const ValueT& myValue,
     const uint32_t bucket_id,
     AllocatorContextT& local_allocator_ctx) {
+
   using SlabHashT = ConcurrentMapT<KeyT, ValueT>;
   uint32_t work_queue = 0;
   uint32_t last_work_queue = 0;
@@ -145,7 +146,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                           SlabHashT::REGULAR_NODE_KEY_MASK;
     if (isExisting) {  // key exist in the hash table
       if (laneId == src_lane) {
-        //mySuccess = true;
+        mySuccess = 0;
         to_be_inserted = false;
       }
     } else {
@@ -155,7 +156,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
           // allocate a new node:
           uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
           if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
-            //mySuccess = false; // signal that this key needs to be reinserted 
+            mySuccess = 0; // signal that this key needs to be reinserted 
             to_be_inserted = false;
             continue;
           }
@@ -192,7 +193,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                             *reinterpret_cast<const uint32_t*>(
                                 reinterpret_cast<const unsigned char*>(&myKey)));
           if (old_key_value_pair == EMPTY_PAIR_64) {
-            //mySuccess = true;
+            mySuccess = 1;
             to_be_inserted = false;  // successful insertion
             new_insertion = true;
           }

From 36ee554d3c38563fbc52551417ba15d8311bef2f Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 31 Dec 2020 15:03:29 -0500
Subject: [PATCH 16/61] bug fixes in insert unique call

---
 src/concurrent_map/cmap_implementation.cuh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 420cc9c..748958a 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -75,15 +75,18 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   }
 
   // calling the kernel for bulk build:
-  int num_successes = 0;
+  int *num_successes;
+  CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int)));
+  *num_successes = 0;
+
   build_table_with_unique_keys_kernel<KeyT, ValueT>
-      <<<num_blocks, BLOCKSIZE_>>>(&num_successes, d_key, d_value, num_keys, gpu_context_);
+      <<<num_blocks, BLOCKSIZE_>>>(num_successes, d_key, d_value, num_keys, gpu_context_);
   CHECK_CUDA_ERROR(cudaDeviceSynchronize());
   
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
-  gpu_context_.updateTotalNumKeys(num_successes);
-  std::cout << "num_successes: " << num_successes << std::endl;
+  gpu_context_.updateTotalNumKeys(*num_successes);
+  std::cout << "num_successes: " << *num_successes << std::endl;
 }
 
 template <typename KeyT, typename ValueT>

From 610af7bb7fa64db5e33229aeac4d1e29a0494408 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 31 Dec 2020 15:19:57 -0500
Subject: [PATCH 17/61] removed print

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 748958a..ad6837d 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -86,7 +86,7 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
   gpu_context_.updateTotalNumKeys(*num_successes);
-  std::cout << "num_successes: " << *num_successes << std::endl;
+  //std::cout << "num_successes: " << *num_successes << std::endl;
 }
 
 template <typename KeyT, typename ValueT>

From 8b006f9a1f4d2047c08aac16ea03d8e6a39c4be7 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 01:11:38 -0500
Subject: [PATCH 18/61] 37.5 split

---
 SlabAlloc                | 2 +-
 src/slab_hash_global.cuh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index 3b98001..f85aec8 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 3b980011e912c32de4ca1f4756258309aee4776c
+Subproject commit f85aec8640315f01d0a8d4f3333f5b851e5f4b1f
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 15ec263..de0de30 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 10; // 128 MB 
-constexpr uint32_t num_super_blocks = 1;
+constexpr uint32_t num_super_blocks = 5;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From c1fb64258b3c2f2caa14707bd63f5efda6d4538e Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 14:48:51 -0500
Subject: [PATCH 19/61] resize at 70% lf

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index ad6837d..43d9c88 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -30,7 +30,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.60) {
+  if(finalSlabLoadFactor > 0.70) {
     numResizes = 1;
   }
 

From d2d2926492ff727c477b27be84433cc0668aec14 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 15:10:18 -0500
Subject: [PATCH 20/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 43d9c88..a5fc273 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -27,6 +27,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto numSlabs = gpu_context_.getTotalNumSlabs();
   auto maxElemCapacity = numSlabs * 15;
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
+  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity;
   auto numResizes = 0;
 

From de70e25420ede59f8c10f0febd281631513ebd11 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 15:49:54 -0500
Subject: [PATCH 21/61] changes to SlabAlloc

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index f85aec8..99ea8d8 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit f85aec8640315f01d0a8d4f3333f5b851e5f4b1f
+Subproject commit 99ea8d883d6adcef043c5029ffc27dbd3319fe2a

From b090ecea74be21c0920885f5558259eeee9d47f9 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 16:14:06 -0500
Subject: [PATCH 22/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index a5fc273..8a16ea3 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -25,6 +25,7 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
 template <typename KeyT, typename ValueT>
 uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
+  std::cout << "numSlabs " << numSlabs << std::endl;
   auto maxElemCapacity = numSlabs * 15;
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
   std::cout << "finalNumKeys " << finalNumKeys << std::endl;

From 5769e2030f05945e1f1b9beebc9ec7363122e995 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 16:54:20 -0500
Subject: [PATCH 23/61] capacity calculation correction

---
 src/concurrent_map/cmap_implementation.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 8a16ea3..acb7df2 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -26,10 +26,11 @@ template <typename KeyT, typename ValueT>
 uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
   std::cout << "numSlabs " << numSlabs << std::endl;
-  auto maxElemCapacity = numSlabs * 15;
+  
+  auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
   std::cout << "finalNumKeys " << finalNumKeys << std::endl;
-  auto finalSlabLoadFactor = (float) (finalNumKeys) / maxElemCapacity;
+  auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
   if(finalSlabLoadFactor > 0.70) {

From 90909ee7f6a2fcce142170a440318d3339377837 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 1 Jan 2021 17:02:17 -0500
Subject: [PATCH 24/61] removed prints

---
 src/concurrent_map/cmap_implementation.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index acb7df2..0f6c660 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -25,11 +25,11 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
 template <typename KeyT, typename ValueT>
 uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
-  std::cout << "numSlabs " << numSlabs << std::endl;
+  //std::cout << "numSlabs " << numSlabs << std::endl;
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  //åstd::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 

From 855db99e58f6d6ee645de57e4cf1fb29f94c3fe6 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Tue, 5 Jan 2021 12:57:54 -0500
Subject: [PATCH 25/61] 87.5% splut

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/slab_hash_global.cuh                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 0f6c660..ef86a43 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -33,7 +33,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.70) {
+  if(finalSlabLoadFactor > 0.60) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index de0de30..15ec263 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 10; // 128 MB 
-constexpr uint32_t num_super_blocks = 5;
+constexpr uint32_t num_super_blocks = 1;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From b82f0aaf13e9cde49c556028c053b1fabade103f Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Wed, 6 Jan 2021 16:33:27 -0500
Subject: [PATCH 26/61] 37.5% split

---
 SlabAlloc                                  | 2 +-
 src/concurrent_map/cmap_implementation.cuh | 4 ++--
 src/slab_hash_global.cuh                   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index 99ea8d8..8677195 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 99ea8d883d6adcef043c5029ffc27dbd3319fe2a
+Subproject commit 8677195a86fb73193434385d42c42471e1a4006b
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index ef86a43..ace2ca4 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -29,11 +29,11 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  //åstd::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.60) {
+  if(finalSlabLoadFactor > 0.80) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 15ec263..de0de30 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 10; // 128 MB 
-constexpr uint32_t num_super_blocks = 1;
+constexpr uint32_t num_super_blocks = 5;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From df66adc0a86a578674ce10f977596e97665a29eb Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Wed, 6 Jan 2021 17:34:43 -0500
Subject: [PATCH 27/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index ace2ca4..9fa56f7 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -25,11 +25,11 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
 template <typename KeyT, typename ValueT>
 uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
-  //std::cout << "numSlabs " << numSlabs << std::endl;
+  std::cout << "numSlabs " << numSlabs << std::endl;
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 

From 12d4d9cfb0d823f01c8bb20af8ef4b40e1d86b20 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Wed, 6 Jan 2021 18:17:17 -0500
Subject: [PATCH 28/61] bug fix in insertUnique device function

---
 src/concurrent_map/warp/insert.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index ad8f2c8..b6d85be 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -193,7 +193,7 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                             *reinterpret_cast<const uint32_t*>(
                                 reinterpret_cast<const unsigned char*>(&myKey)));
           if (old_key_value_pair == EMPTY_PAIR_64) {
-            mySuccess = 1;
+            mySuccess += 1;
             to_be_inserted = false;  // successful insertion
             new_insertion = true;
           }

From 1f0853564c894bed41a1ae7ca9c993f00607c64b Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Wed, 6 Jan 2021 19:07:54 -0500
Subject: [PATCH 29/61] bug fix in build

---
 src/concurrent_map/device/build.cuh | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index b391fb6..9b05f63 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -63,9 +63,9 @@ __global__ void build_table_with_unique_keys_kernel(
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
-  if ((tid - laneId) >= num_keys) {
-    return;
-  }
+  //if ((tid - laneId) >= num_keys) {
+  //  return;
+  //}
 
   AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
@@ -83,8 +83,10 @@ __global__ void build_table_with_unique_keys_kernel(
     to_insert = true;
   }
 
-  slab_hash.insertPairUnique(mySuccess,
-      to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+  if ((tid - laneId) < num_keys) {
+    slab_hash.insertPairUnique(mySuccess,
+        to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
+  }
       
   std::size_t block_num_successes = BlockReduce(temp_storage).Sum(mySuccess);
   if(threadIdx.x == 0) {

From 7c511750f4fc3f664417daedd20a870a66133f82 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Wed, 6 Jan 2021 19:31:31 -0500
Subject: [PATCH 30/61] bug fix in insert unique function

---
 src/concurrent_map/warp/insert.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index b6d85be..124b863 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -146,7 +146,6 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
                           SlabHashT::REGULAR_NODE_KEY_MASK;
     if (isExisting) {  // key exist in the hash table
       if (laneId == src_lane) {
-        mySuccess = 0;
         to_be_inserted = false;
       }
     } else {
@@ -156,7 +155,6 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique
           // allocate a new node:
           uint32_t new_node_ptr = allocateSlab(local_allocator_ctx, laneId);
           if(new_node_ptr == 0xFFFFFFFF) { // could not allocate a new slab: pool size needs to be increased
-            mySuccess = 0; // signal that this key needs to be reinserted 
             to_be_inserted = false;
             continue;
           }

From d1a116cfa2661c32c23c2ce7ed1190dedeacf456 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Wed, 6 Jan 2021 19:54:11 -0500
Subject: [PATCH 31/61] changes to SlabAlloc

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index 8677195..ffdc551 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 8677195a86fb73193434385d42c42471e1a4006b
+Subproject commit ffdc551ad6d5c6224b909731264b790415790a2b

From 854b1f94df4451a5d894363eb22a3ac4c848fe86 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 01:11:56 -0500
Subject: [PATCH 32/61] resize at 70%

---
 src/concurrent_map/cmap_implementation.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 9fa56f7..780f78c 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -25,15 +25,15 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
 template <typename KeyT, typename ValueT>
 uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
-  std::cout << "numSlabs " << numSlabs << std::endl;
+  //std::cout << "numSlabs " << numSlabs << std::endl;
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.80) {
+  if(finalSlabLoadFactor > 0.70) {
     numResizes = 1;
   }
 

From 34afe70d437f3ebbfc1f8ec4b5e7df2d202f8bfd Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 01:25:49 -0500
Subject: [PATCH 33/61] 87.5 split

---
 SlabAlloc                                  | 2 +-
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/slab_hash_global.cuh                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index ffdc551..bcfbc76 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit ffdc551ad6d5c6224b909731264b790415790a2b
+Subproject commit bcfbc7621d07acf1a637c3b32aad435439fa0f69
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 780f78c..f8e2e3d 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -33,7 +33,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.70) {
+  if(finalSlabLoadFactor > 0.60) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index de0de30..15ec263 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 10; // 128 MB 
-constexpr uint32_t num_super_blocks = 5;
+constexpr uint32_t num_super_blocks = 1;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From 57c013367bb696e9173313acb0cc3d32150fbd74 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 01:32:38 -0500
Subject: [PATCH 34/61] resize at 50%

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index f8e2e3d..a0d1ae9 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -33,7 +33,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.60) {
+  if(finalSlabLoadFactor > 0.50) {
     numResizes = 1;
   }
 

From 79eccec25a242a3be7b8840ba859241a6e422ce2 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 01:35:42 -0500
Subject: [PATCH 35/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index a0d1ae9..34e827d 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -25,11 +25,10 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
 template <typename KeyT, typename ValueT>
 uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
-  //std::cout << "numSlabs " << numSlabs << std::endl;
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 

From 8314c45c3c076d8e218036420729cac4d5c308e3 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 01:52:05 -0500
Subject: [PATCH 36/61] resize at 60% occupancy

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 34e827d..36ebb4a 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.50) {
+  if(finalSlabLoadFactor > 0.60) {
     numResizes = 1;
   }
 

From 3a73666fa004d1828ad103723ee3d2f9f738b9d3 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 02:04:01 -0500
Subject: [PATCH 37/61] resize at 65% lf

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 36ebb4a..4442998 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.60) {
+  if(finalSlabLoadFactor > 0.65) {
     numResizes = 1;
   }
 

From 31d7a0fa143ea695ecb56879273c1bfb30a53c1f Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 23:15:12 -0500
Subject: [PATCH 38/61] prints

---
 src/concurrent_map/cmap_class.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index c1e8020..2860f48 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -256,12 +256,14 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
     // creating a random number generator:
     if (!identity_hash) {
+      std::cout << "chicken" << std::endl;
       std::mt19937 rng(seed ? seed : time(0));
       hf_.x = rng() % PRIME_DIVISOR_;
       if (hf_.x < 1)
         hf_.x = 1;
       hf_.y = rng() % PRIME_DIVISOR_;
     } else {
+      std::cout << "cow" << std::endl;
       hf_ = {0u, 0u};
     }
 

From 8f7b436bc1c8304718d30346f1bdbd827fcc567f Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Thu, 7 Jan 2021 23:21:57 -0500
Subject: [PATCH 39/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 4442998..7e27814 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -81,6 +81,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int)));
   *num_successes = 0;
 
+  std::cout << "hashes" << gpu_context_.getHashX() << ", " << gpu_context_.getHashY() << std::endl;
+
   build_table_with_unique_keys_kernel<KeyT, ValueT>
       <<<num_blocks, BLOCKSIZE_>>>(num_successes, d_key, d_value, num_keys, gpu_context_);
   CHECK_CUDA_ERROR(cudaDeviceSynchronize());

From 42d25689b851a93194a1144f3eb9c1eabc1da5b7 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 00:17:11 -0500
Subject: [PATCH 40/61] prints

---
 src/concurrent_map/cmap_class.cuh          | 2 --
 src/concurrent_map/cmap_implementation.cuh | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index 2860f48..c1e8020 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -256,14 +256,12 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
     // creating a random number generator:
     if (!identity_hash) {
-      std::cout << "chicken" << std::endl;
       std::mt19937 rng(seed ? seed : time(0));
       hf_.x = rng() % PRIME_DIVISOR_;
       if (hf_.x < 1)
         hf_.x = 1;
       hf_.y = rng() % PRIME_DIVISOR_;
     } else {
-      std::cout << "cow" << std::endl;
       hf_ = {0u, 0u};
     }
 
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 7e27814..2393a05 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -28,11 +28,11 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.65) {
+  if(finalSlabLoadFactor > 0.60) {
     numResizes = 1;
   }
 
@@ -81,8 +81,6 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   CHECK_CUDA_ERROR(cudaMallocManaged(&num_successes, sizeof(int)));
   *num_successes = 0;
 
-  std::cout << "hashes" << gpu_context_.getHashX() << ", " << gpu_context_.getHashY() << std::endl;
-
   build_table_with_unique_keys_kernel<KeyT, ValueT>
       <<<num_blocks, BLOCKSIZE_>>>(num_successes, d_key, d_value, num_keys, gpu_context_);
   CHECK_CUDA_ERROR(cudaDeviceSynchronize());

From 191188975ade8fa95a03a9c31358197730dc05f4 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 14:43:55 -0500
Subject: [PATCH 41/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 4 ++--
 src/slab_hash_global.cuh                   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 2393a05..eed8bf2 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -28,11 +28,11 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 0.60) {
+  if(finalSlabLoadFactor > 1) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 15ec263..b5fa3b6 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -128,8 +128,8 @@ class GpuSlabHashContext;
 // The custom allocator that is being used for this code:
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
-constexpr uint32_t log_num_mem_blocks = 10; // 128 MB 
-constexpr uint32_t num_super_blocks = 1;
+constexpr uint32_t log_num_mem_blocks = 9; // 64 MB 
+constexpr uint32_t num_super_blocks = 15;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From c2db909b8873569f69501138d40a9b8b26a9b0f4 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 14:51:43 -0500
Subject: [PATCH 42/61] resize at 90%

---
 src/concurrent_map/cmap_implementation.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index eed8bf2..3f71747 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -28,11 +28,11 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor > 1) {
+  if(finalSlabLoadFactor >= 0.90) {
     numResizes = 1;
   }
 

From c1df972af5901b577dc71c3af974cb60104b0200 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 14:54:56 -0500
Subject: [PATCH 43/61] changes to SlabAlloc

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index bcfbc76..794194f 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit bcfbc7621d07acf1a637c3b32aad435439fa0f69
+Subproject commit 794194faea5a184c574c6811426acf476d3c08fa

From 32558d2f233cc6704e51b42c2161d88bb1878cfe Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 15:00:32 -0500
Subject: [PATCH 44/61] changes to SlabAlloc

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index 794194f..b4711d1 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 794194faea5a184c574c6811426acf476d3c08fa
+Subproject commit b4711d1cf11d805d7eb47530f6a5a93ddd2c2591

From 936ae4586ceb5bee1e7831dad99a5d3a463b4fd1 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 15:03:13 -0500
Subject: [PATCH 45/61] SlabAlloc

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index b4711d1..ee8f226 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit b4711d1cf11d805d7eb47530f6a5a93ddd2c2591
+Subproject commit ee8f226a235ca1c86812876eecbfbae5eeab2b53

From 7053d9ca1e43051f4af6bc5e2cd762c70ce02dc8 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 8 Jan 2021 15:14:30 -0500
Subject: [PATCH 46/61] fixes to SlabAlloc

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index ee8f226..1e9093d 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit ee8f226a235ca1c86812876eecbfbae5eeab2b53
+Subproject commit 1e9093d82e85fce18994c9bb8c4735bad19d0c5b

From b6ae38e286e314d149ef339087aedc05c38a53c4 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sat, 9 Jan 2021 00:12:35 -0500
Subject: [PATCH 47/61] reisze at 75

---
 SlabAlloc                                  | 2 +-
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/slab_hash_global.cuh                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index 1e9093d..3851e82 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 1e9093d82e85fce18994c9bb8c4735bad19d0c5b
+Subproject commit 3851e82d11f68e57fdf10cf9152538513857aac4
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 3f71747..ca599b5 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor >= 0.90) {
+  if(finalSlabLoadFactor >= 0.75) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index b5fa3b6..99dd521 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 9; // 64 MB 
-constexpr uint32_t num_super_blocks = 15;
+constexpr uint32_t num_super_blocks = 10;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From ba0b0c3b8fd96f493cc55a961e8973432bd72b9f Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sat, 9 Jan 2021 00:24:52 -0500
Subject: [PATCH 48/61] prints

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index ca599b5..e0be948 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -28,7 +28,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 

From 946aa03bbb241c0da730ea6770b981a0edf13a69 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sat, 9 Jan 2021 00:29:54 -0500
Subject: [PATCH 49/61] changes to SlabAlloc

---
 SlabAlloc                                  | 2 +-
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/slab_hash_global.cuh                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index 3851e82..c165af5 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 3851e82d11f68e57fdf10cf9152538513857aac4
+Subproject commit c165af51d2cea7bfed9659eb2f6fc0f0665cb8dc
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index e0be948..ca599b5 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -28,7 +28,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  std::cout << "finalNumKeys " << finalNumKeys << std::endl;
+  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 99dd521..9586188 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 9; // 64 MB 
-constexpr uint32_t num_super_blocks = 10;
+constexpr uint32_t num_super_blocks = 11;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From a217079bf41fd8e5c8b48deaebcc2d5c31afd8a1 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sat, 9 Jan 2021 01:23:13 -0500
Subject: [PATCH 50/61] resize 60

---
 SlabAlloc                                  | 2 +-
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/slab_hash_global.cuh                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SlabAlloc b/SlabAlloc
index c165af5..bf15666 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit c165af51d2cea7bfed9659eb2f6fc0f0665cb8dc
+Subproject commit bf15666694570184b7289e2d3939315349c92bcc
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index ca599b5..5867c7e 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor >= 0.75) {
+  if(finalSlabLoadFactor >= 0.60) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 9586188..30d174e 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 9; // 64 MB 
-constexpr uint32_t num_super_blocks = 11;
+constexpr uint32_t num_super_blocks = 2;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From f5f6f00ff4167394d351966c8f050fa849ed7942 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 15 Jan 2021 23:02:51 -0500
Subject: [PATCH 51/61] resize at 55%

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 5867c7e..7f0951d 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor >= 0.60) {
+  if(finalSlabLoadFactor >= 0.55) {
     numResizes = 1;
   }
 

From d9f39f92c29d7e0539e90f36b6d43eec8b01fbd9 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 2 Apr 2021 16:34:28 -0400
Subject: [PATCH 52/61] resize 60

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 7f0951d..5867c7e 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor >= 0.55) {
+  if(finalSlabLoadFactor >= 0.60) {
     numResizes = 1;
   }
 

From 6a57ca86e3503f8d8f44607fdfedb63a79aa89e6 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 2 Apr 2021 16:44:00 -0400
Subject: [PATCH 53/61] resize 90

---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/slab_hash_global.cuh                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 5867c7e..3f71747 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor >= 0.60) {
+  if(finalSlabLoadFactor >= 0.90) {
     numResizes = 1;
   }
 
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index 30d174e..b5fa3b6 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -129,7 +129,7 @@ class GpuSlabHashContext;
 // this might need to be a template paramater itself
 namespace slab_alloc_par {
 constexpr uint32_t log_num_mem_blocks = 9; // 64 MB 
-constexpr uint32_t num_super_blocks = 2;
+constexpr uint32_t num_super_blocks = 15;
 constexpr uint32_t num_replicas = 1;
 }  // namespace slab_alloc_par
 

From 38b5a03d26b7de31c2acdeeb478d5619025887cc Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Fri, 2 Apr 2021 17:27:00 -0400
Subject: [PATCH 54/61] SlabAlloc changes

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index bf15666..95cb758 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit bf15666694570184b7289e2d3939315349c92bcc
+Subproject commit 95cb7581b33ba13b29db4429a1f2213241bb1d47

From b24b99850cc645cc1663549e420d8cfd3ffe2d86 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sat, 17 Apr 2021 21:56:41 -0400
Subject: [PATCH 55/61] number of mem blocks and super blocks configurable from
 user code

---
 src/concurrent_map/cmap_class.cuh             | 33 +++++++-------
 src/concurrent_map/cmap_implementation.cuh    | 44 +++++++++----------
 src/concurrent_map/device/build.cuh           | 12 ++---
 .../device/concurrent_kernel.cuh              |  6 +--
 src/concurrent_map/device/count_kernel.cuh    |  4 +-
 src/concurrent_map/device/delete_kernel.cuh   |  4 +-
 src/concurrent_map/device/misc_kernels.cuh    |  4 +-
 src/concurrent_map/device/search_kernel.cuh   |  8 ++--
 src/concurrent_map/warp/count.cuh             |  4 +-
 src/concurrent_map/warp/delete.cuh            |  4 +-
 src/concurrent_map/warp/insert.cuh            | 12 ++---
 src/concurrent_map/warp/search.cuh            |  8 ++--
 src/concurrent_set/cset_class.cuh             | 30 ++++++-------
 src/concurrent_set/cset_helper_kernels.cuh    | 10 ++---
 src/concurrent_set/cset_implementation.cuh    | 12 ++---
 src/concurrent_set/cset_warp_operations.cuh   | 14 +++---
 src/gpu_hash_table.cuh                        | 15 ++++---
 src/slab_hash_global.cuh                      |  4 +-
 src/slab_iterator.cuh                         |  6 +--
 test/iterator_test.cu                         |  6 +--
 20 files changed, 123 insertions(+), 117 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index c1e8020..d89c5ef 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -22,8 +22,8 @@
  * used at runtime. This class does not own the allocated memory on the gpu
  * (i.e., d_table_)
  */
-template <typename KeyT, typename ValueT>
-class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> {
  public:
   // fixed known parameters:
   static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u;
@@ -36,7 +36,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
 #pragma hd_warning_disable
   __host__ __device__ GpuSlabHashContext(
-      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>& rhs) {
+      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>& rhs) {
     num_buckets_ = rhs.getNumBuckets();
     total_num_slabs_ = rhs.getTotalNumSlabs();
     total_num_keys_ = rhs.getTotalNumKeys();
@@ -61,7 +61,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
                                const uint32_t hash_x,
                                const uint32_t hash_y,
                                int8_t* d_table,
-                               AllocatorContextT* allocator_ctx) {
+                               SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>* allocator_ctx) {
     num_buckets_ = num_buckets;
     hash_x_ = hash_x;
     hash_y_ = hash_y;
@@ -72,7 +72,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
     total_num_keys_ = 0;
   }
 
-  __host__ void updateAllocatorContext(AllocatorContextT* allocator_ctx) {
+  __host__ void updateAllocatorContext(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>* allocator_ctx) {
     global_allocator_ctx_ = *allocator_ctx;
     total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool();
   }
@@ -81,7 +81,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
     total_num_keys_ += keysAdded;
   }
 
-  __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() {
+  __device__ __host__ __forceinline__ SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& getAllocatorContext() {
     return global_allocator_ctx_;
   }
 
@@ -108,7 +108,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
                                              const KeyT& myKey,
                                              const ValueT& myValue,
                                              const uint32_t bucket_id,
-                                             AllocatorContextT& local_allocator_context);
+                                             SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_context);
 
   // threads in a warp cooperate with each other to insert a unique key (and its value)
   // into the slab hash
@@ -119,7 +119,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
       const KeyT& myKey,
       const ValueT& myValue,
       const uint32_t bucket_id,
-      AllocatorContextT& local_allocator_context);
+      SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_context);
 
   // threads in a warp cooperate with each other to search for keys
   // if found, it returns the corresponding value, else SEARCH_NOT_FOUND
@@ -172,7 +172,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
   }
 
   __device__ __forceinline__ SlabAllocAddressT
-  allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) {
+  allocateSlab(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_ctx, const uint32_t& laneId) {
     return local_allocator_ctx.warpAllocate(laneId);
   }
 
@@ -189,14 +189,14 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
   uint32_t hash_y_;
   typename ConcurrentMapT<KeyT, ValueT>::SlabTypeT* d_table_;
   // a copy of dynamic allocator's context to be used on the GPU
-  AllocatorContextT global_allocator_ctx_;
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> global_allocator_ctx_;
 };
 
 /*
  * This class owns the allocated memory for the hash table
  */
-template <typename KeyT, typename ValueT>
-class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> {
  private:
   // fixed known parameters:
   static constexpr uint32_t BLOCKSIZE_ = 128;
@@ -218,16 +218,17 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
 
   // slab hash context, contains everything that a GPU application needs to be
   // able to use this data structure
-  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> gpu_context_;
+  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> gpu_context_;
 
   // const pointer to an allocator that all instances of slab hash are going to
   // use. The allocator itself is not owned by this class
-  DynamicAllocatorT* dynamic_allocator_;
+  //DynamicAllocatorT* dynamic_allocator_;
+  SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1> *dynamic_allocator_;
   uint32_t device_idx_;
 
  public:
   GpuSlabHash(const uint32_t num_buckets,
-              DynamicAllocatorT* dynamic_allocator,
+              SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>* dynamic_allocator,
               uint32_t device_idx,
               const time_t seed = 0,
               const bool identity_hash = false)
@@ -247,7 +248,7 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> {
     CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
 
     slab_unit_size_ =
-        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::getSlabUnitSize();
+        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::getSlabUnitSize();
 
     // allocating initial buckets:
     CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_));
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 3f71747..a76ec7d 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -16,14 +16,14 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::resize() {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::resize() {
   dynamic_allocator_->growPool();
   gpu_context_.updateAllocatorContext(dynamic_allocator_->getContextPtr());
 }
 
-template <typename KeyT, typename ValueT>
-uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemptiveResize(uint32_t keysAdded) {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
@@ -39,8 +39,8 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::checkForPreemp
   return numResizes;
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
@@ -63,8 +63,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulk(
   gpu_context_.updateTotalNumKeys(num_keys);
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqueKeys(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::buildBulkWithUniqueKeys(
   KeyT* d_key,
   ValueT* d_value,
   uint32_t num_keys) {
@@ -91,8 +91,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::buildBulkWithUniqu
   //std::cout << "num_successes: " << *num_successes << std::endl;
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchIndividual(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::searchIndividual(
     KeyT* d_query,
     ValueT* d_result,
     uint32_t num_queries) {
@@ -102,8 +102,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchIndividual(
       <<<num_blocks, BLOCKSIZE_>>>(d_query, d_result, num_queries, gpu_context_);
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchBulk(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::searchBulk(
     KeyT* d_query,
     ValueT* d_result,
     uint32_t num_queries) {
@@ -113,8 +113,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchBulk(
       <<<num_blocks, BLOCKSIZE_>>>(d_query, d_result, num_queries, gpu_context_);
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::countIndividual(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::countIndividual(
     KeyT* d_query,
     uint32_t* d_count,
     uint32_t num_queries) {
@@ -124,8 +124,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::countIndividual(
       <<<num_blocks, BLOCKSIZE_>>>(d_query, d_count, num_queries, gpu_context_);
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::deleteIndividual(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::deleteIndividual(
     KeyT* d_key,
     uint32_t num_keys) {
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
@@ -135,8 +135,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::deleteIndividual(
 }
 
 // perform a batch of (a mixture of) updates/searches
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::batchedOperation(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::batchedOperation(
     KeyT* d_key,
     ValueT* d_result,
     uint32_t num_ops) {
@@ -146,8 +146,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::batchedOperation(
       <<<num_blocks, BLOCKSIZE_>>>(d_key, d_result, num_ops, gpu_context_);
 }
 
-template <typename KeyT, typename ValueT>
-std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::to_string() {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::to_string() {
   std::string result;
   result += " ==== GpuSlabHash: \n";
   result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n";
@@ -161,8 +161,8 @@ std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::to_string()
   return result;
 }
 
-template <typename KeyT, typename ValueT>
-double GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::computeLoadFactor(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+double GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::computeLoadFactor(
     int flag = 0) {
   uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_];
   uint32_t* d_bucket_pairs_count;
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index 9b05f63..3f00207 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -18,12 +18,12 @@
 /*
  *
  */
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void build_table_kernel(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
@@ -31,7 +31,7 @@ __global__ void build_table_kernel(
     return;
   }
 
-  AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
   KeyT myKey = 0;
@@ -49,13 +49,13 @@ __global__ void build_table_kernel(
   slab_hash.insertPair(to_insert, laneId, myKey, myValue, myBucket, local_allocator_ctx);
 }
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void build_table_with_unique_keys_kernel(
     int *num_successes,
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   
   typedef cub::BlockReduce<std::size_t, 128> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -67,7 +67,7 @@ __global__ void build_table_with_unique_keys_kernel(
   //  return;
   //}
 
-  AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
   KeyT myKey = 0;
diff --git a/src/concurrent_map/device/concurrent_kernel.cuh b/src/concurrent_map/device/concurrent_kernel.cuh
index d72c546..8cd7626 100644
--- a/src/concurrent_map/device/concurrent_kernel.cuh
+++ b/src/concurrent_map/device/concurrent_kernel.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void batched_operations(
     uint32_t* d_operations,
     uint32_t* d_results,
     uint32_t num_operations,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
@@ -29,7 +29,7 @@ __global__ void batched_operations(
     return;
 
   // initializing the memory allocator on each warp:
-  AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
   uint32_t myOperation = 0;
diff --git a/src/concurrent_map/device/count_kernel.cuh b/src/concurrent_map/device/count_kernel.cuh
index 99fffba..d71da3d 100644
--- a/src/concurrent_map/device/count_kernel.cuh
+++ b/src/concurrent_map/device/count_kernel.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void count_key(
     KeyT* d_queries,
     uint32_t* d_counts,
     uint32_t num_queries,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
diff --git a/src/concurrent_map/device/delete_kernel.cuh b/src/concurrent_map/device/delete_kernel.cuh
index 9542623..edc47cc 100644
--- a/src/concurrent_map/device/delete_kernel.cuh
+++ b/src/concurrent_map/device/delete_kernel.cuh
@@ -16,11 +16,11 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void delete_table_keys(
     KeyT* d_key_deleted,
     uint32_t num_keys,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
diff --git a/src/concurrent_map/device/misc_kernels.cuh b/src/concurrent_map/device/misc_kernels.cuh
index 5f9e5b8..0dfef23 100644
--- a/src/concurrent_map/device/misc_kernels.cuh
+++ b/src/concurrent_map/device/misc_kernels.cuh
@@ -21,9 +21,9 @@
  * slabs per bucket. The final results per bucket is stored in d_pairs_count_result and
  * d_slabs_count_result arrays respectively
  */
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void bucket_count_kernel(
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash,
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash,
     uint32_t* d_pairs_count_result,
     uint32_t* d_slabs_count_result,
     uint32_t num_buckets) {
diff --git a/src/concurrent_map/device/search_kernel.cuh b/src/concurrent_map/device/search_kernel.cuh
index c5f38e6..e01ec56 100644
--- a/src/concurrent_map/device/search_kernel.cuh
+++ b/src/concurrent_map/device/search_kernel.cuh
@@ -17,12 +17,12 @@
 #pragma once
 
 //=== Individual search kernel:
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void search_table(
     KeyT* d_queries,
     ValueT* d_results,
     uint32_t num_queries,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
@@ -49,12 +49,12 @@ __global__ void search_table(
 }
 
 //=== Bulk search kernel:
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void search_table_bulk(
     KeyT* d_queries,
     ValueT* d_results,
     uint32_t num_queries,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
diff --git a/src/concurrent_map/warp/count.cuh b/src/concurrent_map/warp/count.cuh
index 4e9a70f..e3d0ede 100644
--- a/src/concurrent_map/warp/count.cuh
+++ b/src/concurrent_map/warp/count.cuh
@@ -19,9 +19,9 @@
 //================================================
 // Individual Count Unit:
 //================================================
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ void
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::countKey(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::countKey(
     bool& to_be_searched,
     const uint32_t& laneId,
     const KeyT& myKey,
diff --git a/src/concurrent_map/warp/delete.cuh b/src/concurrent_map/warp/delete.cuh
index d8d6685..dc65ced 100644
--- a/src/concurrent_map/warp/delete.cuh
+++ b/src/concurrent_map/warp/delete.cuh
@@ -16,9 +16,9 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ bool
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::deleteKey(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::deleteKey(
     bool& to_be_deleted,
     const uint32_t& laneId,
     const KeyT& myKey,
diff --git a/src/concurrent_map/warp/insert.cuh b/src/concurrent_map/warp/insert.cuh
index 124b863..2a88bd7 100644
--- a/src/concurrent_map/warp/insert.cuh
+++ b/src/concurrent_map/warp/insert.cuh
@@ -21,16 +21,16 @@
  * it is assumed all threads within a warp are present and collaborating with
  * each other with a warp-cooperative work sharing (WCWS) strategy.
  */
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ void
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::insertPair(
     /*bool& mySuccess,*/
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
     const ValueT& myValue,
     const uint32_t bucket_id,
-    AllocatorContextT& local_allocator_ctx) {
+    SlabAllocLightContext<log_num_mem_blocks, num_super_blocks>& local_allocator_ctx) {
   using SlabHashT = ConcurrentMapT<KeyT, ValueT>;
   uint32_t work_queue = 0;
   uint32_t last_work_queue = 0;
@@ -110,16 +110,16 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPair(
  * each other with a warp-cooperative work sharing (WCWS) strategy.
  * returns true only if a new key was inserted into the hash table
  */
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ bool
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::insertPairUnique(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::insertPairUnique(
     int& mySuccess,
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
     const ValueT& myValue,
     const uint32_t bucket_id,
-    AllocatorContextT& local_allocator_ctx) {
+    SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_ctx) {
 
   using SlabHashT = ConcurrentMapT<KeyT, ValueT>;
   uint32_t work_queue = 0;
diff --git a/src/concurrent_map/warp/search.cuh b/src/concurrent_map/warp/search.cuh
index de490de..0bb39be 100644
--- a/src/concurrent_map/warp/search.cuh
+++ b/src/concurrent_map/warp/search.cuh
@@ -19,9 +19,9 @@
 //================================================
 // Individual Search Unit:
 //================================================
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ void
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchKey(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::searchKey(
     bool& to_be_searched,
     const uint32_t& laneId,
     const KeyT& myKey,
@@ -73,9 +73,9 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchKey(
 //================================================
 // Bulk Search Unit:
 //================================================
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ void
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::searchKeyBulk(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::searchKeyBulk(
     const uint32_t& laneId,
     const KeyT& myKey,
     ValueT& myValue,
diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh
index 3a0fd6a..9510ef7 100644
--- a/src/concurrent_set/cset_class.cuh
+++ b/src/concurrent_set/cset_class.cuh
@@ -21,8 +21,8 @@
  * used at runtime. This class does not own the allocated memory on the gpu
  * (i.e., d_table_)
  */
-template <typename KeyT, typename ValueT>
-class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> {
  public:
   // fixed known parameters:
   static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u;
@@ -36,7 +36,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
 
 #pragma hd_warning_disable
   __host__ __device__ GpuSlabHashContext(
-      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>& rhs) {
+      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>& rhs) {
     num_buckets_ = rhs.getNumBuckets();
     hash_x_ = rhs.getHashX();
     hash_y_ = rhs.getHashY();
@@ -57,7 +57,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
                                const uint32_t hash_x,
                                const uint32_t hash_y,
                                int8_t* d_table,
-                               AllocatorContextT* allocator_ctx) {
+                               SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>* allocator_ctx) {
     num_buckets_ = num_buckets;
     hash_x_ = hash_x;
     hash_y_ = hash_y;
@@ -65,7 +65,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
     global_allocator_ctx_ = *allocator_ctx;
   }
 
-  __device__ __host__ __forceinline__ AllocatorContextT& getAllocatorContext() {
+  __device__ __host__ __forceinline__ SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& getAllocatorContext() {
     return global_allocator_ctx_;
   }
 
@@ -88,7 +88,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
                                             const uint32_t& laneId,
                                             const KeyT& myKey,
                                             const uint32_t bucket_id,
-                                            AllocatorContextT& local_allocator_context);
+                                            SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_context);
 
   // threads in a warp cooeparte with each other to search for keys
   // if found, it returns the true, else false
@@ -124,7 +124,7 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
   }
 
   __device__ __forceinline__ SlabAllocAddressT
-  allocateSlab(AllocatorContextT& local_allocator_ctx, const uint32_t& laneId) {
+  allocateSlab(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_ctx, const uint32_t& laneId) {
     return local_allocator_ctx.warpAllocate(laneId);
   }
 
@@ -139,14 +139,14 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
   uint32_t hash_y_;
   typename ConcurrentSetT<KeyT>::SlabTypeT* d_table_;
   // a copy of dynamic allocator's context to be used on the GPU
-  AllocatorContextT global_allocator_ctx_;
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> global_allocator_ctx_;
 };
 
 /*
  * This class owns the allocated memory for the hash table
  */
-template <typename KeyT, typename ValueT>
-class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> {
  private:
   // fixed known parameters:
   static constexpr uint32_t BLOCKSIZE_ = 128;
@@ -168,16 +168,16 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
 
   // slab hash context, contains everything that a GPU application needs to be
   // able to use this data structure
-  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> gpu_context_;
+  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> gpu_context_;
 
   // const pointer to an allocator that all instances of slab hash are going to
   // use. The allocator itself is not owned by this class
-  DynamicAllocatorT* dynamic_allocator_;
+  SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>* dynamic_allocator_;
   uint32_t device_idx_;
 
  public:
   GpuSlabHash(const uint32_t num_buckets,
-              DynamicAllocatorT* dynamic_allocator,
+              SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>* dynamic_allocator,
               uint32_t device_idx,
               const time_t seed = 0,
               const bool identity_hash = false)
@@ -197,7 +197,7 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
     CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
 
     slab_unit_size_ =
-        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap>::getSlabUnitSize();
+        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::getSlabUnitSize();
 
     // allocating initial buckets:
     CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_));
@@ -228,7 +228,7 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet> {
   // returns some debug information about the slab hash
   std::string to_string();
   double computeLoadFactor(int flag) {}
-  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>& getSlabHashContext() {
+  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>& getSlabHashContext() {
     return gpu_context_;
   }
 
diff --git a/src/concurrent_set/cset_helper_kernels.cuh b/src/concurrent_set/cset_helper_kernels.cuh
index 211f7f7..3c4c3a2 100644
--- a/src/concurrent_set/cset_helper_kernels.cuh
+++ b/src/concurrent_set/cset_helper_kernels.cuh
@@ -16,11 +16,11 @@
 
 #pragma once
 namespace cset {
-template <typename KeyT>
+template <typename KeyT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void build_table_kernel(
     KeyT* d_key,
     uint32_t num_keys,
-    GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet> slab_hash) {
+    GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
@@ -29,7 +29,7 @@ __global__ void build_table_kernel(
   }
 
   // initializing the memory allocator on each warp:
-  AllocatorContextT local_allocator_ctx(slab_hash.getAllocatorContext());
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
   KeyT myKey = 0;
@@ -46,12 +46,12 @@ __global__ void build_table_kernel(
 }
 
 //=== Individual search kernel:
-template <typename KeyT>
+template <typename KeyT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void search_table(
     KeyT* d_queries,
     KeyT* d_results,
     uint32_t num_queries,
-    GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet> slab_hash) {
+    GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
diff --git a/src/concurrent_set/cset_implementation.cuh b/src/concurrent_set/cset_implementation.cuh
index d189054..dfa4375 100644
--- a/src/concurrent_set/cset_implementation.cuh
+++ b/src/concurrent_set/cset_implementation.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::buildBulk(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
@@ -28,8 +28,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::buildBulk(
       <<<num_blocks, BLOCKSIZE_>>>(d_key, num_keys, gpu_context_);
 }
 
-template <typename KeyT, typename ValueT>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::searchIndividual(
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>::searchIndividual(
     KeyT* d_query,
     ValueT* d_result,
     uint32_t num_queries) {
@@ -39,8 +39,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::searchIndividual(
       <<<num_blocks, BLOCKSIZE_>>>(d_query, d_result, num_queries, gpu_context_);
 }
 
-template <typename KeyT, typename ValueT>
-std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::to_string() {
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
+std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>::to_string() {
   std::string result;
   result += " ==== GpuSlabHash: \n";
   result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n";
diff --git a/src/concurrent_set/cset_warp_operations.cuh b/src/concurrent_set/cset_warp_operations.cuh
index 5211c75..3b1240a 100644
--- a/src/concurrent_set/cset_warp_operations.cuh
+++ b/src/concurrent_set/cset_warp_operations.cuh
@@ -16,14 +16,14 @@
 
 #pragma once
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ bool
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::insertKey(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>::insertKey(
     bool& to_be_inserted,
     const uint32_t& laneId,
     const KeyT& myKey,
     const uint32_t bucket_id,
-    AllocatorContextT& local_allocator_ctx) {
+    SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_ctx) {
   using SlabHashT = ConcurrentSetT<KeyT>;
   uint32_t work_queue = 0;
   uint32_t last_work_queue = 0;
@@ -91,9 +91,9 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::insertKey(
 }
 
 // ========
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ bool
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::searchKey(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>::searchKey(
     bool& to_be_searched,
     const uint32_t& laneId,
     const KeyT& myKey,
@@ -138,9 +138,9 @@ GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::searchKey(
   return myResult;
 }
 
-template <typename KeyT, typename ValueT>
+template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __device__ __forceinline__ bool
-GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet>::searchKeyBulk(
+GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>::searchKeyBulk(
     const uint32_t& laneId,
     const KeyT& myKey,
     const uint32_t bucket_id) {}
\ No newline at end of file
diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh
index 03930b0..665088c 100644
--- a/src/gpu_hash_table.cuh
+++ b/src/gpu_hash_table.cuh
@@ -21,7 +21,8 @@
  * This class acts as a helper class to simplify simulations around different
  * kinds of slab hash implementations
  */
-template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT>
+template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT, 
+          uint32_t log_num_mem_blocks=9, uint32_t num_super_blocks=1>
 class gpu_hash_table {
  private:
   uint32_t max_keys_;
@@ -32,10 +33,13 @@ class gpu_hash_table {
 
  public:
   // Slab hash invariant
-  GpuSlabHash<KeyT, ValueT, SlabHashT>* slab_hash_;
+  GpuSlabHash<KeyT, ValueT, SlabHashT, log_num_mem_blocks, num_super_blocks>* slab_hash_;
+
+  SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>* dynamic_allocator_;
 
   // the dynamic allocator that is being used for slab hash
-  DynamicAllocatorT* dynamic_allocator_;
+  //DynamicAllocatorT* dynamic_allocator_;
+
 
   uint32_t device_idx_;
 
@@ -77,10 +81,11 @@ class gpu_hash_table {
     //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_));
     
     // allocate an initialize the allocator:
-    dynamic_allocator_ = new DynamicAllocatorT();
+    //dynamic_allocator_ = new DynamicAllocatorT();
+    dynamic_allocator_ = new SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>;
 
     // slab hash:
-    slab_hash_ = new GpuSlabHash<KeyT, ValueT, SlabHashT>(
+    slab_hash_ = new GpuSlabHash<KeyT, ValueT, SlabHashT, log_num_mem_blocks, num_super_blocks>(
         num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_);
     if (verbose) {
       std::cout << slab_hash_->to_string() << std::endl;
diff --git a/src/slab_hash_global.cuh b/src/slab_hash_global.cuh
index b5fa3b6..5ee1ea1 100644
--- a/src/slab_hash_global.cuh
+++ b/src/slab_hash_global.cuh
@@ -119,10 +119,10 @@ class PhaseConcurrentMapT {
 };
 
 // the main class to be specialized for different types of hash tables
-template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT>
+template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 class GpuSlabHash;
 
-template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT>
+template <typename KeyT, typename ValueT, SlabHashTypeT SlabHashT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 class GpuSlabHashContext;
 
 // The custom allocator that is being used for this code:
diff --git a/src/slab_iterator.cuh b/src/slab_iterator.cuh
index 3cd0586..35199e6 100644
--- a/src/slab_iterator.cuh
+++ b/src/slab_iterator.cuh
@@ -19,12 +19,12 @@
 // a forward iterator for the slab hash data structure:
 // currently just specialized for concurrent set
 // TODO implement for other types
-template <typename KeyT>
+template <typename KeyT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 class SlabIterator {
  public:
   using SlabHashT = ConcurrentSetT<KeyT>;
 
-  GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet>& slab_hash_;
+  GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>& slab_hash_;
 
   // current position of the iterator
   KeyT* cur_ptr_;
@@ -35,7 +35,7 @@ class SlabIterator {
   // initialize the iterator with the first bucket's pointer address of the slab
   // hash
   __host__ __device__
-  SlabIterator(GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet>& slab_hash)
+  SlabIterator(GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>& slab_hash)
       : slab_hash_(slab_hash)
       , cur_ptr_(reinterpret_cast<KeyT*>(slab_hash_.getDeviceTablePointer()))
       , cur_size_(slab_hash_.getNumBuckets() * SlabHashT::BASE_UNIT_SIZE)
diff --git a/test/iterator_test.cu b/test/iterator_test.cu
index bb0461f..7b5b9ee 100644
--- a/test/iterator_test.cu
+++ b/test/iterator_test.cu
@@ -28,9 +28,9 @@
 #define DEVICE_ID 0
 //=======================================
 
-template <typename KeyT>
+template <typename KeyT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
 __global__ void print_table(
-    GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet> slab_hash) {
+    GpuSlabHashContext<KeyT, KeyT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t wid = tid >> 5;
   uint32_t laneId = threadIdx.x & 0x1F;
@@ -44,7 +44,7 @@ __global__ void print_table(
 
   if (tid == 0) {
     printf(" == Printing the base array\n");
-    SlabIterator<KeyT> iter(slab_hash);
+    SlabIterator<KeyT, log_num_mem_blocks, num_super_blocks> iter(slab_hash);
     for (int i = 0; i < iter.cur_size_; i++) {
       if ((i & 0x1F) == 0)
         printf(" == bucket %d:\n", i >> 5);

From 10c00747ab97e689074e956dc1648ef7d14c9690 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sun, 18 Apr 2021 17:11:31 -0400
Subject: [PATCH 56/61] user configurable threshold resizing lf

---
 src/concurrent_map/cmap_class.cuh          | 9 ++++++---
 src/concurrent_map/cmap_implementation.cuh | 2 +-
 src/concurrent_set/cset_class.cuh          | 3 ++-
 src/gpu_hash_table.cuh                     | 9 ++++++---
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index d89c5ef..542f3b0 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -222,21 +222,24 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks
 
   // const pointer to an allocator that all instances of slab hash are going to
   // use. The allocator itself is not owned by this class
-  //DynamicAllocatorT* dynamic_allocator_;
   SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1> *dynamic_allocator_;
   uint32_t device_idx_;
 
+  float thresh_lf_;
+
  public:
   GpuSlabHash(const uint32_t num_buckets,
               SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>* dynamic_allocator,
               uint32_t device_idx,
               const time_t seed = 0,
-              const bool identity_hash = false)
+              const bool identity_hash = false,
+              float thresh_lf = 0.60)
       : num_buckets_(num_buckets)
       , d_table_(nullptr)
       , slab_unit_size_(0)
       , dynamic_allocator_(dynamic_allocator)
-      , device_idx_(device_idx) {
+      , device_idx_(device_idx)
+      , thresh_lf_(thresh_lf) {
     assert(dynamic_allocator && "No proper dynamic allocator attached to the slab hash.");
     assert(sizeof(typename ConcurrentMapT<KeyT, ValueT>::SlabTypeT) ==
                (WARP_WIDTH_ * sizeof(uint32_t)) &&
diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index a76ec7d..0016e54 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -32,7 +32,7 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blo
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
-  if(finalSlabLoadFactor >= 0.90) {
+  if(finalSlabLoadFactor >= thresh_lf_) {
     numResizes = 1;
   }
 
diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh
index 9510ef7..8c86734 100644
--- a/src/concurrent_set/cset_class.cuh
+++ b/src/concurrent_set/cset_class.cuh
@@ -180,7 +180,8 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks
               SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>* dynamic_allocator,
               uint32_t device_idx,
               const time_t seed = 0,
-              const bool identity_hash = false)
+              const bool identity_hash = false,
+              float thresh_lf = 0.60)
       : num_buckets_(num_buckets)
       , d_table_(nullptr)
       , slab_unit_size_(0)
diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh
index 665088c..0300b2d 100644
--- a/src/gpu_hash_table.cuh
+++ b/src/gpu_hash_table.cuh
@@ -50,16 +50,20 @@ class gpu_hash_table {
   ValueT* d_result_;
   uint32_t* d_count_;
 
+  float thresh_lf_;
+
   gpu_hash_table(uint32_t max_keys,
                  uint32_t num_buckets,
                  const uint32_t device_idx,
                  const int64_t seed,
                  const bool req_values = true,
                  const bool identity_hash = false,
-                 const bool verbose = false)
+                 const bool verbose = false,
+                 float thresh_lf = 0.60)
       : max_keys_(max_keys)
       , num_buckets_(num_buckets)
       , seed_(seed)
+      , thresh_lf_(thresh_lf)
       , req_values_(req_values)
       , slab_hash_(nullptr)
       , identity_hash_(identity_hash)
@@ -81,12 +85,11 @@ class gpu_hash_table {
     //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_));
     
     // allocate an initialize the allocator:
-    //dynamic_allocator_ = new DynamicAllocatorT();
     dynamic_allocator_ = new SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>;
 
     // slab hash:
     slab_hash_ = new GpuSlabHash<KeyT, ValueT, SlabHashT, log_num_mem_blocks, num_super_blocks>(
-        num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_);
+        num_buckets_, dynamic_allocator_, device_idx_, seed_, identity_hash_, thresh_lf_);
     if (verbose) {
       std::cout << slab_hash_->to_string() << std::endl;
     }

From 148f1b8da3c6cccc33da148bcd11b6c76dd0791e Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sun, 18 Apr 2021 18:31:22 -0400
Subject: [PATCH 57/61] minor changes

---
 src/gpu_hash_table.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu_hash_table.cuh b/src/gpu_hash_table.cuh
index 0300b2d..14de391 100644
--- a/src/gpu_hash_table.cuh
+++ b/src/gpu_hash_table.cuh
@@ -85,7 +85,7 @@ class gpu_hash_table {
     //CHECK_CUDA_ERROR(cudaMalloc((void**)&d_count_, sizeof(uint32_t) * max_keys_));
     
     // allocate an initialize the allocator:
-    dynamic_allocator_ = new SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>;
+    dynamic_allocator_ = new SlabAllocLight<log_num_mem_blocks, num_super_blocks, 1>(num_buckets);
 
     // slab hash:
     slab_hash_ = new GpuSlabHash<KeyT, ValueT, SlabHashT, log_num_mem_blocks, num_super_blocks>(

From e701dbf6a5bd4770c970d90da5683e6a2b4069bb Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sun, 18 Apr 2021 18:52:01 -0400
Subject: [PATCH 58/61] SlabAlloc changes

---
 SlabAlloc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SlabAlloc b/SlabAlloc
index 95cb758..13983dd 160000
--- a/SlabAlloc
+++ b/SlabAlloc
@@ -1 +1 @@
-Subproject commit 95cb7581b33ba13b29db4429a1f2213241bb1d47
+Subproject commit 13983dd0f92d2ef9b9079651d9a28ffe6da74a43

From b44ccde7b122d0c20f6a0b49e201c97f746760b9 Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sun, 18 Apr 2021 23:22:45 -0400
Subject: [PATCH 59/61] style fixes

---
 src/concurrent_map/cmap_implementation.cuh | 33 +++++++++++++---------
 src/concurrent_map/device/build.cuh        | 16 +++++------
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/src/concurrent_map/cmap_implementation.cuh b/src/concurrent_map/cmap_implementation.cuh
index 0016e54..80890a0 100644
--- a/src/concurrent_map/cmap_implementation.cuh
+++ b/src/concurrent_map/cmap_implementation.cuh
@@ -23,12 +23,12 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::checkForPreemptiveResize(uint32_t keysAdded) {
+uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                     log_num_mem_blocks, num_super_blocks>::checkForPreemptiveResize(uint32_t keysAdded) {
   auto numSlabs = gpu_context_.getTotalNumSlabs();
   
   auto capacity = numSlabs * 16; // capacity in key-value size multiples
   auto finalNumKeys = gpu_context_.getTotalNumKeys() + keysAdded;
-  //std::cout << "finalNumKeys " << finalNumKeys << std::endl;
   auto finalSlabLoadFactor = (float) (finalNumKeys) / capacity;
   auto numResizes = 0;
 
@@ -40,7 +40,8 @@ uint32_t GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blo
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::buildBulk(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::buildBulk(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys) {
@@ -53,7 +54,6 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
   }
 
   // calling the kernel for bulk build:
-  //int num_successes = 0;
   build_table_kernel<KeyT, ValueT>
       <<<num_blocks, BLOCKSIZE_>>>(d_key, d_value, num_keys, gpu_context_);
   CHECK_CUDA_ERROR(cudaDeviceSynchronize());
@@ -64,7 +64,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::buildBulkWithUniqueKeys(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::buildBulkWithUniqueKeys(
   KeyT* d_key,
   ValueT* d_value,
   uint32_t num_keys) {
@@ -88,11 +89,11 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
   // now that the bulk insert has completed successfully, we can
   // update the total number of keys in the table
   gpu_context_.updateTotalNumKeys(*num_successes);
-  //std::cout << "num_successes: " << *num_successes << std::endl;
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::searchIndividual(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::searchIndividual(
     KeyT* d_query,
     ValueT* d_result,
     uint32_t num_queries) {
@@ -103,7 +104,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::searchBulk(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::searchBulk(
     KeyT* d_query,
     ValueT* d_result,
     uint32_t num_queries) {
@@ -114,7 +116,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::countIndividual(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::countIndividual(
     KeyT* d_query,
     uint32_t* d_count,
     uint32_t num_queries) {
@@ -125,7 +128,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::deleteIndividual(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::deleteIndividual(
     KeyT* d_key,
     uint32_t num_keys) {
   CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
@@ -136,7 +140,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 
 // perform a batch of (a mixture of) updates/searches
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::batchedOperation(
+void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                 log_num_mem_blocks, num_super_blocks>::batchedOperation(
     KeyT* d_key,
     ValueT* d_result,
     uint32_t num_ops) {
@@ -147,7 +152,8 @@ void GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks,
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::to_string() {
+std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                        log_num_mem_blocks, num_super_blocks>::to_string() {
   std::string result;
   result += " ==== GpuSlabHash: \n";
   result += "\t Running on device \t\t " + std::to_string(device_idx_) + "\n";
@@ -162,7 +168,8 @@ std::string GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_
 }
 
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-double GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::computeLoadFactor(
+double GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                   log_num_mem_blocks, num_super_blocks>::computeLoadFactor(
     int flag = 0) {
   uint32_t* h_bucket_pairs_count = new uint32_t[num_buckets_];
   uint32_t* d_bucket_pairs_count;
diff --git a/src/concurrent_map/device/build.cuh b/src/concurrent_map/device/build.cuh
index 3f00207..3530d21 100644
--- a/src/concurrent_map/device/build.cuh
+++ b/src/concurrent_map/device/build.cuh
@@ -23,7 +23,8 @@ __global__ void build_table_kernel(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+    log_num_mem_blocks, num_super_blocks> slab_hash) {
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
@@ -31,7 +32,8 @@ __global__ void build_table_kernel(
     return;
   }
 
-  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> local_allocator_ctx(slab_hash.getAllocatorContext());
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> 
+    local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
   KeyT myKey = 0;
@@ -55,7 +57,8 @@ __global__ void build_table_with_unique_keys_kernel(
     KeyT* d_key,
     ValueT* d_value,
     uint32_t num_keys,
-    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> slab_hash) {
+    GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+    log_num_mem_blocks, num_super_blocks> slab_hash) {
   
   typedef cub::BlockReduce<std::size_t, 128> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -63,11 +66,8 @@ __global__ void build_table_with_unique_keys_kernel(
   uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   uint32_t laneId = threadIdx.x & 0x1F;
 
-  //if ((tid - laneId) >= num_keys) {
-  //  return;
-  //}
-
-  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> local_allocator_ctx(slab_hash.getAllocatorContext());
+  SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1> 
+    local_allocator_ctx(slab_hash.getAllocatorContext());
   local_allocator_ctx.initAllocator(tid, laneId);
 
   KeyT myKey = 0;

From c956e08ee147bdcce4801a9c82ee7030d0130dcf Mon Sep 17 00:00:00 2001
From: Nicolas-Iskos <nki@andrew.cmu.edu>
Date: Sun, 18 Apr 2021 23:28:57 -0400
Subject: [PATCH 60/61] style fixes

---
 src/concurrent_map/cmap_class.cuh | 27 ++++++++++++++++++---------
 src/concurrent_set/cset_class.cuh | 27 ++++++++++++++++++---------
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/src/concurrent_map/cmap_class.cuh b/src/concurrent_map/cmap_class.cuh
index 542f3b0..bbd74b1 100644
--- a/src/concurrent_map/cmap_class.cuh
+++ b/src/concurrent_map/cmap_class.cuh
@@ -23,7 +23,8 @@
  * (i.e., d_table_)
  */
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> {
+class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                         log_num_mem_blocks, num_super_blocks> {
  public:
   // fixed known parameters:
   static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u;
@@ -36,7 +37,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem
 
 #pragma hd_warning_disable
   __host__ __device__ GpuSlabHashContext(
-      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>& rhs) {
+      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+      log_num_mem_blocks, num_super_blocks>& rhs) {
     num_buckets_ = rhs.getNumBuckets();
     total_num_slabs_ = rhs.getTotalNumSlabs();
     total_num_keys_ = rhs.getTotalNumKeys();
@@ -61,7 +63,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem
                                const uint32_t hash_x,
                                const uint32_t hash_y,
                                int8_t* d_table,
-                               SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>* allocator_ctx) {
+                               SlabAllocLightContext<log_num_mem_blocks, 
+                               num_super_blocks, 1>* allocator_ctx) {
     num_buckets_ = num_buckets;
     hash_x_ = hash_x;
     hash_y_ = hash_y;
@@ -72,7 +75,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem
     total_num_keys_ = 0;
   }
 
-  __host__ void updateAllocatorContext(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>* allocator_ctx) {
+  __host__ void updateAllocatorContext(SlabAllocLightContext<log_num_mem_blocks, 
+    num_super_blocks, 1>* allocator_ctx) {
     global_allocator_ctx_ = *allocator_ctx;
     total_num_slabs_ = num_buckets_ + global_allocator_ctx_.getNumSlabsInPool();
   }
@@ -81,7 +85,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem
     total_num_keys_ += keysAdded;
   }
 
-  __device__ __host__ __forceinline__ SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& getAllocatorContext() {
+  __device__ __host__ __forceinline__ SlabAllocLightContext<log_num_mem_blocks, 
+    num_super_blocks, 1>& getAllocatorContext() {
     return global_allocator_ctx_;
   }
 
@@ -108,7 +113,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem
                                              const KeyT& myKey,
                                              const ValueT& myValue,
                                              const uint32_t bucket_id,
-                                             SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_context);
+                                             SlabAllocLightContext<log_num_mem_blocks, 
+                                             num_super_blocks, 1>& local_allocator_context);
 
   // threads in a warp cooperate with each other to insert a unique key (and its value)
   // into the slab hash
@@ -172,7 +178,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem
   }
 
   __device__ __forceinline__ SlabAllocAddressT
-  allocateSlab(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_ctx, const uint32_t& laneId) {
+  allocateSlab(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& 
+    local_allocator_ctx, const uint32_t& laneId) {
     return local_allocator_ctx.warpAllocate(laneId);
   }
 
@@ -218,7 +225,8 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks
 
   // slab hash context, contains everything that a GPU application needs to be
   // able to use this data structure
-  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks> gpu_context_;
+  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+  log_num_mem_blocks, num_super_blocks> gpu_context_;
 
   // const pointer to an allocator that all instances of slab hash are going to
   // use. The allocator itself is not owned by this class
@@ -251,7 +259,8 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks
     CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
 
     slab_unit_size_ =
-        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::getSlabUnitSize();
+        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+        log_num_mem_blocks, num_super_blocks>::getSlabUnitSize();
 
     // allocating initial buckets:
     CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_));
diff --git a/src/concurrent_set/cset_class.cuh b/src/concurrent_set/cset_class.cuh
index 8c86734..518d556 100644
--- a/src/concurrent_set/cset_class.cuh
+++ b/src/concurrent_set/cset_class.cuh
@@ -22,7 +22,8 @@
  * (i.e., d_table_)
  */
 template <typename KeyT, typename ValueT, uint32_t log_num_mem_blocks, uint32_t num_super_blocks>
-class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> {
+class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, 
+                         log_num_mem_blocks, num_super_blocks> {
  public:
   // fixed known parameters:
   static constexpr uint32_t PRIME_DIVISOR_ = 4294967291u;
@@ -36,7 +37,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem
 
 #pragma hd_warning_disable
   __host__ __device__ GpuSlabHashContext(
-      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>& rhs) {
+      GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, 
+                         log_num_mem_blocks, num_super_blocks>& rhs) {
     num_buckets_ = rhs.getNumBuckets();
     hash_x_ = rhs.getHashX();
     hash_y_ = rhs.getHashY();
@@ -57,7 +59,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem
                                const uint32_t hash_x,
                                const uint32_t hash_y,
                                int8_t* d_table,
-                               SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>* allocator_ctx) {
+                               SlabAllocLightContext<log_num_mem_blocks, 
+                               num_super_blocks, 1>* allocator_ctx) {
     num_buckets_ = num_buckets;
     hash_x_ = hash_x;
     hash_y_ = hash_y;
@@ -65,7 +68,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem
     global_allocator_ctx_ = *allocator_ctx;
   }
 
-  __device__ __host__ __forceinline__ SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& getAllocatorContext() {
+  __device__ __host__ __forceinline__ SlabAllocLightContext<log_num_mem_blocks, 
+    num_super_blocks, 1>& getAllocatorContext() {
     return global_allocator_ctx_;
   }
 
@@ -88,7 +92,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem
                                             const uint32_t& laneId,
                                             const KeyT& myKey,
                                             const uint32_t bucket_id,
-                                            SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_context);
+                                            SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& 
+                                            local_allocator_context);
 
   // threads in a warp cooeparte with each other to search for keys
   // if found, it returns the true, else false
@@ -124,7 +129,8 @@ class GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem
   }
 
   __device__ __forceinline__ SlabAllocAddressT
-  allocateSlab(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& local_allocator_ctx, const uint32_t& laneId) {
+  allocateSlab(SlabAllocLightContext<log_num_mem_blocks, num_super_blocks, 1>& 
+    local_allocator_ctx, const uint32_t& laneId) {
     return local_allocator_ctx.warpAllocate(laneId);
   }
 
@@ -168,7 +174,8 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks
 
   // slab hash context, contains everything that a GPU application needs to be
   // able to use this data structure
-  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks> gpu_context_;
+  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, 
+                     log_num_mem_blocks, num_super_blocks> gpu_context_;
 
   // const pointer to an allocator that all instances of slab hash are going to
   // use. The allocator itself is not owned by this class
@@ -198,7 +205,8 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks
     CHECK_CUDA_ERROR(cudaSetDevice(device_idx_));
 
     slab_unit_size_ =
-        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, log_num_mem_blocks, num_super_blocks>::getSlabUnitSize();
+        GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentMap, 
+                           log_num_mem_blocks, num_super_blocks>::getSlabUnitSize();
 
     // allocating initial buckets:
     CHECK_CUDA_ERROR(cudaMalloc((void**)&d_table_, slab_unit_size_ * num_buckets_));
@@ -229,7 +237,8 @@ class GpuSlabHash<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks
   // returns some debug information about the slab hash
   std::string to_string();
   double computeLoadFactor(int flag) {}
-  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, log_num_mem_blocks, num_super_blocks>& getSlabHashContext() {
+  GpuSlabHashContext<KeyT, ValueT, SlabHashTypeT::ConcurrentSet, 
+                     log_num_mem_blocks, num_super_blocks>& getSlabHashContext() {
     return gpu_context_;
   }
 

From cd2319d39c37b74c514abba2d469a1e3b33872f4 Mon Sep 17 00:00:00 2001
From: Nicolas Iskos <46684817+Nicolas-Iskos@users.noreply.github.com>
Date: Tue, 20 Apr 2021 08:53:42 -0400
Subject: [PATCH 61/61] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2c8c171..8a20910 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # SlabHash
-A warp-oriented dynamic hash table for GPUs
+A warp-oriented dynamic hash table for GPUs. 
+
+This fork of SlabHash is modified to allow for dynamic growth of the slab pool, allowing the slab hash to increase its total memory footprint as keys are inserted during run time.
 
 ## Publication:
 This library is based on the original slab hash paper, initially proposed in the following IPDPS'18 paper:
@@ -489,4 +491,4 @@ init lf         final lf        num buckets     init build rate(M/s)    concurre
 0.59            0.58            80660           501.725         639.850
 0.64            0.62            69906           493.702         601.822
 
-```
\ No newline at end of file
+```