diff --git a/include/merlin/core_kernels.cuh b/include/merlin/core_kernels.cuh
index dd81b814..5bacd590 100644
--- a/include/merlin/core_kernels.cuh
+++ b/include/merlin/core_kernels.cuh
@@ -927,38 +927,160 @@ __global__ void dump_kernel_v2(const Table<K, V, S>* __restrict table,
   for (size_t ii = tid; ii < search_length; ii += gridDim.x * blockDim.x) {
     size_t bkt_idx = (ii + offset) / bucket_max_size;
-    int key_idx = (ii + offset) % bucket_max_size;
-    int leading_key_idx = key_idx / TILE_SIZE * TILE_SIZE;
+    size_t key_idx = (ii + offset) % bucket_max_size;
+    size_t leading_key_idx = key_idx / TILE_SIZE * TILE_SIZE;
     Bucket<K, V, S>* bucket = &(buckets[bkt_idx]);
 
     const K key =
         (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
     S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
+
     bool match =
         (!IS_RESERVED_KEY(key)) && pred(key, score, pattern, threshold);
     unsigned int vote = g.ballot(match);
     int tile_cnt = __popc(vote);
-    int tile_offset = 0;
+    size_t tile_offset = 0;
     if (g.thread_rank() == 0) {
-      tile_offset = static_cast<int>(
-          atomicAdd(d_dump_counter, static_cast<size_t>(tile_cnt)));
+      tile_offset = atomicAdd(d_dump_counter, static_cast<size_t>(tile_cnt));
     }
     tile_offset = g.shfl(tile_offset, 0);
+    int bias_g = tile_cnt - __popc(vote >> (key_idx % TILE_SIZE));
+
+    if (match) {
+      d_key[tile_offset + bias_g] = key;
+      if (d_score) {
+        d_score[tile_offset + bias_g] = score;
+      }
+    }
+
+#pragma unroll
+    for (int r = 0; r < TILE_SIZE; r++) {
+      unsigned int biased_vote = vote >> r;
+      bool cur_match = biased_vote & 1;
+      if (cur_match) {
+        int bias = tile_cnt - __popc(biased_vote);
+        size_t cur_idx = leading_key_idx + r;
+
+        for (int j = g.thread_rank(); j < dim; j += TILE_SIZE) {
+          d_val[(tile_offset + bias) * dim + j] =
+              bucket->vectors[cur_idx * dim + j];
+        }
+      }
+    }
+  }
+}
+
+template <class K, class V, class S,
+          template <typename, typename> class PredFunctor>
+__global__ void size_if_kernel(const Table<K, V, S>* __restrict table,
+                               Bucket<K, V, S>* buckets, const K pattern,
+                               const S threshold, size_t* d_counter) {
+  extern __shared__ unsigned char s[];
+  KVM<K, V, S>* const block_tuples{reinterpret_cast<KVM<K, V, S>*>(s)};
+
+  const size_t bucket_max_size{table->bucket_max_size};
+
+  size_t local_acc = 0;
+  __shared__ size_t block_acc;
+  PredFunctor<K, S> pred;
+
+  const size_t tid{blockIdx.x * blockDim.x + threadIdx.x};
+
+  if (threadIdx.x == 0) {
+    block_acc = 0;
+  }
+  __syncthreads();
+
+  for (size_t i = tid; i < table->capacity; i += blockDim.x * gridDim.x) {
+    Bucket<K, V, S>* const bucket{&buckets[i / bucket_max_size]};
+
+    const int key_idx{static_cast<int>(i % bucket_max_size)};
+    const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)};
+    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
+
+    if ((!IS_RESERVED_KEY(key)) && pred(key, score, pattern, threshold)) {
+      ++local_acc;
+    }
+  }
+  atomicAdd(&block_acc, local_acc);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    atomicAdd(d_counter, block_acc);
+  }
+}
+
+template <class K, class V, class S,
+          template <typename, typename> class PredFunctor, int TILE_SIZE>
+__global__ void dump_kernel_v3(const Table<K, V, S>* __restrict table,
+                               Bucket<K, V, S>* buckets, const K pattern,
+                               const S threshold, K* d_key, V* __restrict d_val,
+                               S* __restrict d_score, const size_t offset,
+                               const size_t search_length,
+                               size_t* d_dump_counter) {
+  const size_t bucket_max_size = table->bucket_max_size;
+  int dim = table->dim;
+  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
+
+  PredFunctor<K, S> pred;
+
+  __shared__ int block_cnt;
+  __shared__ size_t block_offset;
+
+  size_t tid = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+
+  for (size_t ii = tid; ii < search_length; ii += gridDim.x * blockDim.x) {
+    size_t bkt_idx = (ii + offset) / bucket_max_size;
+    size_t key_idx = (ii + offset) % bucket_max_size;
+    size_t leading_key_idx = key_idx / TILE_SIZE * TILE_SIZE;
+    Bucket<K, V, S>* bucket = &(buckets[bkt_idx]);
+
+    const K key =
+        (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
+    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
+
+    if (threadIdx.x == 0) {
+      block_cnt = 0;
+    }
+    __syncthreads();
+
+    bool match =
+        (!IS_RESERVED_KEY(key)) && pred(key, score, pattern, threshold);
+    unsigned int vote = g.ballot(match);
+    int tile_cnt = __popc(vote);
+
+    int in_block_tile_offset = 0;
+    if (g.thread_rank() == 0) {
+      in_block_tile_offset =
+          atomicAdd(reinterpret_cast<int*>(&block_cnt), tile_cnt);
+    }
+    in_block_tile_offset = g.shfl(in_block_tile_offset, 0);
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+      block_offset = atomicAdd(d_dump_counter, static_cast<size_t>(block_cnt));
+    }
+    __syncthreads();
+
+    int tile_offset = block_offset + in_block_tile_offset;
+    int bias_g = tile_cnt - __popc(vote >> (key_idx % TILE_SIZE));
 
     if (match) {
-      d_key[tile_offset + key_idx] = key;
+      d_key[tile_offset + bias_g] = key;
       if (d_score) {
-        d_score[tile_offset + key_idx] = score;
+        d_score[tile_offset + bias_g] = score;
       }
     }
 
 #pragma unroll
     for (int r = 0; r < TILE_SIZE; r++) {
-      bool cur_match = vote >> r & 1;
+      unsigned int biased_vote = vote >> r;
+      bool cur_match = biased_vote & 1;
       if (cur_match) {
+        int bias = tile_cnt - __popc(biased_vote);
         int cur_idx = leading_key_idx + r;
 
         for (int j = g.thread_rank(); j < dim; j += TILE_SIZE) {
-          d_val[(tile_offset + cur_idx) * dim + j] =
+          d_val[(tile_offset + bias) * dim + j] =
               bucket->vectors[cur_idx * dim + j];
         }
       }
diff --git a/include/merlin_hashtable.cuh b/include/merlin_hashtable.cuh
index 29782dca..802a424f 100644
--- a/include/merlin_hashtable.cuh
+++ b/include/merlin_hashtable.cuh
@@ -918,6 +918,7 @@ class HashTable : public HashTableBase<K, V, S> {
     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id));
     shared_mem_size_ = deviceProp.sharedMemPerBlock;
     sm_cnt_ = deviceProp.multiProcessorCount;
+    max_threads_per_block_ = deviceProp.maxThreadsPerBlock;
     create_table<key_type, value_type, score_type>(
         &table_, allocator_, options_.dim, options_.init_capacity,
         options_.max_capacity, options_.max_hbm_for_vectors,
@@ -2621,10 +2622,10 @@ class HashTable : public HashTableBase<K, V, S> {
         offset % TILE_SIZE == 0 && n % TILE_SIZE == 0;
 
     if (match_fast_cond) {
-      int grid_size = std::min(sm_cnt_, static_cast<int>(SAFE_GET_GRID_SIZE(
-                                            n, options_.block_size)));
-      const int TILE_SIZE = 8;
-
+      int grid_size = std::min(
+          sm_cnt_ * max_threads_per_block_ / options_.block_size,
+          static_cast<int>(SAFE_GET_GRID_SIZE(n, options_.block_size)));
+      const int TILE_SIZE = 32;
       dump_kernel_v2<key_type, value_type, score_type, PredFunctor, TILE_SIZE>
           <<<grid_size, options_.block_size, 0, stream>>>(
               d_table_, table_->buckets, pattern, threshold, keys, values,
@@ -2687,6 +2688,28 @@ class HashTable : public HashTableBase<K, V, S> {
     return h_size;
   }
 
+  /**
+   * @brief Returns the number of keys that match PredFunctor.
+   *
+   * @param stream The CUDA stream that is used to execute the operation.
+   * @return The number of keys in the table that satisfy PredFunctor.
+   */
+  template
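Note: the kernels above default-construct the predicate as PredFunctor<K, S> and invoke it as pred(key, score, pattern, threshold). As a rough sketch of the interface they assume (the functor name ThresholdPredFunctor and the score-vs-threshold rule below are illustrative only and are not part of this diff), a matching predicate could look like:

// Illustrative sketch only -- not part of this diff. It shows the shape of a
// predicate usable with size_if_kernel / dump_kernel_v3: a default-constructible
// functor callable on the device as pred(key, score, pattern, threshold).
template <class K, class S>
struct ThresholdPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, const S& score,
                                             const K& pattern,
                                             const S& threshold) {
    // Keep entries whose score reaches the threshold; `pattern` is ignored
    // in this sketch.
    return score >= threshold;
  }
};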