From d32bdaf97e8bf4a05b88a504facd2f040d5fc524 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 14 Nov 2023 04:46:47 +0000 Subject: [PATCH] fix `finished_count` --- src/turbomind/models/llama/LlamaBatch.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 168529c52f..529174e666 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -355,7 +355,9 @@ bool LlamaBatch::Initialize() }); // all blocks are not enough to hold a single sequence - // FT_CHECK_WITH_INFO(active_end != idxs.begin(), "No enough blocks."); + if (!sequences.empty()) { + FT_CHECK_WITH_INFO(active_end != idxs.begin(), "No enough blocks."); + } // move swap-ins to the back auto swapin_beg = std::stable_partition(idxs.begin(), active_end, [&](int idx) { @@ -398,6 +400,8 @@ bool LlamaBatch::Initialize() ClearState(*incoming_); } + FT_CHECK(state_->size <= max_batch_size_); + /// Update block ptrs when there were // 1. swap-in or swap-out // 2. holes in the active buffer @@ -1399,6 +1403,8 @@ void LlamaBatch::InternalThreadEntry(int device_id) shared_state->barrier->wait(); auto modified = Initialize(); + // finished sequences are handled by `Initialize()` + finished_count = 0; ContextDecode();