diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index f46d7ebe35..1afa591828 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -428,6 +428,10 @@ bool LlamaBatch::Initialize() static_assert(sizeof(uintptr_t) == sizeof(void*)); } + // clear incoming buffer + std::fill(incoming_->requests.begin(), incoming_->requests.end(), nullptr); + std::fill(incoming_->sequences.begin(), incoming_->sequences.end(), nullptr); + // in case of swap-in/swap-out or there are holes in active buffer, layout of the buffers is changed // generation & sampling need to be re-initialized for correctness return exchange || active_holes;