From d3a1356c2de3a61d07a716afdd5ff03fb3aba2b4 Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Tue, 7 Nov 2023 04:14:01 +0000
Subject: [PATCH] clear finished requests

---
 src/turbomind/models/llama/LlamaBatch.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index 1afa59182..9ca3fdc76 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -144,6 +144,7 @@ auto LlamaBatch<T>::ProcessStopRequests(const Requests& requests) -> std::vector<Signal>
             if (state_->requests[i] && state_->requests[i]->id == r->id) {
                 ec = 0;
                 CompleteRequest(i, true, r->end_flag);
+                state_->requests[i].reset();
                 break;
             }
         }
@@ -174,7 +175,8 @@ void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
 {
     auto& state = *incoming_;
 
-    state.size = state.active_size = 0;
+    FT_CHECK(state.size == 0);
+    FT_CHECK(state.active_size == 0);
 
     int i = 0;
     for (const auto& r : requests) {
@@ -429,8 +431,9 @@ bool LlamaBatch<T>::Initialize()
     }
 
     // clear incoming buffer
-    std::fill(incoming_->requests.begin(), incoming_->requests.end(), nullptr);
-    std::fill(incoming_->sequences.begin(), incoming_->sequences.end(), nullptr);
+    std::fill_n(incoming_->requests.begin(), incoming_->size, nullptr);
+    std::fill_n(incoming_->sequences.begin(), incoming_->size, nullptr);
+    incoming_->size = 0;
 
     // in case of swap-in/swap-out or there are holes in active buffer, layout of the buffers is changed
     // generation & sampling need to be re-initialized for correctness
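
---

For reference, a minimal standalone sketch of the two idioms the patch adopts.
The `Request` and `State` types below are simplified stand-ins, not the actual
TurboMind definitions:

    #include <algorithm>
    #include <cstdio>
    #include <memory>
    #include <vector>

    // Hypothetical stand-ins for TurboMind's Request and batch state; the
    // real types in src/turbomind/models/llama/ carry many more fields.
    struct Request {
        explicit Request(long id_) : id(id_) {}
        ~Request() { std::printf("request %ld released\n", id); }
        long id;
    };

    struct State {
        std::vector<std::shared_ptr<Request>> requests;
        int size = 0;  // number of slots currently in use
    };

    int main()
    {
        State incoming;
        incoming.requests.resize(8);
        incoming.requests[incoming.size++] = std::make_shared<Request>(1);
        incoming.requests[incoming.size++] = std::make_shared<Request>(2);

        // Idiom 1 (ProcessStopRequests): reset the slot of a finished
        // request so the batch drops its reference and the request is
        // released promptly rather than lingering until the slot is reused.
        incoming.requests[0].reset();

        // Idiom 2 (Initialize): only the first `size` slots can be non-null,
        // so clear exactly that prefix with std::fill_n and zero the count
        // here, instead of std::fill over the whole buffer plus re-zeroing
        // the counts later, as the old code did.
        std::fill_n(incoming.requests.begin(), incoming.size, nullptr);
        incoming.size = 0;
    }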