InternLM · irexyc · Sep 28, 2023
diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h
@@ -49,6 +49,12 @@ class LlamaV2 {
 
         // rank 0 sets flag to true if there are no more tasks in the request_queue
         bool should_stop = false;
+
+        void reset()  // Put the shared state back into a usable condition: queue re-opened, stop flag cleared.
+        {
+            request_queue.open();  // re-open the request queue
+            should_stop = false;  // back to the value this member is initialized with
+        }
     };
 
     ~LlamaV2();

diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h
@@ -93,6 +93,12 @@ class RequestQueue {
         cv_.notify_all();
     }
 
+    void open()  // Clear the closed_ flag of this queue; no condition-variable notification is issued here.
+    {
+        std::lock_guard<std::mutex> lock(mutex_);  // hold mutex_ for the duration of the flag update
+        closed_ = false;  // the queue is no longer marked as closed
+    }
+
 private:
     std::queue<std::shared_ptr<Request>> stop_queue_;
     std::queue<std::shared_ptr<Request>> infer_queue_;

diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -221,6 +221,10 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
     ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_);
     ft::FT_CHECK(pipeline_para.world_size_ = pipeline_para_size_);
 
+    if (rank == 0) {
+        shared_state_->reset();
+    }
+
     auto llama = std::make_unique<ft::LlamaV2<T>>(head_num_,
                                                   kv_head_num_,
                                                   size_per_head_,