diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 80e561442a..e1287f471b 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -72,6 +72,10 @@ LlamaWeight::~LlamaWeight() pre_decoder_embedding_table = nullptr; post_decoder_embedding_kernel = nullptr; + + for (auto& p : decoder_layer_weights) { + delete p; + } } template diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 57d5c9be5b..e670753701 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -249,13 +249,13 @@ std::unique_ptr> LlamaTritonModel::createSh cuda_device_prop_ptr.get()); return std::make_unique>( - LlamaTritonSharedModelInstance{std::move(llama), - shared_weights_[device_id], - std::move(allocator), + LlamaTritonSharedModelInstance{std::move(allocator), std::move(cublas_algo_map), std::move(cublas_wrapper_mutex), std::move(cublas_wrapper), std::move(cuda_device_prop_ptr), + shared_weights_[device_id], + std::move(llama), session_len_}); } diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 1713d96bef..4dff6eb24c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -29,13 +29,13 @@ namespace ft = turbomind; template struct LlamaTritonSharedModelInstance { - std::unique_ptr> llm; - std::shared_ptr> llm_weight; std::unique_ptr> allocator; std::unique_ptr cublas_algo_map; std::unique_ptr cublas_wrapper_mutex; std::unique_ptr cublas_wrapper; std::unique_ptr cuda_device_prop_ptr; + std::shared_ptr> llm_weight; + std::unique_ptr> llm; const int session_len; }; diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 4026048e31..359060ee04 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -271,6 +271,7 @@ struct AbstractTransformerModel; struct AbstractTransformerModelInstance; struct AbstractTransformerModelInstance { + virtual ~AbstractTransformerModelInstance() {} virtual std::shared_ptr> forward(std::shared_ptr> input_tensors) = 0;