From afdf9a9eb86f15363c0249117d166d6b45dbb371 Mon Sep 17 00:00:00 2001
From: bhsueh <bhsueh@nvidia.com>
Date: Fri, 8 Sep 2023 06:30:02 +0000
Subject: [PATCH] Fix host-buffer memory leaks by reusing allocations via std::realloc

---
 .../layers/sampling_layers/BaseSamplingLayer.cc  | 16 ++++++++--------
 .../ParallelGptTritonModelInstance.cc            |  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc b/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc
index bb168847f..a334ddb2f 100644
--- a/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc
+++ b/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc
@@ -44,10 +44,10 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
         reinterpret_cast<bool*>(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false));
 
     // host buffers.
-    temperature_        = new float[batch_size];
-    repetition_penalty_ = new float[batch_size];
-    min_lengths_        = new int[batch_size];
-    skip_decode_        = new bool[batch_size];
+    temperature_        = (float*)std::realloc((void*)temperature_, batch_size * sizeof(float));
+    repetition_penalty_ = (float*)std::realloc((void*)repetition_penalty_, batch_size * sizeof(float));
+    min_lengths_        = (int*)std::realloc((void*)min_lengths_, batch_size * sizeof(int));
+    skip_decode_        = (bool*)std::realloc((void*)skip_decode_, batch_size * sizeof(bool));
 
     is_allocate_buffer_ = true;
 }
@@ -64,10 +64,10 @@ void BaseSamplingLayer<T>::freeBuffer()
         allocator_->free((void**)(&min_lengths_buf_));
         allocator_->free((void**)(&runtime_logits_buf_));
         allocator_->free((void**)(&skip_decode_buf_));
-        delete[] temperature_;
-        delete[] repetition_penalty_;
-        delete[] min_lengths_;
-        delete[] skip_decode_;
+        std::free(temperature_);
+        std::free(repetition_penalty_);
+        std::free(min_lengths_);
+        std::free(skip_decode_);
         is_allocate_buffer_ = false;
     }
 }
diff --git a/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc b/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc
index 2179a69d5..bd5863ece 100644
--- a/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc
+++ b/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc
@@ -69,8 +69,8 @@ std::unordered_map<std::string, ft::Tensor> ParallelGptTritonModelInstance<T>::c
     move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
     move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);
 
+    h_total_output_lengths_ = (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t));
     const int input_data_len = input_tensors->at("input_ids").shape[1];
-    h_total_output_lengths_  = reinterpret_cast<uint32_t*>(malloc(request_batch_size * sizeof(uint32_t)));
     const bool continue_interactive =
         input_tensors->count("START") && reinterpret_cast<const int32_t*>(input_tensors->at("START").data)[0] == 0;
     for (int i = 0; i < request_batch_size; ++i) {
@@ -293,6 +293,7 @@ void ParallelGptTritonModelInstance<T>::freeBuffer()
     allocator_->free((void**)(&d_output_ctx_emb_));
     allocator_->free((void**)(&d_cum_log_probs_));
     allocator_->free((void**)(&d_is_finished_));
+    std::free(h_total_output_lengths_);
 }
 
 template struct ParallelGptTritonModelInstance<float>;