From afdf9a9eb86f15363c0249117d166d6b45dbb371 Mon Sep 17 00:00:00 2001 From: bhsueh Date: Fri, 8 Sep 2023 06:30:02 +0000 Subject: [PATCH] fix memory leak --- .../layers/sampling_layers/BaseSamplingLayer.cc | 16 ++++++++-------- .../ParallelGptTritonModelInstance.cc | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc b/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc index bb168847f..a334ddb2f 100644 --- a/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc +++ b/src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc @@ -44,10 +44,10 @@ void BaseSamplingLayer::allocateBuffer(size_t batch_size, Tensor top_k, Tenso reinterpret_cast(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false)); // host buffers. - temperature_ = new float[batch_size]; - repetition_penalty_ = new float[batch_size]; - min_lengths_ = new int[batch_size]; - skip_decode_ = new bool[batch_size]; + temperature_ = (float*)std::realloc((void*)temperature_, batch_size * sizeof(float)); + repetition_penalty_ = (float*)std::realloc((void*)repetition_penalty_, batch_size * sizeof(float)); + min_lengths_ = (int*)std::realloc((void*)min_lengths_, batch_size * sizeof(int)); + skip_decode_ = (bool*)std::realloc((void*)skip_decode_, batch_size * sizeof(bool)); is_allocate_buffer_ = true; } @@ -64,10 +64,10 @@ void BaseSamplingLayer::freeBuffer() allocator_->free((void**)(&min_lengths_buf_)); allocator_->free((void**)(&runtime_logits_buf_)); allocator_->free((void**)(&skip_decode_buf_)); - delete[] temperature_; - delete[] repetition_penalty_; - delete[] min_lengths_; - delete[] skip_decode_; + std::free(temperature_); + std::free(repetition_penalty_); + std::free(min_lengths_); + std::free(skip_decode_); is_allocate_buffer_ = false; } } diff --git a/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc b/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc index 2179a69d5..bd5863ece 100644 --- a/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc +++ b/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc @@ -69,8 +69,8 @@ std::unordered_map ParallelGptTritonModelInstance::c move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_); move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_); + h_total_output_lengths_ = (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); const int input_data_len = input_tensors->at("input_ids").shape[1]; - h_total_output_lengths_ = reinterpret_cast(malloc(request_batch_size * sizeof(uint32_t))); const bool continue_interactive = input_tensors->count("START") && reinterpret_cast(input_tensors->at("START").data)[0] == 0; for (int i = 0; i < request_batch_size; ++i) { @@ -293,6 +293,7 @@ void ParallelGptTritonModelInstance::freeBuffer() allocator_->free((void**)(&d_output_ctx_emb_)); allocator_->free((void**)(&d_cum_log_probs_)); allocator_->free((void**)(&d_is_finished_)); + std::free(h_total_output_lengths_); } template struct ParallelGptTritonModelInstance;