Commit

fix memory leak
byshiue committed Sep 8, 2023 · 1 parent f8e42aa · commit afdf9a9

Showing 2 changed files with 10 additions and 9 deletions.

@@ -44,10 +44,10 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
         reinterpret_cast<bool*>(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false));
 
     // host buffers.
-    temperature_ = new float[batch_size];
-    repetition_penalty_ = new float[batch_size];
-    min_lengths_ = new int[batch_size];
-    skip_decode_ = new bool[batch_size];
+    temperature_ = (float*)std::realloc((void*)temperature_, batch_size * sizeof(float));
+    repetition_penalty_ = (float*)std::realloc((void*)repetition_penalty_, batch_size * sizeof(float));
+    min_lengths_ = (int*)std::realloc((void*)min_lengths_, batch_size * sizeof(int));
+    skip_decode_ = (bool*)std::realloc((void*)skip_decode_, batch_size * sizeof(bool));
 
     is_allocate_buffer_ = true;
 }

@@ -64,10 +64,10 @@ void BaseSamplingLayer<T>::freeBuffer()
         allocator_->free((void**)(&min_lengths_buf_));
         allocator_->free((void**)(&runtime_logits_buf_));
         allocator_->free((void**)(&skip_decode_buf_));
-        delete[] temperature_;
-        delete[] repetition_penalty_;
-        delete[] min_lengths_;
-        delete[] skip_decode_;
+        std::free(temperature_);
+        std::free(repetition_penalty_);
+        std::free(min_lengths_);
+        std::free(skip_decode_);
         is_allocate_buffer_ = false;
     }
 }
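
Why the allocateBuffer() change fixes the leak: the function runs again whenever the layer is invoked with a new batch size, and each call with new[] overwrote the previous host pointers without a matching delete[], orphaning the old arrays. std::realloc instead reuses or resizes the existing block, and on the first call it behaves like malloc provided the members start out as nullptr. The companion switch in freeBuffer() from delete[] to std::free is required for correctness, not just symmetry: memory obtained from std::realloc must be released with std::free, and releasing it with delete[] is undefined behavior. A minimal sketch of the before/after pattern (SamplingBuffers and its method names are hypothetical, for illustration only):

    #include <cstdlib>

    // Hypothetical stand-in for the layer's host-side state.
    struct SamplingBuffers {
        // Must start as nullptr so the first realloc acts like malloc.
        float* temperature_ = nullptr;

        // Leaky version: a second call orphans the array from the first call.
        void allocate_leaky(std::size_t batch_size) {
            temperature_ = new float[batch_size];  // old pointer lost -> leak
        }

        // Fixed version: realloc reuses or resizes the same block instead.
        void allocate_fixed(std::size_t batch_size) {
            temperature_ = (float*)std::realloc(temperature_, batch_size * sizeof(float));
        }

        // std::free matches realloc; delete[] here would be undefined behavior.
        ~SamplingBuffers() { std::free(temperature_); }
    };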

@@ -69,8 +69,8 @@ std::unordered_map<std::string, ft::Tensor> ParallelGptTritonModelInstance<T>::c
     move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
     move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);
 
+    h_total_output_lengths_ = (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t));
     const int input_data_len = input_tensors->at("input_ids").shape[1];
-    h_total_output_lengths_ = reinterpret_cast<uint32_t*>(malloc(request_batch_size * sizeof(uint32_t)));
     const bool continue_interactive =
         input_tensors->count("START") && reinterpret_cast<const int32_t*>(input_tensors->at("START").data)[0] == 0;
     for (int i = 0; i < request_batch_size; ++i) {

@@ -293,6 +293,7 @@ void ParallelGptTritonModelInstance<T>::freeBuffer()
     allocator_->free((void**)(&d_output_ctx_emb_));
     allocator_->free((void**)(&d_cum_log_probs_));
     allocator_->free((void**)(&d_is_finished_));
+    std::free(h_total_output_lengths_);
 }
 
 template struct ParallelGptTritonModelInstance<float>;
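
In the Triton model instance the leak was per request: convertTritonInputs() previously called malloc for h_total_output_lengths_ on every inference request and nothing ever freed it, so the process leaked one host buffer per request served. The fix reuses the allocation across requests via std::realloc and releases it exactly once in freeBuffer() (std::free(nullptr) is a harmless no-op if no request ever arrived). One edge case the diff accepts: when std::realloc fails it returns nullptr but leaves the old block allocated, so assigning the result straight back to the member loses, and thereby leaks, the old buffer. A defensive variant, sketched under the assumption that allocation failure should be fatal here (grow_or_die is a hypothetical helper, not part of the commit):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Hypothetical helper: grow a buffer, aborting on allocation failure so a
    // failed realloc can never orphan the still-valid old block.
    static std::uint32_t* grow_or_die(std::uint32_t* buf, std::size_t new_count) {
        void* grown = std::realloc(buf, new_count * sizeof(std::uint32_t));
        if (grown == nullptr && new_count != 0) {
            std::free(buf);  // the old block is still allocated; release it
            std::fprintf(stderr, "host realloc of %zu bytes failed\n",
                         new_count * sizeof(std::uint32_t));
            std::abort();
        }
        return static_cast<std::uint32_t*>(grown);
    }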
