From a295fbfbb151d6ef805d49cbb39f4af708762fc9 Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Thu, 26 Oct 2023 10:22:50 +0000
Subject: [PATCH] config smem size for `batchApplyRepetitionPenalty`

---
 .../kernels/sampling_penalty_kernels.cu  |  7 +++++++
 .../sampling_layers/TopKSamplingLayer.cu | 20 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/src/turbomind/kernels/sampling_penalty_kernels.cu b/src/turbomind/kernels/sampling_penalty_kernels.cu
index 4877bdb1a0..5e5b22f940 100644
--- a/src/turbomind/kernels/sampling_penalty_kernels.cu
+++ b/src/turbomind/kernels/sampling_penalty_kernels.cu
@@ -445,11 +445,18 @@ void invokeBatchApplyRepetitionPenalty(T* logits,
     dim3   block(min(step, 1024));
     dim3   grid(local_batch_size);
     size_t smem_size = step * (sizeof(float) + sizeof(int));
+
     if (penalty_type == RepetitionPenaltyType::Additive) {
+        check_cuda_error(cudaFuncSetAttribute(batchApplyRepetitionPenalty<T, RepetitionPenaltyType::Additive>,
+                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                              smem_size));
         batchApplyRepetitionPenalty<T, RepetitionPenaltyType::Additive><<<grid, block, smem_size, stream>>>(
             logits, penalties, output_ids, batch_size, vocab_size, input_lengths, max_input_length, step);
     }
     else if (penalty_type == RepetitionPenaltyType::Multiplicative) {
+        check_cuda_error(cudaFuncSetAttribute(batchApplyRepetitionPenalty<T, RepetitionPenaltyType::Multiplicative>,
+                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                              smem_size));
         batchApplyRepetitionPenalty<T, RepetitionPenaltyType::Multiplicative><<<grid, block, smem_size, stream>>>(
             logits, penalties, output_ids, batch_size, vocab_size, input_lengths, max_input_length, step);
     }
diff --git a/src/turbomind/layers/sampling_layers/TopKSamplingLayer.cu b/src/turbomind/layers/sampling_layers/TopKSamplingLayer.cu
index 614b1a68ce..63dae19888 100644
--- a/src/turbomind/layers/sampling_layers/TopKSamplingLayer.cu
+++ b/src/turbomind/layers/sampling_layers/TopKSamplingLayer.cu
@@ -21,6 +21,7 @@
 #include "src/turbomind/kernels/sampling_topp_kernels.h"
 #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
 #include "src/turbomind/macro.h"
+#include "src/turbomind/models/llama/llama_utils.h"
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/memory_utils.h"
 
@@ -131,6 +132,20 @@ void TopKSamplingLayer<T>::freeBuffer()
     is_allocate_buffer_ = false;
 }
 
+template<typename T>
+inline static std::string format(const Tensor& t)
+{
+    std::stringstream ss;
+    const int         size = t.size();
+    const T*          ptr  = t.getPtr<T>();
+    ss << "[";
+    for (int i = 0; i < size; ++i) {
+        ss << (i ? ", " : "") << ptr[i];
+    }
+    ss << "]";
+    return ss.str();
+}
+
 template<typename T>
 void TopKSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
 {
@@ -168,6 +183,11 @@ void TopKSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
         cudaAutoCpy(runtime_top_p_buf_, runtime_top_p.getPtr<float>(), batch_size, stream_);
     }
 
+    if (isDebug()) {
+        TM_LOG_INFO("[TopKSamplingLayer] runtime_top_k: %s", format<uint>(runtime_top_k).c_str());
+        TM_LOG_INFO("[TopKSamplingLayer] runtime_top_p: %s", format<float>(runtime_top_p).c_str());
+    }
+
     dim3 block(std::min((int)batch_size, 256));
     dim3 grid(div_up((int)batch_size, (int)block.x));
     // support top_k up to 1024.
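
Note on the fix: by default, a CUDA kernel launch may request at most 48 KB of dynamic shared memory per block; anything larger must be opted into per kernel with cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, bytes) before the launch, otherwise the launch fails with cudaErrorInvalidValue. In invokeBatchApplyRepetitionPenalty, smem_size = step * (sizeof(float) + sizeof(int)) = 8 bytes per step, so once step exceeds 6144 tokens the request passes the 48 KB default, which is the failure this patch guards against. The standalone sketch below reproduces the pattern outside TurboMind; the kernel name, block size, and buffer size are illustrative assumptions, not taken from the patch:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Illustrative kernel: stages `n` floats in dynamic shared memory.
    __global__ void stageKernel(float* out, int n)
    {
        extern __shared__ float buf[];
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
            buf[i] = static_cast<float>(i);
        }
        __syncthreads();
        if (threadIdx.x == 0) {
            out[blockIdx.x] = buf[n - 1];
        }
    }

    int main()
    {
        const int    n         = 16 * 1024;          // 16K floats = 64 KB,
        const size_t smem_size = n * sizeof(float);  // above the 48 KB default

        float* out = nullptr;
        cudaMalloc(&out, sizeof(float));

        // Without this opt-in the launch below fails with cudaErrorInvalidValue.
        // (Needs a device whose per-block shared memory limit covers 64 KB,
        // e.g. sm_70 or newer.)
        cudaFuncSetAttribute(stageKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int)smem_size);

        stageKernel<<<1, 256, smem_size>>>(out, n);
        printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));

        cudaDeviceSynchronize();
        cudaFree(out);
        return 0;
    }

The attribute is per kernel, and each instantiation of the batchApplyRepetitionPenalty template (Additive vs. Multiplicative) is a distinct kernel, which is why the patch sets it separately in both branches before the corresponding launch.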