diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index fd724a2a19..7ed6d1039c 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -55,7 +55,7 @@ def get_prompt(self, prompt, sequence_start=True):

     @abstractmethod
     def decorate_prompt(self, prompt, sequence_start):
-        pass
+        return prompt

     @staticmethod
     def _translate_messages(messages: List):
@@ -176,8 +176,8 @@ class InternLMChat7B(BaseModel):
     def __init__(self,
                  system='',
                  user='<|User|>',
-                 eoh='',
-                 eoa='',
+                 eoh='',
+                 eoa='',
                  assistant='<|Bot|>',
                  **kwargs):
         super().__init__(**kwargs)
@@ -231,19 +231,22 @@ def messages2prompt(self, messages, sequence_start=True):
     @property
     def stop_words(self):
         """Return the stop-words' token ids."""
-        return [103027, 103028]
+        return [103028]


 @MODELS.register_module(name='internlm-chat-7b-8k')
 class InternLMChat7B8K(InternLMChat7B):
+    """Chat template and generation parameters of InternLM-Chat-7B-8K."""

-    def __init__(self, session_len=8192, **kwargs):
+    def __init__(self, session_len=8192, repetition_penalty=1.02, **kwargs):
         super(InternLMChat7B8K, self).__init__(**kwargs)
         self.session_len = session_len
+        self.repetition_penalty = repetition_penalty


 @MODELS.register_module(name='baichuan-7b')
 class Baichuan7B(BaseModel):
+    """Generation parameters of Baichuan-7B base model."""

     def __init__(self, repetition_penalty=1.1, **kwargs):
         super().__init__(**kwargs)
@@ -252,6 +255,8 @@ def __init__(self, repetition_penalty=1.1, **kwargs):

 @MODELS.register_module(name='baichuan2-7b')
 class Baichuan2_7B(BaseModel):
+    """Chat template and generation parameters of Baichuan2-7B-Base and
+    Baichuan2-7B-Chat models."""

     def __init__(self,
                  temperature=0.3,
diff --git a/lmdeploy/turbomind/tokenizer.py b/lmdeploy/turbomind/tokenizer.py
index 98db9c2b61..966f24148e 100644
--- a/lmdeploy/turbomind/tokenizer.py
+++ b/lmdeploy/turbomind/tokenizer.py
@@ -112,7 +112,7 @@ class HuggingFaceTokenizer:

     def __init__(self, model_dir: str):
         from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
-                                  LlamaTokenizerFast)
+                                  LlamaTokenizer, LlamaTokenizerFast)
         model_file = osp.join(model_dir, 'tokenizer.model')
         backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
         model_file_exists = osp.exists(model_file)
@@ -121,8 +121,9 @@ def __init__(self, model_dir: str):
                 'It may take long time to initialize the tokenizer.')
         self.model = AutoTokenizer.from_pretrained(model_dir,
                                                    trust_remote_code=True)
-        self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
-            or isinstance(self.model, CodeLlamaTokenizerFast)
+        self.need_padding = type(self.model) in [
+            LlamaTokenizer, LlamaTokenizerFast, CodeLlamaTokenizerFast
+        ]
         self._no_prefix_space_tokens = None
         # save tokenizer.json to reuse
         if not osp.exists(backend_tokenizer_file) and model_file_exists:
diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
index c2b6039d67..01592b4ac4 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
@@ -1422,7 +1422,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params
     // Trigger the stores to global memory.
     if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) {
-        int offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + tlength_circ * Dh
+        size_t offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + tlength_circ * Dh
                      + co * QK_ELTS_IN_16B + ci;

         if (!QUANT_POLICY) {
diff --git a/src/turbomind/models/llama/Barrier.h b/src/turbomind/models/llama/Barrier.h
index 6eb0df9585..ab69a746ca 100644
--- a/src/turbomind/models/llama/Barrier.h
+++ b/src/turbomind/models/llama/Barrier.h
@@ -19,9 +19,9 @@ class Barrier {
         FT_CHECK(count == 1);
     }

-    Barrier(const Barrier&) = delete;
-    Barrier& operator=(const Barrier&) = delete;
-    Barrier(Barrier&&) noexcept = delete;
+    Barrier(const Barrier&)                = delete;
+    Barrier& operator=(const Barrier&)     = delete;
+    Barrier(Barrier&&) noexcept            = delete;
     Barrier& operator=(Barrier&&) noexcept = delete;

     void wait() {}
@@ -39,9 +39,9 @@ class Barrier {
         pthread_barrier_init(&barrier_, nullptr, count);
     }

-    Barrier(const Barrier&) = delete;
-    Barrier& operator=(const Barrier&) = delete;
-    Barrier(Barrier&&) noexcept = delete;
+    Barrier(const Barrier&)                = delete;
+    Barrier& operator=(const Barrier&)     = delete;
+    Barrier(Barrier&&) noexcept            = delete;
     Barrier& operator=(Barrier&&) noexcept = delete;

     void wait()
diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index 83db7ad65d..42002f9ada 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -899,8 +899,9 @@ void LlamaBatch::outputContextLogits(T* context_decoder_
     if (context_logits_buf_ == nullptr) {
         NcclGuard guard(llama_->tensor_para_, stream_, true);
-        context_logits_buf_ = (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
-        const auto tp = llama_->tensor_para_.world_size_;
+        context_logits_buf_ =
+            (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
+        const auto tp = llama_->tensor_para_.world_size_;
         if (tp > 1) {
             FT_CHECK(llama_->vocab_size_padded_ % tp == 0);
             const auto local_vocab_size = llama_->vocab_size_padded_ / tp;
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index e8f77e1c74..881582acea 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -215,6 +215,7 @@ inline void LlamaContextAttentionLayer::forward(TensorMap*
                            layer_offset,
                            attention_mask,
                            cu_seqlens,
+                           input_tensors->at("context_lengths").getPtr(),
                            batch_size,
                            max_q_len,
                            max_k_len,
@@ -258,6 +259,7 @@ void LlamaContextAttentionLayer::fusedMultiHeadAttention(T** key_cache_ptr
                                                          size_t cache_layer_offset,
                                                          T*     attention_mask,
                                                          int*   cu_seqlens,
+                                                         int*   context_lengths,
                                                          int    batch_size,
                                                          int    max_q_len,
                                                          int    max_k_len,
@@ -274,13 +276,13 @@
                     int(size_per_head_),
                     int(max_seq_len * size_per_head_),
                     false,
-                    int(cache_layer_offset),
+                    cache_layer_offset,
                     key_cache_ptrs};
     Layout layout_v{int(local_head_num_ * max_seq_len * size_per_head_),
                     int(size_per_head_),
                     int(max_seq_len * size_per_head_),
                     false,
-                    int(cache_layer_offset),
+                    cache_layer_offset,
                     val_cache_ptrs};
     Layout layout_o{
         int(local_head_num_ * max_q_len * size_per_head_),
@@ -298,6 +300,8 @@
                                          qk_buf_float_,
                                          cu_seqlens,
                                          nullptr,
+                                         nullptr,
+                                         context_lengths,
                                          group_size,
                                          layout_q,
                                         layout_k,
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.h b/src/turbomind/models/llama/LlamaContextAttentionLayer.h
index 235b575b8e..f79eaa4ef2 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.h
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.h
@@ -72,6 +72,7 @@ class LlamaContextAttentionLayer {
                                 size_t cache_layer_offset,
                                 T*     attention_mask,
                                 int*   cu_seqlens,
+                                int*   context_lengths,
                                 int    batch_size,
                                 int    max_q_len,
                                 int    max_k_len,
diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
index 2141f72e7f..8d97481c35 100644
--- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
+++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
@@ -38,7 +38,7 @@ struct LlamaDecoderLayerWeight {
                            size_t tensor_para_size,
                            size_t tensor_para_rank);
     ~LlamaDecoderLayerWeight();
-    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
+    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other)            = delete;
     LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

     void loadModel(std::string dir_path, FtCudaDataType model_file_type);
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 3caaf59068..103b32e88f 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -130,7 +130,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
     params.hidden_size_per_head    = size_per_head;
     params.rotary_embedding_dim    = rotary_embedding_dim;
-    params.rotary_embedding_base = rotary_embedding_base;
+    params.rotary_embedding_base   = rotary_embedding_base;
     params.max_position_embeddings = max_position_embeddings;
     params.use_dynamic_ntk         = use_dynamic_ntk;
     params.use_logn_attn           = use_logn_attn;
diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h
index 410667f1bd..a0596ab3b1 100644
--- a/src/turbomind/models/llama/LlamaDenseWeight.h
+++ b/src/turbomind/models/llama/LlamaDenseWeight.h
@@ -25,8 +25,7 @@ namespace turbomind {

-enum class WeightType : int
-{
+enum class WeightType : int {
     kFP32,
     kFP16,
     kFP8,  // not supported yet
diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h
index 0e783df33d..35a99d1a50 100644
--- a/src/turbomind/models/llama/LlamaLinear.h
+++ b/src/turbomind/models/llama/LlamaLinear.h
@@ -15,8 +15,7 @@ namespace turbomind {
 template
 class LlamaLinear {
 public:
-    enum Type
-    {
+    enum Type {
         kGemm,
         kFusedSiluFfn
     };
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index beaf3c3f6d..9c48e4f818 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -93,7 +93,8 @@ LlamaV2::LlamaV2(size_t head_num,
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);

-    vocab_size_padded_ = (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;
+    vocab_size_padded_ =
+        (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;

     size_t elem_bits = 0;
     if (quant_policy & QuantPolicy::kCacheKVInt8) {
@@ -171,7 +172,7 @@ void LlamaV2::initialize(const LlamaAttentionParams& attn_params,
     dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_,
                                                    vocab_size_padded_,
-                                                   0,   // end_id, deprecated
+                                                   0,  // end_id, deprecated
                                                    stream_,
                                                    cublas_wrapper_,
                                                    allocator_,
diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc
index 511cbe5bbf..80e561442a 100644
--- a/src/turbomind/models/llama/LlamaWeight.cc
+++ b/src/turbomind/models/llama/LlamaWeight.cc
@@ -95,8 +95,10 @@ void LlamaWeight::loadModel(std::string dir_path)
     loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);

-    loadWeightFromBin(
-        (T*)post_decoder_embedding_kernel, {hidden_units_ * vocab_size_padded_}, dir_path + "output.weight", model_file_type);
+    loadWeightFromBin((T*)post_decoder_embedding_kernel,
+                      {hidden_units_ * vocab_size_padded_},
+                      dir_path + "output.weight",
+                      model_file_type);

     for (unsigned layer = 0; layer < num_layer_; ++layer) {
         decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h
index be7fda2b98..411c41f207 100644
--- a/src/turbomind/models/llama/LlamaWeight.h
+++ b/src/turbomind/models/llama/LlamaWeight.h
@@ -42,7 +42,7 @@ struct LlamaWeight {

     ~LlamaWeight();

-    LlamaWeight(const LlamaWeight& other) = delete;
+    LlamaWeight(const LlamaWeight& other)            = delete;
     LlamaWeight& operator=(const LlamaWeight& other) = delete;

     void loadModel(std::string dir_path);
diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h
index cb2d1858a3..6af0bdabae 100644
--- a/src/turbomind/models/llama/Request.h
+++ b/src/turbomind/models/llama/Request.h
@@ -25,8 +25,7 @@ struct Request {
     using Callback = std::function*)>;
     Callback stream_cb;

-    enum
-    {
+    enum {
         kInvalid = 1,
         kConflict = 2,
         kBusy = 3,
diff --git a/src/turbomind/models/llama/flash_attention2/block_info.h b/src/turbomind/models/llama/flash_attention2/block_info.h
index 310d1f22bf..38b6aa2583 100644
--- a/src/turbomind/models/llama/flash_attention2/block_info.h
+++ b/src/turbomind/models/llama/flash_attention2/block_info.h
@@ -15,10 +15,14 @@ struct BlockInfo {
     __device__ BlockInfo(const Params& params, const int bidb):
         sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]),
         sum_s_k(!Varlen || params.cu_seqlens_k == nullptr ? -1 : params.cu_seqlens_k[bidb]),
-        actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q :
-                                                                    params.cu_seqlens_q[bidb + 1] - sum_s_q),
-        actual_seqlen_k(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k :
-                                                                    params.cu_seqlens_k[bidb + 1] - sum_s_k)
+        actual_seqlen_q(params.actual_seqlen_q == nullptr ?
+                            (!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q :
+                                                                         params.cu_seqlens_q[bidb + 1] - sum_s_q) :
+                            params.actual_seqlen_q[bidb]),
+        actual_seqlen_k(params.actual_seqlen_k == nullptr ?
+                            (!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k :
+                                                                         params.cu_seqlens_k[bidb + 1] - sum_s_k) :
+                            params.actual_seqlen_k[bidb])
     {
     }
diff --git a/src/turbomind/models/llama/flash_attention2/flash.h b/src/turbomind/models/llama/flash_attention2/flash.h
index 576cbc8d9c..8a5a7c5794 100644
--- a/src/turbomind/models/llama/flash_attention2/flash.h
+++ b/src/turbomind/models/llama/flash_attention2/flash.h
@@ -16,7 +16,7 @@ constexpr int D_DIM = 2;
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 struct Qkv_params {
-    using index_t = uint32_t;
+    using index_t = size_t;
     // The QKV matrices.
     void* __restrict__ q_ptr;
     void* __restrict__ k_ptr;
@@ -25,8 +25,8 @@ struct Qkv_params {
     // batched ptr inputs.
     void** __restrict__ k_batched_ptr = nullptr;
     void** __restrict__ v_batched_ptr = nullptr;
-    int k_batched_offset = 0;
-    int v_batched_offset = 0;
+    size_t k_batched_offset = 0;
+    size_t v_batched_offset = 0;

     // The stride between rows of the Q, K and V matrices.
     index_t q_batch_stride;
@@ -72,6 +72,10 @@ struct Flash_fwd_params: public Qkv_params {
     int* __restrict__ cu_seqlens_q;
     int* __restrict__ cu_seqlens_k;

+    // array of length b with actual length of each sequence
+    int* __restrict__ actual_seqlen_q;
+    int* __restrict__ actual_seqlen_k;
+
     void* __restrict__ blockmask;

     bool is_bf16;
diff --git a/src/turbomind/models/llama/flash_attention2/flash_api.cpp b/src/turbomind/models/llama/flash_attention2/flash_api.cpp
index e2f12c7233..55bc92c1ff 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_api.cpp
+++ b/src/turbomind/models/llama/flash_attention2/flash_api.cpp
@@ -121,6 +121,9 @@ class FlashAttentionOpImpl::impl {
         fwd_params.cu_seqlens_q = params.cu_seqlens_q;
         fwd_params.cu_seqlens_k = params.cu_seqlens_k;

+        fwd_params.actual_seqlen_q = params.actual_seqlen_q;
+        fwd_params.actual_seqlen_k = params.actual_seqlen_k;
+
         fwd_params.blockmask = reinterpret_cast(params.mask);

         fwd_params.is_bf16 = false;
diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
index 29035421c1..4fae69bd08 100644
--- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
+++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
@@ -70,10 +70,10 @@ struct LlamaAttentionKernel:
         scalar_t** v_batch_seqs_ptr = nullptr;
         output_t** o_batch_seqs_ptr = nullptr;

-        int q_batch_seqs_offset = 0;
-        int k_batch_seqs_offset = 0;
-        int v_batch_seqs_offset = 0;
-        int o_batch_seqs_offset = 0;
+        size_t q_batch_seqs_offset = 0;
+        size_t k_batch_seqs_offset = 0;
+        size_t v_batch_seqs_offset = 0;
+        size_t o_batch_seqs_offset = 0;

         int32_t group_size = 1;

@@ -81,7 +81,7 @@
         template
         CUTLASS_DEVICE void
-        update_batched_ptr(ptr_t& data_ptr, ptr_t* batch_seq_ptr, int batch_seq_offset, int batch_id, int strideB)
+        update_batched_ptr(ptr_t& data_ptr, ptr_t* batch_seq_ptr, size_t batch_seq_offset, int batch_id, int strideB)
         {
             if (batch_seq_ptr != nullptr)
                 data_ptr = batch_seq_ptr[batch_id] + batch_seq_offset;
diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h
index 6bd4644f0d..dbd6d16704 100644
--- a/src/turbomind/models/llama/llama_kernels.h
+++ b/src/turbomind/models/llama/llama_kernels.h
@@ -83,9 +83,9 @@ struct BaseAttentionLayout {
     int stride_batch;
     int stride_seq;
     int stride_head;
-    bool use_seqlens = false;
-    int batch_seqs_offset = 0;
-    T** batch_seqs = nullptr;
+    bool   use_seqlens       = false;
+    size_t batch_seqs_offset = 0;
+    T**    batch_seqs        = nullptr;
 };

 template
@@ -95,10 +95,12 @@ struct BaseAttentionParams {
     T* key;
     T* val;
     T* mask;
-    float* out_accum = nullptr;
-    int* cu_seqlens_q = nullptr;
-    int* cu_seqlens_k = nullptr;
-    size_t group_size = 1;
+    float* out_accum       = nullptr;
+    int*   cu_seqlens_q    = nullptr;
+    int*   cu_seqlens_k    = nullptr;
+    int*   actual_seqlen_q = nullptr;
+    int*   actual_seqlen_k = nullptr;
+    size_t group_size      = 1;
     BaseAttentionLayout layout_q;
     BaseAttentionLayout layout_k;
     BaseAttentionLayout layout_v;
diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h
index 05c10be80b..011dcca78c 100644
--- a/src/turbomind/models/llama/llama_utils.h
+++ b/src/turbomind/models/llama/llama_utils.h
@@ -9,8 +9,7 @@ namespace turbomind {

-enum QuantPolicy
-{
+enum QuantPolicy {
     kNone = 0x00,
     // reserve 0x01 and 0x02 for backward compatibility
     kReserve1 = 0x01,
@@ -19,8 +18,7 @@ enum QuantPolicy
     kCacheKVInt8 = 0x04,
 };

-enum CmpMode
-{
+enum CmpMode {
     kCmpNone,
     kCmpRead,
     kCmpWrite,
@@ -52,7 +50,7 @@ inline std::string to_string(std::string x)
 template
 std::string Concat(std::string key, Args&&... args)
 {
-    std::vector args_str{detail::to_string((Args &&) args)...};
+    std::vector args_str{detail::to_string((Args&&)args)...};
     for (const auto& s : args_str) {
         key.append("_");
         key.append(s);
diff --git a/tests/csrc/unittests/test_context_attention_layer.cu b/tests/csrc/unittests/test_context_attention_layer.cu
index 948cd88a68..87693de34d 100644
--- a/tests/csrc/unittests/test_context_attention_layer.cu
+++ b/tests/csrc/unittests/test_context_attention_layer.cu
@@ -278,6 +278,8 @@ int main(int argc, const char* argv[])
     // auto* input_lengths = (int*)allocator.malloc(sizeof(int) * batch_size, false);
     thrust::device_vector input_lengths(batch_size);
     thrust::host_vector input_lengths_host(batch_size);
+    thrust::device_vector kv_lengths(batch_size);
+    thrust::host_vector kv_lengths_host(batch_size);

     cudaRandomUniform(query_ptr, batch_size * num_heads * seq_len * size_per_head);
     cudaRandomUniform(key_ptr, batch_size * num_heads * key_len * size_per_head);
@@ -285,13 +287,12 @@ int main(int argc, const char* argv[])
     cudaRandomUniform(mask_ptr, batch_size * seq_len * key_len);

     // create random length for batch
-    std::uniform_int_distribution dist{seq_len / 2, seq_len};
-    auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
-    std::generate(begin(input_lengths_host), end(input_lengths_host), gen);
-    // for(int batch_id=0;batch_id
+    {
+        std::uniform_int_distribution dist{seq_len / 2, seq_len};
+        auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
+        std::generate(begin(input_lengths_host), end(input_lengths_host), gen);
+        thrust::copy(input_lengths_host.begin(), input_lengths_host.end(), input_lengths.begin());
+    }
     size_t h_token_num = 0;
     size_t* h_pinned_token_num;
     auto input_lengths_ptr = thrust::raw_pointer_cast(input_lengths.data());
@@ -306,10 +307,16 @@ int main(int argc, const char* argv[])
                           stream);
     cudaFreeHost((void*)h_pinned_token_num);

-    int* k_lens = (int*)allocator.malloc(batch_size * sizeof(int));
-    deviceFill(k_lens, batch_size, key_len, stream);
+    {
+        std::uniform_int_distribution dist{seq_len, key_len};
+        auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
+        std::generate(begin(kv_lengths_host), end(kv_lengths_host), gen);
+        thrust::copy(kv_lengths_host.begin(), kv_lengths_host.end(), kv_lengths.begin());
+    }
+    auto kv_lengths_ptr = thrust::raw_pointer_cast(kv_lengths.data());
+    // deviceFill(kv_lengths_ptr, batch_size, key_len, stream);

-    invokeCreateCausalMasks(mask_ptr, input_lengths_ptr, k_lens, seq_len, key_len, batch_size, stream);
+    invokeCreateCausalMasks(mask_ptr, input_lengths_ptr, kv_lengths_ptr, seq_len, key_len, batch_size, stream);
     // deviceFill(mask_ptr, batch_size*key_len*seq_len, scalar_t(1), stream);

     // compute gt
@@ -356,6 +363,8 @@ int main(int argc, const char* argv[])
                                          accum_buf_ptr,
                                          cu_seqlens_ptr,
                                          nullptr,
+                                         nullptr,
+                                         kv_lengths_ptr,
                                          1,
                                          layout_q,
                                          layout_k,
@@ -367,10 +376,10 @@
     int num_rows = 8;
     // printf("query:\n");
     // printMatrix(query_ptr, num_rows, 8, size_per_head, true);
-    printf("expect:\n");
-    printMatrix(expect_out_ptr, num_rows, 8, size_per_head, true);
-    printf("actual:\n");
-    printMatrix(actual_out_ptr, num_rows, 8, size_per_head, true);
+    // printf("expect:\n");
+    // printMatrix(expect_out_ptr, num_rows, 8, size_per_head, true);
+    // printf("actual:\n");
+    // printMatrix(actual_out_ptr, num_rows, 8, size_per_head, true);

     checkResult(
         "all close:", actual_out_ptr, expect_out_ptr, batch_size * num_heads * seq_len * size_per_head, true, true);
diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index 83487f1f03..dcf04d5c28 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -7,7 +7,7 @@ def test_base_model():
     model = MODELS.get('llama')()
     assert model is not None
    assert model.capability == 'chat'
-    assert model.get_prompt('test') is None
+    assert model.get_prompt('test') == 'test'
     assert model.stop_words is None

     model = MODELS.get('internlm')(capability='completion')
@@ -72,7 +72,7 @@ def test_baichuan():

     model = MODELS.get('baichuan-7b')(capability='chat')
     _prompt = model.get_prompt(prompt, sequence_start=True)
-    assert _prompt is None
+    assert _prompt == prompt


 def test_llama2():
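
A note on the recurring `int` → `size_t` widenings above (the KV-cache offset in the decoder attention kernel, `index_t` and the batched offsets in the flash-attention parameters, and the batch-sequence offsets in the CUTLASS kernel): a signed 32-bit offset overflows once a per-sample KV cache reaches 2^31 elements. The sketch below is illustrative only; the shape numbers are assumptions (a 7B-like model serving an 8k-token session), not values taken from this patch.

```cpp
// Illustration: why 32-bit element offsets are too small for long-context KV caches.
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed shapes: 32 layers, K and V, 32 heads, 8192-token session, head dim 128.
    const int64_t layers = 32, kv = 2, heads = 32, max_len = 8192, head_dim = 128;
    const int64_t elems_per_sample = layers * kv * heads * max_len * head_dim;  // 2'147'483'648 == 2^31

    const int32_t narrow = static_cast<int32_t>(elems_per_sample);  // wraps past INT32_MAX
    const size_t  wide   = static_cast<size_t>(elems_per_sample);   // exact

    std::printf("elements per sample: %lld\n", static_cast<long long>(elems_per_sample));
    std::printf("as int32_t         : %d\n", narrow);
    std::printf("as size_t          : %zu\n", wide);
    return 0;
}
```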
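The `block_info.h`, `flash.h`, and `llama_kernels.h` hunks thread an optional per-batch `actual_seqlen_q` / `actual_seqlen_k` array through the flash-attention path, so a kernel can use true sequence lengths instead of the padded maximum or `cu_seqlens` differences. A simplified restatement of that selection order follows; the `effective_seqlen` function and the demo values are illustrative (the real constructor also folds in the `Varlen` template flag), while the field names come from the diff.

```cpp
// Simplified restatement of the length selection added to BlockInfo:
// explicit per-batch lengths win over cu_seqlens differences, which win over the padded default.
#include <cstdio>

int effective_seqlen(const int* actual_seqlen,  // optional: true length per batch entry
                     const int* cu_seqlens,     // optional: cumulative lengths, size batch + 1
                     int        padded_len,     // fallback (params.seqlen_q / params.seqlen_k)
                     int        bidb)           // batch index
{
    if (actual_seqlen != nullptr) {
        return actual_seqlen[bidb];
    }
    if (cu_seqlens != nullptr) {
        return cu_seqlens[bidb + 1] - cu_seqlens[bidb];
    }
    return padded_len;
}

int main() {
    const int cu_seqlens[]    = {0, 5, 12};  // two packed sequences of length 5 and 7
    const int actual_seqlen[] = {3, 7};      // overrides the lengths above

    std::printf("%d\n", effective_seqlen(nullptr, nullptr, 16, 0));           // 16 (padded)
    std::printf("%d\n", effective_seqlen(nullptr, cu_seqlens, 16, 0));        // 5
    std::printf("%d\n", effective_seqlen(actual_seqlen, cu_seqlens, 16, 0));  // 3
    return 0;
}
```

This is also what the new `context_lengths` argument in `LlamaContextAttentionLayer` and `kv_lengths` in the unit test ultimately feed: a per-batch length array that takes precedence when it is non-null.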