diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index fd724a2a19..7ed6d1039c 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -55,7 +55,7 @@ def get_prompt(self, prompt, sequence_start=True):

     @abstractmethod
     def decorate_prompt(self, prompt, sequence_start):
-        pass
+        return prompt

     @staticmethod
     def _translate_messages(messages: List):
@@ -176,8 +176,8 @@ class InternLMChat7B(BaseModel):
     def __init__(self,
                  system='',
                  user='<|User|>',
-                 eoh='',
-                 eoa='',
+                 eoh='',
+                 eoa='',
                  assistant='<|Bot|>',
                  **kwargs):
         super().__init__(**kwargs)
@@ -231,19 +231,22 @@ def messages2prompt(self, messages, sequence_start=True):
     @property
     def stop_words(self):
         """Return the stop-words' token ids."""
-        return [103027, 103028]
+        return [103028]


 @MODELS.register_module(name='internlm-chat-7b-8k')
 class InternLMChat7B8K(InternLMChat7B):
+    """Chat template and generation parameters of InternLM-Chat-7B-8K."""

-    def __init__(self, session_len=8192, **kwargs):
+    def __init__(self, session_len=8192, repetition_penalty=1.02, **kwargs):
         super(InternLMChat7B8K, self).__init__(**kwargs)
         self.session_len = session_len
+        self.repetition_penalty = repetition_penalty


 @MODELS.register_module(name='baichuan-7b')
 class Baichuan7B(BaseModel):
+    """Generation parameters of Baichuan-7B base model."""

     def __init__(self, repetition_penalty=1.1, **kwargs):
         super().__init__(**kwargs)
@@ -252,6 +255,8 @@ def __init__(self, repetition_penalty=1.1, **kwargs):

 @MODELS.register_module(name='baichuan2-7b')
 class Baichuan2_7B(BaseModel):
+    """Chat template and generation parameters of Baichuan2-7B-Base and
+    Baichuan2-7B-Chat models."""

     def __init__(self,
                  temperature=0.3,
diff --git a/lmdeploy/turbomind/tokenizer.py b/lmdeploy/turbomind/tokenizer.py
index 98db9c2b61..966f24148e 100644
--- a/lmdeploy/turbomind/tokenizer.py
+++ b/lmdeploy/turbomind/tokenizer.py
@@ -112,7 +112,7 @@ class HuggingFaceTokenizer:

     def __init__(self, model_dir: str):
         from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
-                                  LlamaTokenizerFast)
+                                  LlamaTokenizer, LlamaTokenizerFast)
         model_file = osp.join(model_dir, 'tokenizer.model')
         backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
         model_file_exists = osp.exists(model_file)
@@ -121,8 +121,9 @@ def __init__(self, model_dir: str):
                 'It may take long time to initialize the tokenizer.')
         self.model = AutoTokenizer.from_pretrained(model_dir,
                                                    trust_remote_code=True)
-        self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
-            or isinstance(self.model, CodeLlamaTokenizerFast)
+        self.need_padding = type(self.model) in [
+            LlamaTokenizer, LlamaTokenizerFast, CodeLlamaTokenizerFast
+        ]
         self._no_prefix_space_tokens = None
         # save tokenizer.json to reuse
         if not osp.exists(backend_tokenizer_file) and model_file_exists:
diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
index c2b6039d67..01592b4ac4 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
@@ -1422,7 +1422,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params
     // Trigger the stores to global memory.
     if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) {
-        int offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + tlength_circ * Dh
+        size_t offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + tlength_circ * Dh
                      + co * QK_ELTS_IN_16B + ci;

         if (!QUANT_POLICY) {
diff --git a/src/turbomind/models/llama/Barrier.h b/src/turbomind/models/llama/Barrier.h
index 6eb0df9585..ab69a746ca 100644
--- a/src/turbomind/models/llama/Barrier.h
+++ b/src/turbomind/models/llama/Barrier.h
@@ -19,9 +19,9 @@ class Barrier {
         FT_CHECK(count == 1);
     }

-    Barrier(const Barrier&) = delete;
-    Barrier& operator=(const Barrier&) = delete;
-    Barrier(Barrier&&) noexcept = delete;
+    Barrier(const Barrier&)                = delete;
+    Barrier& operator=(const Barrier&)     = delete;
+    Barrier(Barrier&&) noexcept            = delete;
     Barrier& operator=(Barrier&&) noexcept = delete;

     void wait() {}
@@ -39,9 +39,9 @@ class Barrier {
         pthread_barrier_init(&barrier_, nullptr, count);
     }

-    Barrier(const Barrier&) = delete;
-    Barrier& operator=(const Barrier&) = delete;
-    Barrier(Barrier&&) noexcept = delete;
+    Barrier(const Barrier&)                = delete;
+    Barrier& operator=(const Barrier&)     = delete;
+    Barrier(Barrier&&) noexcept            = delete;
     Barrier& operator=(Barrier&&) noexcept = delete;

     void wait()
diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index 83db7ad65d..42002f9ada 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -899,8 +899,9 @@ void LlamaBatch::outputContextLogits(T* context_decoder_
     if (context_logits_buf_ == nullptr) {
         NcclGuard guard(llama_->tensor_para_, stream_, true);
-        context_logits_buf_ = (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
-        const auto tp = llama_->tensor_para_.world_size_;
+        context_logits_buf_ =
+            (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
+        const auto tp = llama_->tensor_para_.world_size_;
         if (tp > 1) {
             FT_CHECK(llama_->vocab_size_padded_ % tp == 0);
             const auto local_vocab_size = llama_->vocab_size_padded_ / tp;
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index e8f77e1c74..881582acea 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -215,6 +215,7 @@ inline void LlamaContextAttentionLayer::forward(TensorMap*
                            layer_offset,
                            attention_mask,
                            cu_seqlens,
+                           input_tensors->at("context_lengths").getPtr(),
                            batch_size,
                            max_q_len,
                            max_k_len,
@@ -258,6 +259,7 @@ void LlamaContextAttentionLayer::fusedMultiHeadAttention(T** key_cache_ptr
                                                          size_t cache_layer_offset,
                                                          T*     attention_mask,
                                                          int*   cu_seqlens,
+                                                         int*   context_lengths,
                                                          int    batch_size,
                                                          int    max_q_len,
                                                          int    max_k_len,
@@ -274,13 +276,13 @@
                     int(size_per_head_),
                     int(max_seq_len * size_per_head_),
                     false,
-                    int(cache_layer_offset),
+                    cache_layer_offset,
                     key_cache_ptrs};
     Layout layout_v{int(local_head_num_ * max_seq_len * size_per_head_),
                     int(size_per_head_),
                     int(max_seq_len * size_per_head_),
                     false,
-                    int(cache_layer_offset),
+                    cache_layer_offset,
                     val_cache_ptrs};
     Layout layout_o{
         int(local_head_num_ * max_q_len * size_per_head_),
@@ -298,6 +300,8 @@
                                          qk_buf_float_,
                                          cu_seqlens,
                                          nullptr,
+                                         nullptr,
+                                         context_lengths,
                                          group_size,
                                          layout_q,
                                         layout_k,
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.h b/src/turbomind/models/llama/LlamaContextAttentionLayer.h
index 235b575b8e..f79eaa4ef2 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.h
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.h
@@ -72,6 +72,7 @@ class LlamaContextAttentionLayer {
                                 size_t cache_layer_offset,
                                 T*     attention_mask,
                                 int*   cu_seqlens,
+                                int*   context_lengths,
                                 int    batch_size,
                                 int    max_q_len,
                                 int    max_k_len,
diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
index 2141f72e7f..8d97481c35 100644
--- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
+++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h
@@ -38,7 +38,7 @@ struct LlamaDecoderLayerWeight {
                            size_t tensor_para_size,
                            size_t tensor_para_rank);
     ~LlamaDecoderLayerWeight();
-    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
+    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other)            = delete;
     LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

     void loadModel(std::string dir_path, FtCudaDataType model_file_type);
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 3caaf59068..103b32e88f 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -130,7 +130,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
     params.hidden_size_per_head    = size_per_head;
     params.rotary_embedding_dim    = rotary_embedding_dim;
-    params.rotary_embedding_base = rotary_embedding_base;
+    params.rotary_embedding_base   = rotary_embedding_base;
     params.max_position_embeddings = max_position_embeddings;
     params.use_dynamic_ntk         = use_dynamic_ntk;
     params.use_logn_attn           = use_logn_attn;
diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h
index 410667f1bd..a0596ab3b1 100644
--- a/src/turbomind/models/llama/LlamaDenseWeight.h
+++ b/src/turbomind/models/llama/LlamaDenseWeight.h
@@ -25,8 +25,7 @@ namespace turbomind {

-enum class WeightType : int
-{
+enum class WeightType : int {
     kFP32,
     kFP16,
     kFP8,  // not supported yet
diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h
index 0e783df33d..35a99d1a50 100644
--- a/src/turbomind/models/llama/LlamaLinear.h
+++ b/src/turbomind/models/llama/LlamaLinear.h
@@ -15,8 +15,7 @@ namespace turbomind {
 template
 class LlamaLinear {
 public:
-    enum Type
-    {
+    enum Type {
         kGemm,
         kFusedSiluFfn
     };
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index beaf3c3f6d..9c48e4f818 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -93,7 +93,8 @@ LlamaV2::LlamaV2(size_t head_num,
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);

-    vocab_size_padded_ = (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;
+    vocab_size_padded_ =
+        (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;

     size_t elem_bits = 0;
     if (quant_policy & QuantPolicy::kCacheKVInt8) {
@@ -171,7 +172,7 @@ void LlamaV2::initialize(const LlamaAttentionParams& attn_params,
     dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_,
                                                    vocab_size_padded_,
-                                                   0,   // end_id, deprecated
+                                                   0,  // end_id, deprecated
                                                    stream_,
                                                    cublas_wrapper_,
                                                    allocator_,
diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc
index 511cbe5bbf..80e561442a 100644
--- a/src/turbomind/models/llama/LlamaWeight.cc
+++ b/src/turbomind/models/llama/LlamaWeight.cc
@@ -95,8 +95,10 @@ void LlamaWeight::loadModel(std::string dir_path)
     loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);

-    loadWeightFromBin(
-        (T*)post_decoder_embedding_kernel, {hidden_units_ * vocab_size_padded_}, dir_path + "output.weight", model_file_type);
+    loadWeightFromBin((T*)post_decoder_embedding_kernel,
+                      {hidden_units_ * vocab_size_padded_},
+                      dir_path + "output.weight",
+                      model_file_type);

     for (unsigned layer = 0; layer < num_layer_; ++layer) {
         decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h
index be7fda2b98..411c41f207 100644
--- a/src/turbomind/models/llama/LlamaWeight.h
+++ b/src/turbomind/models/llama/LlamaWeight.h
@@ -42,7 +42,7 @@ struct LlamaWeight {

     ~LlamaWeight();

-    LlamaWeight(const LlamaWeight& other) = delete;
+    LlamaWeight(const LlamaWeight& other)            = delete;
     LlamaWeight& operator=(const LlamaWeight& other) = delete;

     void loadModel(std::string dir_path);
diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h
index cb2d1858a3..6af0bdabae 100644
--- a/src/turbomind/models/llama/Request.h
+++ b/src/turbomind/models/llama/Request.h
@@ -25,8 +25,7 @@ struct Request {
     using Callback = std::function*)>;
     Callback stream_cb;

-    enum
-    {
+    enum {
         kInvalid = 1,
         kConflict = 2,
         kBusy = 3,
diff --git a/src/turbomind/models/llama/flash_attention2/block_info.h b/src/turbomind/models/llama/flash_attention2/block_info.h
index 310d1f22bf..38b6aa2583 100644
--- a/src/turbomind/models/llama/flash_attention2/block_info.h
+++ b/src/turbomind/models/llama/flash_attention2/block_info.h
@@ -15,10 +15,14 @@ struct BlockInfo {
     __device__ BlockInfo(const Params& params, const int bidb):
         sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]),
         sum_s_k(!Varlen || params.cu_seqlens_k == nullptr ? -1 : params.cu_seqlens_k[bidb]),
-        actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q :
-                                                                    params.cu_seqlens_q[bidb + 1] - sum_s_q),
-        actual_seqlen_k(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k :
-                                                                    params.cu_seqlens_k[bidb + 1] - sum_s_k)
+        actual_seqlen_q(params.actual_seqlen_q == nullptr ?
+                            (!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q :
+                                                                         params.cu_seqlens_q[bidb + 1] - sum_s_q) :
+                            params.actual_seqlen_q[bidb]),
+        actual_seqlen_k(params.actual_seqlen_k == nullptr ?
+                            (!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k :
+                                                                         params.cu_seqlens_k[bidb + 1] - sum_s_k) :
+                            params.actual_seqlen_k[bidb])
     {
     }
diff --git a/src/turbomind/models/llama/flash_attention2/flash.h b/src/turbomind/models/llama/flash_attention2/flash.h
index 576cbc8d9c..8a5a7c5794 100644
--- a/src/turbomind/models/llama/flash_attention2/flash.h
+++ b/src/turbomind/models/llama/flash_attention2/flash.h
@@ -16,7 +16,7 @@ constexpr int D_DIM = 2;
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 struct Qkv_params {
-    using index_t = uint32_t;
+    using index_t = size_t;
     // The QKV matrices.
     void* __restrict__ q_ptr;
     void* __restrict__ k_ptr;
@@ -25,8 +25,8 @@ struct Qkv_params {
     // batched ptr inputs.
     void** __restrict__ k_batched_ptr = nullptr;
     void** __restrict__ v_batched_ptr = nullptr;
-    int k_batched_offset = 0;
-    int v_batched_offset = 0;
+    size_t k_batched_offset = 0;
+    size_t v_batched_offset = 0;

     // The stride between rows of the Q, K and V matrices.
     index_t q_batch_stride;
@@ -72,6 +72,10 @@ struct Flash_fwd_params: public Qkv_params {
     int* __restrict__ cu_seqlens_q;
     int* __restrict__ cu_seqlens_k;

+    // array of length b with actual length of each sequence
+    int* __restrict__ actual_seqlen_q;
+    int* __restrict__ actual_seqlen_k;
+
     void* __restrict__ blockmask;

     bool is_bf16;
diff --git a/src/turbomind/models/llama/flash_attention2/flash_api.cpp b/src/turbomind/models/llama/flash_attention2/flash_api.cpp
index e2f12c7233..55bc92c1ff 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_api.cpp
+++ b/src/turbomind/models/llama/flash_attention2/flash_api.cpp
@@ -121,6 +121,9 @@ class FlashAttentionOpImpl::impl {
         fwd_params.cu_seqlens_q = params.cu_seqlens_q;
         fwd_params.cu_seqlens_k = params.cu_seqlens_k;

+        fwd_params.actual_seqlen_q = params.actual_seqlen_q;
+        fwd_params.actual_seqlen_k = params.actual_seqlen_k;
+
         fwd_params.blockmask = reinterpret_cast(params.mask);

         fwd_params.is_bf16 = false;
diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
index 29035421c1..4fae69bd08 100644
--- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
+++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
@@ -70,10 +70,10 @@ struct LlamaAttentionKernel:
         scalar_t** v_batch_seqs_ptr = nullptr;
         output_t** o_batch_seqs_ptr = nullptr;

-        int q_batch_seqs_offset = 0;
-        int k_batch_seqs_offset = 0;
-        int v_batch_seqs_offset = 0;
-        int o_batch_seqs_offset = 0;
+        size_t q_batch_seqs_offset = 0;
+        size_t k_batch_seqs_offset = 0;
+        size_t v_batch_seqs_offset = 0;
+        size_t o_batch_seqs_offset = 0;

         int32_t group_size = 1;

@@ -81,7 +81,7 @@
         template
         CUTLASS_DEVICE void
-        update_batched_ptr(ptr_t& data_ptr, ptr_t* batch_seq_ptr, int batch_seq_offset, int batch_id, int strideB)
+        update_batched_ptr(ptr_t& data_ptr, ptr_t* batch_seq_ptr, size_t batch_seq_offset, int batch_id, int strideB)
         {
             if (batch_seq_ptr != nullptr)
                 data_ptr = batch_seq_ptr[batch_id] + batch_seq_offset;
diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h
index 6bd4644f0d..dbd6d16704 100644
--- a/src/turbomind/models/llama/llama_kernels.h
+++ b/src/turbomind/models/llama/llama_kernels.h
@@ -83,9 +83,9 @@ struct BaseAttentionLayout {
     int stride_batch;
     int stride_seq;
     int stride_head;
-    bool use_seqlens = false;
-    int batch_seqs_offset = 0;
-    T** batch_seqs = nullptr;
+    bool   use_seqlens       = false;
+    size_t batch_seqs_offset = 0;
+    T**    batch_seqs        = nullptr;
 };

 template
@@ -95,10 +95,12 @@ struct BaseAttentionParams {
     T* key;
     T* val;
     T* mask;
-    float* out_accum = nullptr;
-    int* cu_seqlens_q = nullptr;
-    int* cu_seqlens_k = nullptr;
-    size_t group_size = 1;
+    float* out_accum       = nullptr;
+    int*   cu_seqlens_q    = nullptr;
+    int*   cu_seqlens_k    = nullptr;
+    int*   actual_seqlen_q = nullptr;
+    int*   actual_seqlen_k = nullptr;
+    size_t group_size      = 1;
     BaseAttentionLayout layout_q;
     BaseAttentionLayout layout_k;
     BaseAttentionLayout layout_v;
diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h
index 05c10be80b..011dcca78c 100644
--- a/src/turbomind/models/llama/llama_utils.h
+++ b/src/turbomind/models/llama/llama_utils.h
@@ -9,8 +9,7 @@ namespace turbomind {

-enum QuantPolicy
-{
+enum QuantPolicy {
     kNone = 0x00,
     // reserve 0x01 and 0x02 for backward compatibility
     kReserve1 = 0x01,
@@ -19,8 +18,7 @@ enum QuantPolicy
     kCacheKVInt8 = 0x04,
 };

-enum CmpMode
-{
+enum CmpMode {
     kCmpNone,
     kCmpRead,
     kCmpWrite,
@@ -52,7 +50,7 @@ inline std::string to_string(std::string x)
 template
 std::string Concat(std::string key, Args&&... args)
 {
-    std::vector args_str{detail::to_string((Args &&) args)...};
+    std::vector args_str{detail::to_string((Args&&)args)...};
     for (const auto& s : args_str) {
         key.append("_");
         key.append(s);
diff --git a/tests/csrc/unittests/test_context_attention_layer.cu b/tests/csrc/unittests/test_context_attention_layer.cu
index 948cd88a68..87693de34d 100644
--- a/tests/csrc/unittests/test_context_attention_layer.cu
+++ b/tests/csrc/unittests/test_context_attention_layer.cu
@@ -278,6 +278,8 @@ int main(int argc, const char* argv[])
     // auto* input_lengths = (int*)allocator.malloc(sizeof(int) * batch_size, false);
     thrust::device_vector input_lengths(batch_size);
     thrust::host_vector input_lengths_host(batch_size);
+    thrust::device_vector kv_lengths(batch_size);
+    thrust::host_vector kv_lengths_host(batch_size);

     cudaRandomUniform(query_ptr, batch_size * num_heads * seq_len * size_per_head);
     cudaRandomUniform(key_ptr, batch_size * num_heads * key_len * size_per_head);
@@ -285,13 +287,12 @@ int main(int argc, const char* argv[])
     cudaRandomUniform(mask_ptr, batch_size * seq_len * key_len);

     // create random length for batch
-    std::uniform_int_distribution dist{seq_len / 2, seq_len};
-    auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
-    std::generate(begin(input_lengths_host), end(input_lengths_host), gen);
-    // for(int batch_id=0;batch_id
+    {
+        std::uniform_int_distribution dist{seq_len / 2, seq_len};
+        auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
+        std::generate(begin(input_lengths_host), end(input_lengths_host), gen);
+        thrust::copy(input_lengths_host.begin(), input_lengths_host.end(), input_lengths.begin());
+    }
     size_t h_token_num = 0;
     size_t* h_pinned_token_num;
     auto input_lengths_ptr = thrust::raw_pointer_cast(input_lengths.data());
@@ -306,10 +307,16 @@ int main(int argc, const char* argv[])
                           stream);
     cudaFreeHost((void*)h_pinned_token_num);

-    int* k_lens = (int*)allocator.malloc(batch_size * sizeof(int));
-    deviceFill(k_lens, batch_size, key_len, stream);
+    {
+        std::uniform_int_distribution dist{seq_len, key_len};
+        auto gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
+        std::generate(begin(kv_lengths_host), end(kv_lengths_host), gen);
+        thrust::copy(kv_lengths_host.begin(), kv_lengths_host.end(), kv_lengths.begin());
+    }
+    auto kv_lengths_ptr = thrust::raw_pointer_cast(kv_lengths.data());
+    // deviceFill(kv_lengths_ptr, batch_size, key_len, stream);

-    invokeCreateCausalMasks(mask_ptr, input_lengths_ptr, k_lens, seq_len, key_len, batch_size, stream);
+    invokeCreateCausalMasks(mask_ptr, input_lengths_ptr, kv_lengths_ptr, seq_len, key_len, batch_size, stream);
     // deviceFill(mask_ptr, batch_size*key_len*seq_len, scalar_t(1), stream);

     // compute gt
@@ -356,6 +363,8 @@ int main(int argc, const char* argv[])
                                          accum_buf_ptr,
                                          cu_seqlens_ptr,
                                          nullptr,
+                                         nullptr,
+                                         kv_lengths_ptr,
                                          1,
                                          layout_q,
                                          layout_k,
@@ -367,10 +376,10 @@
     int num_rows = 8;
     // printf("query:\n");
     // printMatrix(query_ptr, num_rows, 8, size_per_head, true);
-    printf("expect:\n");
-    printMatrix(expect_out_ptr, num_rows, 8, size_per_head, true);
-    printf("actual:\n");
-    printMatrix(actual_out_ptr, num_rows, 8, size_per_head, true);
+    // printf("expect:\n");
+    // printMatrix(expect_out_ptr, num_rows, 8, size_per_head, true);
+    // printf("actual:\n");
+    // printMatrix(actual_out_ptr, num_rows, 8, size_per_head, true);

     checkResult(
         "all close:", actual_out_ptr, expect_out_ptr, batch_size * num_heads * seq_len * size_per_head, true, true);
diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index 83487f1f03..dcf04d5c28 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -7,7 +7,7 @@ def test_base_model():
     model = MODELS.get('llama')()
     assert model is not None
    assert model.capability == 'chat'
-    assert model.get_prompt('test') is None
+    assert model.get_prompt('test') == 'test'
     assert model.stop_words is None

     model = MODELS.get('internlm')(capability='completion')
@@ -72,7 +72,7 @@ def test_baichuan():

     model = MODELS.get('baichuan-7b')(capability='chat')
     _prompt = model.get_prompt(prompt, sequence_start=True)
-    assert _prompt is None
+    assert _prompt == prompt


 def test_llama2():
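
A note on the recurring `int` → `size_t` widenings above (the KV-cache offset in the decoder attention kernel, `index_t` and the batched offsets in the flash-attention parameters, and the batch-sequence offsets in the CUTLASS kernel): a signed 32-bit offset overflows once a per-sample KV cache reaches 2^31 elements. The sketch below is illustrative only; the shape numbers are assumptions (a 7B-like model serving an 8k-token session), not values taken from this patch.

```cpp
// Illustration: why 32-bit element offsets are too small for long-context KV caches.
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed shapes: 32 layers, K and V, 32 heads, 8192-token session, head dim 128.
    const int64_t layers = 32, kv = 2, heads = 32, max_len = 8192, head_dim = 128;
    const int64_t elems_per_sample = layers * kv * heads * max_len * head_dim;  // 2'147'483'648 == 2^31

    const int32_t narrow = static_cast<int32_t>(elems_per_sample);  // wraps past INT32_MAX
    const size_t  wide   = static_cast<size_t>(elems_per_sample);   // exact

    std::printf("elements per sample: %lld\n", static_cast<long long>(elems_per_sample));
    std::printf("as int32_t         : %d\n", narrow);
    std::printf("as size_t          : %zu\n", wide);
    return 0;
}
```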
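The `block_info.h`, `flash.h`, and `llama_kernels.h` hunks thread an optional per-batch `actual_seqlen_q` / `actual_seqlen_k` array through the flash-attention path, so a kernel can use true sequence lengths instead of the padded maximum or `cu_seqlens` differences. A simplified restatement of that selection order follows; the `effective_seqlen` function and the demo values are illustrative (the real constructor also folds in the `Varlen` template flag), while the field names come from the diff.

```cpp
// Simplified restatement of the length selection added to BlockInfo:
// explicit per-batch lengths win over cu_seqlens differences, which win over the padded default.
#include <cstdio>

int effective_seqlen(const int* actual_seqlen,  // optional: true length per batch entry
                     const int* cu_seqlens,     // optional: cumulative lengths, size batch + 1
                     int        padded_len,     // fallback (params.seqlen_q / params.seqlen_k)
                     int        bidb)           // batch index
{
    if (actual_seqlen != nullptr) {
        return actual_seqlen[bidb];
    }
    if (cu_seqlens != nullptr) {
        return cu_seqlens[bidb + 1] - cu_seqlens[bidb];
    }
    return padded_len;
}

int main() {
    const int cu_seqlens[]    = {0, 5, 12};  // two packed sequences of length 5 and 7
    const int actual_seqlen[] = {3, 7};      // overrides the lengths above

    std::printf("%d\n", effective_seqlen(nullptr, nullptr, 16, 0));           // 16 (padded)
    std::printf("%d\n", effective_seqlen(nullptr, cu_seqlens, 16, 0));        // 5
    std::printf("%d\n", effective_seqlen(actual_seqlen, cu_seqlens, 16, 0));  // 3
    return 0;
}
```

This is also what the new `context_lengths` argument in `LlamaContextAttentionLayer` and `kv_lengths` in the unit test ultimately feed: a per-batch length array that takes precedence when it is non-null.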