diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index 75e8a9b90..dd8ceb4ff 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -57,7 +57,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
-| Llama3.2 | 1B, 3B | LLM | Yes | Yes\* | Yes\* | Yes | Yes |
+| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - |
 | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
 | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
@@ -72,7 +72,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes |
 | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes |
 | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
-| QWen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes | Yes |
+| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
 | Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
 | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No |
 | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |
@@ -88,8 +88,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - |
 | CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - |
 | LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - |
-| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes |
-| InternVL2 | 1B-40B | MLLM | Yes | Yes\* | Yes\* | - | - |
+| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes |
 | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - |
 | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - |
 | ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
@@ -103,7 +102,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 
 ```{note}
 * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
-* When the head_dim of a model is not 128, such as llama3.2-1B, qwen2-0.5B and internvl2-1B, turbomind doesn't support its kv cache 4/8 bit quantization and inference
 ```
 
 ## PyTorchEngine on Huawei Ascend Platform
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 3e918e58f..3ec3688e1 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -18,7 +18,7 @@
 | InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
 | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
 | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
-| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes |
+| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes |
 | Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes |
 | Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
 | Mistral | 7B | LLM | Yes | Yes | Yes | No |
@@ -34,7 +34,6 @@
 | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
 | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
 | InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
-| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - |
 | ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
 | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
 | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 87de31e06..01ecb31f9 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -134,8 +134,8 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         if self.backend == 'turbomind':
             raise NotImplementedError()
 
-    @classmethod
-    def collect_images(cls, messages):
+    @staticmethod
+    def collect_images(messages):
         """gather all images along with their respective parameters from the
         messages and compile them into a single list. Each image is converted
         to RGB color space.
@@ -157,8 +157,8 @@ def collect_images(cls, messages):
             ])
         return images
 
-    @classmethod
-    def to_pytorch_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
+    @staticmethod
+    def to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
                        sequence_start):
         """auxiliary function to pack the preprocessing results in a format
         compatible with what is required by pytorch engine.
@@ -196,8 +196,8 @@ def to_pytorch_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
 
         return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)
 
-    @classmethod
-    def to_turbomind_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
+    @staticmethod
+    def to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
                          sequence_start):
         """auxiliary function to pack the forwarding results in a format
         compatible with what is required by turbomind engine.
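Note, not part of the patch: the base-class helpers above move from @classmethod to @staticmethod, and the per-model files below change their calls from super().to_pytorch_aux(...) / super().to_turbomind_aux(...) to self.to_pytorch_aux(...) / self.to_turbomind_aux(...). A minimal standalone sketch of why the two call styles are equivalent here, using hypothetical class names rather than the real lmdeploy classes:

# Illustrative only: BaseVLModel/DemoVLModel are hypothetical stand-ins,
# not the classes defined in lmdeploy/vl/model/base.py.
from typing import Dict, List


class BaseVLModel:
    """Stand-in for the vision-model base class touched by this patch."""

    @staticmethod
    def to_pytorch_aux(messages: List[Dict], prompt: str) -> Dict:
        # A @staticmethod takes no cls/self argument; callers may still
        # reach it through an instance, e.g. self.to_pytorch_aux(...).
        return dict(prompt=prompt, n_messages=len(messages))


class DemoVLModel(BaseVLModel):

    def to_pytorch(self, messages: List[Dict]) -> Dict:
        prompt = 'hi'
        # Attribute lookup on self resolves the staticmethod through the MRO,
        # so a subclass override of the helper would be picked up, whereas
        # super().to_pytorch_aux(...) always bypasses such an override.
        return self.to_pytorch_aux(messages, prompt)


if __name__ == '__main__':
    print(DemoVLModel().to_pytorch([{'role': 'user', 'content': 'hi'}]))

In short, looking the helper up on self keeps normal attribute resolution while dropping the unused cls parameter; behavior is unchanged for the classes in this patch.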
diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py
index 8b1ebbc67..abeeff31c 100644
--- a/lmdeploy/vl/model/cogvlm.py
+++ b/lmdeploy/vl/model/cogvlm.py
@@ -42,8 +42,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         messages.append(dict(role='preprocess', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         for message in messages:
@@ -83,5 +83,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/vl/model/deepseek.py
index 33f54784d..8a7284de7 100644
--- a/lmdeploy/vl/model/deepseek.py
+++ b/lmdeploy/vl/model/deepseek.py
@@ -138,8 +138,8 @@ def forward(self,
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         # apply chat template to get the prompt
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -182,11 +182,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
diff --git a/lmdeploy/vl/model/glm_4v.py b/lmdeploy/vl/model/glm_4v.py
index aa3372a5f..7cdd96d5d 100644
--- a/lmdeploy/vl/model/glm_4v.py
+++ b/lmdeploy/vl/model/glm_4v.py
@@ -63,8 +63,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         messages.append(dict(role='preprocess', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -85,5 +85,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py
index 6bc6dbdf0..f66ec8447 100644
--- a/lmdeploy/vl/model/internvl.py
+++ b/lmdeploy/vl/model/internvl.py
@@ -245,8 +245,8 @@ def forward(self,
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -279,11 +279,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py
index d97a736b9..3e52d84a9 100644
--- a/lmdeploy/vl/model/llava_hf.py
+++ b/lmdeploy/vl/model/llava_hf.py
@@ -120,8 +120,8 @@ def forward(self,
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -145,11 +145,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
diff --git a/lmdeploy/vl/model/mini_gemeni.py b/lmdeploy/vl/model/mini_gemeni.py
index f7054628c..deb401f78 100644
--- a/lmdeploy/vl/model/mini_gemeni.py
+++ b/lmdeploy/vl/model/mini_gemeni.py
@@ -347,8 +347,8 @@ def forward(self,
         outputs = [x.squeeze() for x in outputs]
         messages.append(dict(role='forward', cotent=outputs))
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -375,5 +375,5 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
diff --git a/lmdeploy/vl/model/minicpmv.py b/lmdeploy/vl/model/minicpmv.py
index 129bc9766..49abde3f3 100644
--- a/lmdeploy/vl/model/minicpmv.py
+++ b/lmdeploy/vl/model/minicpmv.py
@@ -280,11 +280,11 @@ def proc_messages(self, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py
index 779ef4f92..13a6b3a48 100644
--- a/lmdeploy/vl/model/mllama.py
+++ b/lmdeploy/vl/model/mllama.py
@@ -31,8 +31,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         messages.append(dict(role='preprocess', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<|image|>'
@@ -56,5 +56,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
diff --git a/lmdeploy/vl/model/molmo.py b/lmdeploy/vl/model/molmo.py
index 2eec485f0..672595987 100644
--- a/lmdeploy/vl/model/molmo.py
+++ b/lmdeploy/vl/model/molmo.py
@@ -144,7 +144,8 @@ def forward(self,
                              embeddings=embeddings)))
         return messages
 
-    def proc_messages(cls, messages):
+    @staticmethod
+    def proc_messages(messages):
         prompt = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
         for message in messages:
diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py
index a822da1e4..04ae0b50c 100644
--- a/lmdeploy/vl/model/qwen.py
+++ b/lmdeploy/vl/model/qwen.py
@@ -119,8 +119,8 @@ def forward(self,
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -150,11 +150,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py
index cd4ef4b6c..8bac991e3 100644
--- a/lmdeploy/vl/model/qwen2.py
+++ b/lmdeploy/vl/model/qwen2.py
@@ -114,8 +114,8 @@ def forward(self,
         """
         assert 0, 'TODO: support turbomind engine'
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -152,8 +152,8 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         """return to the information needed by pytorch engine."""
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         assert 0, 'TODO: support turbomind engine'
diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py
index 67101216e..2b6f17207 100644
--- a/lmdeploy/vl/model/xcomposer2.py
+++ b/lmdeploy/vl/model/xcomposer2.py
@@ -275,8 +275,8 @@ def forward(self,
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @classmethod
-    def proc_messages(cls, messages, chat_template, sequence_start):
+    @staticmethod
+    def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -300,11 +300,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
     def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
-                                      sequence_start)
+        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                   sequence_start)
 
     def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
                                                  sequence_start)
-        return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
-                                        tokenizer, sequence_start)
+        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
+                                     sequence_start)
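Note, not part of the patch: the documentation tables above drop the `Yes\*` qualifiers and the footnote about head_dim != 128 models (llama3.2-1B, qwen2-0.5B, internvl2-1B), i.e. kv cache 4/8-bit quantization is no longer flagged as a limitation for them. A sketch of how online kv int8/int4 is typically switched on, assuming the standard lmdeploy pipeline API; the model path is a placeholder:

# Sketch only, assuming the standard lmdeploy API; the model path is a placeholder.
from lmdeploy import TurbomindEngineConfig, pipeline

# quant_policy=8 enables online kv int8; use quant_policy=4 for kv int4.
backend_config = TurbomindEngineConfig(quant_policy=8)
pipe = pipeline('meta-llama/Llama-3.2-1B-Instruct', backend_config=backend_config)
print(pipe(['Hello, who are you?']))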