
Commit

Merge branch 'refactor-vl' of github.com:InternLM/lmdeploy into refactor-vl
grimoire committed Dec 12, 2024
2 parents 1b6ea24 + 6a9342e commit d005bc8
Showing 15 changed files with 66 additions and 68 deletions.
8 changes: 3 additions & 5 deletions docs/en/supported_models/supported_models.md
@@ -57,7 +57,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes\* | Yes\* | Yes | Yes |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - |
| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
@@ -72,7 +72,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes |
| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes |
| QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
| QWen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes | Yes |
| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No |
| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |
@@ -88,8 +88,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - |
| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - |
| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes |
| InternVL2 | 1B-40B | MLLM | Yes | Yes\* | Yes\* | - | - |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes |
| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - |
| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
@@ -103,7 +102,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine

```{note}
* Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
* When the head_dim of a model is not 128, such as llama3.2-1B, qwen2-0.5B and internvl2-1B, turbomind doesn't support its kv cache 4/8 bit quantization and inference
```
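As a point of reference for the head_dim constraint mentioned in the note above, head_dim can be read (or derived) from a model's Hugging Face config. A minimal sketch, assuming the standard `transformers` `AutoConfig` fields; the Hub IDs are illustrative examples from the table and some of them are gated:

```python
# Minimal sketch (not part of LMDeploy): derive head_dim from a Hugging Face
# config to see whether TurboMind's 4/8-bit kv-cache quantization applies
# (it requires head_dim == 128). Hub IDs below are illustrative only.
from transformers import AutoConfig

def head_dim_of(model_id: str) -> int:
    cfg = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    # Some configs expose head_dim directly; otherwise derive it.
    head_dim = getattr(cfg, 'head_dim', None)
    if head_dim is None:
        head_dim = cfg.hidden_size // cfg.num_attention_heads
    return head_dim

for model_id in ('meta-llama/Llama-3.2-1B', 'Qwen/Qwen2-0.5B'):
    d = head_dim_of(model_id)
    verdict = 'kv int4/int8 supported' if d == 128 else 'kv int4/int8 not supported'
    print(f'{model_id}: head_dim={d} -> {verdict}')
```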

## PyTorchEngine on Huawei Ascend Platform
3 changes: 1 addition & 2 deletions docs/zh_cn/supported_models/supported_models.md
@@ -18,7 +18,7 @@
| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes |
| Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | No |
@@ -34,7 +34,6 @@
| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - |
| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/base.py
@@ -134,8 +134,8 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
if self.backend == 'turbomind':
raise NotImplementedError()

@classmethod
def collect_images(cls, messages):
@staticmethod
def collect_images(messages):
"""gather all images along with their respective parameters from the
messages and compile them into a single list. Each image is converted
to RGB color space.
@@ -157,8 +157,8 @@ def collect_images(cls, messages):
])
return images

@classmethod
def to_pytorch_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
@staticmethod
def to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start):
"""auxiliary function to pack the preprocessing results in a format
compatible with what is required by pytorch engine.
@@ -196,8 +196,8 @@ def to_pytorch_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,

return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)

@classmethod
def to_turbomind_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
@staticmethod
def to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start):
"""auxiliary function to pack the forwarding results in a format
compatible with what is required by turbomind engine.
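The base.py hunks above change `collect_images`, `to_pytorch_aux`, and `to_turbomind_aux` from classmethods to staticmethods. A simplified, self-contained sketch of the pattern, using toy classes rather than LMDeploy's real ones, showing why subclasses can now call these helpers through `self` instead of `super()`:

```python
# Toy classes, not the real LMDeploy code: helpers that never use cls/self
# become @staticmethod, and subclasses keep calling them through `self`,
# because staticmethods are inherited unchanged.
class VisionModelBase:
    @staticmethod
    def collect_images(messages):
        """Gather (image, params) pairs from multimodal messages (toy version)."""
        images = []
        for msg in messages:
            content = msg.get('content', [])
            if isinstance(content, list):
                images.extend((item['image_url'], item.get('params', {}))
                              for item in content
                              if item.get('type') == 'image_url')
        return images

    @staticmethod
    def to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
        """Pack preprocessing results for the PyTorch engine (toy version)."""
        input_ids = tokenizer.encode(prompt) if tokenizer else []
        return dict(prompt=prompt, input_ids=input_ids, multimodal=messages)


class ToyVLModel(VisionModelBase):
    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
        prompt = '<IMAGE_TOKEN>\nDescribe the image.'
        # `self.to_pytorch_aux(...)` resolves to the inherited staticmethod,
        # so neither `super()` nor an implicit cls argument is involved.
        return self.to_pytorch_aux(messages, prompt, '<IMAGE_TOKEN>', tokenizer,
                                   sequence_start)


print(ToyVLModel().to_pytorch(messages=[], chat_template=None, tokenizer=None,
                              sequence_start=True))
```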
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/cogvlm.py
@@ -42,8 +42,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
messages.append(dict(role='preprocess', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
for message in messages:
@@ -83,5 +83,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/deepseek.py
@@ -138,8 +138,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
# apply chat template to get the prompt
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -182,11 +182,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
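Each adapter's `proc_messages` replaces image entries with an `<IMAGE_TOKEN>` placeholder and then lets the chat template render the final prompt. A rough sketch of that flow; the helper and the fake chat template below are hypothetical stand-ins, not LMDeploy's actual implementation:

```python
# Rough sketch of what the per-model proc_messages helpers do: replace each
# image entry with an <IMAGE_TOKEN> placeholder, then let the chat template
# render the prompt. All names here are illustrative.
IMAGE_TOKEN = '<IMAGE_TOKEN>'

class FakeChatTemplate:
    """Stand-in that only mimics a chat template's messages2prompt interface."""
    def messages2prompt(self, messages, sequence_start):
        return '\n'.join(f"{m['role']}: {m['content']}" for m in messages)

def proc_messages(messages, chat_template, sequence_start):
    prompt_messages = []
    for message in messages:
        content = message['content']
        if isinstance(content, str):          # plain text turn, keep as-is
            prompt_messages.append(message)
            continue
        n_images = sum(1 for item in content if item.get('type') == 'image_url')
        text = ' '.join(item['text'] for item in content
                        if item.get('type') == 'text')
        prompt = f'{IMAGE_TOKEN}\n' * n_images + text
        prompt_messages.append(dict(role=message['role'], content=prompt))
    prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
    return prompt, IMAGE_TOKEN

msgs = [dict(role='user',
             content=[dict(type='image_url', image_url={'url': 'demo.jpg'}),
                      dict(type='text', text='Describe the image.')])]
print(proc_messages(msgs, FakeChatTemplate(), sequence_start=True))
```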
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/glm_4v.py
@@ -63,8 +63,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
messages.append(dict(role='preprocess', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -85,5 +85,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/internvl.py
@@ -245,8 +245,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -279,11 +279,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/llava_hf.py
@@ -120,8 +120,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -145,11 +145,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
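For context, these `to_pytorch`/`to_turbomind` paths are exercised when a VLM is run through LMDeploy's high-level pipeline. A minimal usage sketch; the model ID and image URL are placeholders, and the exact API of your installed LMDeploy version may differ:

```python
# Minimal usage sketch of how these adapters are reached from the user-facing
# API. Model ID and image URL are placeholders; pick any supported VLM.
from lmdeploy import pipeline
from lmdeploy.vl import load_image

pipe = pipeline('llava-hf/llava-1.5-7b-hf')           # placeholder model ID
image = load_image('https://example.com/demo.jpg')    # placeholder URL
response = pipe(('Describe the image.', image))
print(response.text)
```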
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/mini_gemeni.py
@@ -347,8 +347,8 @@ def forward(self,
outputs = [x.squeeze() for x in outputs]
messages.append(dict(role='forward', cotent=outputs))

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -375,5 +375,5 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/minicpmv.py
@@ -280,11 +280,11 @@ def proc_messages(self, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/mllama.py
@@ -31,8 +31,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
messages.append(dict(role='preprocess', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<|image|>'
@@ -56,5 +56,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
3 changes: 2 additions & 1 deletion lmdeploy/vl/model/molmo.py
@@ -144,7 +144,8 @@ def forward(self,
embeddings=embeddings)))
return messages

def proc_messages(cls, messages):
@staticmethod
def proc_messages(messages):
prompt = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
for message in messages:
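Besides dropping the explicit `cls` parameter, the molmo.py hunk adds the previously missing `@staticmethod` decorator: the old `proc_messages(cls, messages)` only worked when called through an instance, because `self` happened to fill the `cls` slot. A toy illustration, not the real Molmo class:

```python
# Toy illustration of why the decorator matters.
class Before:
    def proc_messages(cls, messages):   # no decorator: `cls` is really `self`
        return len(messages)

class After:
    @staticmethod
    def proc_messages(messages):        # explicit staticmethod, no implicit arg
        return len(messages)

msgs = [dict(role='user', content='hi')]
assert Before().proc_messages(msgs) == 1   # works only through an instance
assert After().proc_messages(msgs) == 1    # works through an instance...
assert After.proc_messages(msgs) == 1      # ...and through the class itself
# Before.proc_messages(msgs) would bind msgs to `cls` and raise TypeError,
# because `messages` would then be missing.
```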
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/qwen.py
@@ -119,8 +119,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -150,11 +150,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/qwen2.py
@@ -114,8 +114,8 @@ def forward(self,
"""
assert 0, 'TODO: support turbomind engine'

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -152,8 +152,8 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
"""return to the information needed by pytorch engine."""
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
assert 0, 'TODO: support turbomind engine'
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/xcomposer2.py
@@ -275,8 +275,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -300,11 +300,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
