
Commit

Merge pull request #2879 from lvhan028/fix-refactor-vl
Fix VLM batch inference error
lvhan028 authored Dec 10, 2024
2 parents 4e2f1f8 + ee022ad commit 02a25eb
Showing 13 changed files with 132 additions and 109 deletions.
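
For context (this sketch is not taken from the PR; the model name and image URLs are placeholders), the kind of batched vision-language call this fix targets looks roughly like:

```python
# Minimal sketch of batched VLM inference with lmdeploy's pipeline API.
# Model name and URLs are illustrative placeholders.
from lmdeploy import pipeline
from lmdeploy.vl import load_image

pipe = pipeline('OpenGVLab/InternVL2-8B')

image_urls = [
    'https://example.com/image_a.jpg',
    'https://example.com/image_b.jpg',
]
# Each (prompt, image) pair becomes one request in the batch.
prompts = [('describe this image', load_image(url)) for url in image_urls]
responses = pipe(prompts)  # one response per pair
for r in responses:
    print(r.text)
```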
3 changes: 3 additions & 0 deletions README.md
@@ -125,6 +125,8 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>Qwen1.5 (0.5B - 110B)</li>
<li>Qwen1.5 - MoE (0.5B - 72B)</li>
<li>Qwen2 (0.5B - 72B)</li>
<li>Qwen2-MoE (57BA14B)</li>
<li>Qwen2.5 (0.5B - 32B)</li>
<li>Baichuan (7B)</li>
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
@@ -136,6 +138,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>Mistral (7B)</li>
<li>DeepSeek-MoE (16B)</li>
<li>DeepSeek-V2 (16B, 236B)</li>
<li>DeepSeek-V2.5 (236B)</li>
<li>Mixtral (8x7B, 8x22B)</li>
<li>Gemma (2B - 7B)</li>
<li>Dbrx (132B)</li>
3 changes: 3 additions & 0 deletions README_ja.md
@@ -122,6 +122,8 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Qwen1.5 (0.5B - 110B)</li>
<li>Qwen1.5 - MoE (0.5B - 72B)</li>
<li>Qwen2 (0.5B - 72B)</li>
<li>Qwen2-MoE (57BA14B)</li>
<li>Qwen2.5 (0.5B - 32B)</li>
<li>Baichuan (7B)</li>
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
@@ -133,6 +135,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Mistral (7B)</li>
<li>DeepSeek-MoE (16B)</li>
<li>DeepSeek-V2 (16B, 236B)</li>
<li>DeepSeek-V2.5 (236B)</li>
<li>Mixtral (8x7B, 8x22B)</li>
<li>Gemma (2B - 7B)</li>
<li>Dbrx (132B)</li>
3 changes: 3 additions & 0 deletions README_zh-CN.md
@@ -126,6 +126,8 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>Qwen1.5 (0.5B - 110B)</li>
<li>Qwen1.5 - MoE (0.5B - 72B)</li>
<li>Qwen2 (0.5B - 72B)</li>
<li>Qwen2-MoE (57BA14B)</li>
<li>Qwen2.5 (0.5B - 32B)</li>
<li>Baichuan (7B)</li>
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
@@ -137,6 +139,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>Mistral (7B)</li>
<li>DeepSeek-MoE (16B)</li>
<li>DeepSeek-V2 (16B, 236B)</li>
<li>DeepSeek-V2.5 (236B)</li>
<li>Mixtral (8x7B, 8x22B)</li>
<li>Gemma (2B - 7B)</li>
<li>Dbrx (132B)</li>
18 changes: 13 additions & 5 deletions docs/en/supported_models/supported_models.md
@@ -10,17 +10,21 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes |
| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes\* | Yes\* | Yes |
| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes |
| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes |
| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes |
| Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | No |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| DeepSeek-V2 | 16B, 236B | LLM | Yes | Yes | Yes | No |
| DeepSeek-V2.5 | 236B | LLM | Yes | Yes | Yes | No |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes |
@@ -29,7 +33,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes |
| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes |
| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
@@ -41,7 +45,8 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
"-" means not verified yet.

```{note}
The TurboMind engine doesn't support window attention. Therefore, for models that have applied window attention and have the corresponding switch "use_sliding_window" enabled, such as Mistral, Qwen1.5 and etc., please choose the PyTorch engine for inference.
* The TurboMind engine doesn't support window attention. Therefore, for models that use window attention and have the corresponding switch "use_sliding_window" enabled, such as Mistral and Qwen1.5, please choose the PyTorch engine for inference (see the engine-selection sketch after this note).
* When a model's head_dim is not 128, as in llama3.2-1B, qwen2-0.5B and internvl2-1B, TurboMind does not support 4-bit/8-bit quantization of its kv cache or inference with it.
```
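
As a rough illustration of the note above — not part of this commit, and with model names chosen only as examples — explicit engine selection might look like this:

```python
# Sketch only: model names and settings are illustrative assumptions,
# not prescribed by this PR.
from lmdeploy import pipeline, PytorchEngineConfig, TurbomindEngineConfig

# A sliding-window model (e.g. Mistral) -> prefer the PyTorch engine.
pipe_sw = pipeline('mistralai/Mistral-7B-Instruct-v0.3',
                   backend_config=PytorchEngineConfig())

# A head_dim == 128 model can use TurboMind with an 8-bit (or 4-bit) kv cache.
pipe_kv = pipeline('internlm/internlm2_5-7b-chat',
                   backend_config=TurbomindEngineConfig(quant_policy=8))
```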

## PyTorchEngine on CUDA Platform
@@ -68,11 +73,13 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes |
| QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
| QWen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No |
| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |
| DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No |
| DeepSeek-V2.5 | 236B | LLM | Yes | No | No | No | No |
| MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No |
| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes |
| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes |
| Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No |
| Dbrx | 132B | LLM | Yes | Yes | Yes | No | No |
| StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No |
@@ -83,6 +90,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes |
| InternVL2 | 1B-40B | MLLM | Yes | Yes\* | Yes\* | - | - |
| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - |
| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - |
15 changes: 11 additions & 4 deletions docs/zh_cn/supported_models/supported_models.md
@@ -18,9 +18,13 @@
| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes |
| Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | No |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| DeepSeek-V2 | 16B, 236B | LLM | Yes | Yes | Yes | No |
| DeepSeek-V2.5 | 236B | LLM | Yes | Yes | Yes | No |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes |
@@ -30,6 +34,7 @@
| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - |
| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
@@ -69,11 +74,13 @@
| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes |
| QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No |
| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |
| DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No |
| DeepSeek-V2.5 | 236B | LLM | Yes | No | No | No | No |
| MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No |
| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes |
| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes |
| Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No |
| Dbrx | 132B | LLM | Yes | Yes | Yes | No | No |
| StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No |
@@ -82,7 +89,7 @@
| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - |
| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - |
| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes |
| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - |
| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
@@ -95,7 +102,7 @@
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |

```{note}
* Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
* 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16。
```

## PyTorchEngine 华为昇腾平台
2 changes: 2 additions & 0 deletions lmdeploy/serve/vl_async_engine.py
@@ -124,6 +124,8 @@ async def async_convert_to_pil_images(cls,
def _inner_call(i, in_messages, out_messages):
    role = in_messages[i]['role']
    content = in_messages[i]['content']
    assert role in ['system', 'user', 'assistant'], \
        f'unsupported role "{role}"'
    if role != 'user' or isinstance(content, str):
        # the content is a system/assistant prompt or a plain-text user
        # prompt; return it directly
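
For reference, a hedged sketch of the message layout `_inner_call` iterates over (field names such as `image_url` follow the OpenAI-style VLM message format lmdeploy accepts; the URL is a placeholder):

```python
# Only 'system', 'user' and 'assistant' roles pass the assertion above;
# a user turn may carry a list mixing text and image items.
messages = [
    dict(role='system', content='You are a helpful assistant.'),
    dict(role='user',
         content=[
             dict(type='text', text='describe this image'),
             dict(type='image_url',
                  image_url=dict(url='https://example.com/cat.png')),
         ]),
]
```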
20 changes: 13 additions & 7 deletions lmdeploy/vl/model/deepseek.py
@@ -153,22 +153,28 @@ def proc_messages(cls, messages, chat_template, sequence_start):
x['text'] for x in message['content'] if x['type'] == 'text'
]
content = content[0]
if IMAGE_TOKEN not in content:
n_image = sum(
[1 for x in message['content'] if x['type'] == 'image'])
n_placeholder = content.count(IMAGE_TOKEN)
if n_placeholder == 0:
logger.warning(
f"""for deepseek-vl model, the user should insert the {IMAGE_TOKEN}
to user prompt manually, please read https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html
for more details.""") # noqa
n_images = len(
[1 for x in message['content'] if x['type'] == 'image'])
if n_images == 1:
if n_placeholder != 0 and n_placeholder != n_image:
logger.error(
f'unmatched placeholder and image: {n_placeholder} vs '
f'{n_image}. Ignore the placeholder')
content = content.replace(IMAGE_TOKEN, '')
n_placeholder = 0
if n_placeholder == 0:
if n_image == 1:
content = f'{IMAGE_TOKEN}{content}'
else:
content = ''.join([
f'{IMAGE_TOKEN} is Figure {str(i)}.\n'
for i in range(n_images)
for i in range(n_image)
]) + content
else:
logger.error('TODO deepseek-vl')
prompt_messages.append(dict(role='user', content=content))
prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
return prompt, IMAGE_TOKEN
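
A rough sketch of the prompts the placeholder-matching logic above produces or expects (assuming lmdeploy's `IMAGE_TOKEN` constant; the prompt wording is illustrative):

```python
# Assumption: IMAGE_TOKEN is lmdeploy's image placeholder string.
from lmdeploy.vl.constants import IMAGE_TOKEN

# One image and no placeholder: the token is prepended automatically.
single = f'{IMAGE_TOKEN}describe this image'

# Several images and no placeholder: one labeled token per image,
# mirroring the fallback branch above.
multi = ''.join(f'{IMAGE_TOKEN} is Figure {i}.\n' for i in range(2)) \
        + 'compare the two figures'
```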
27 changes: 19 additions & 8 deletions lmdeploy/vl/model/glm_4v.py
@@ -39,16 +39,27 @@ def build_preprocessor(self):

def preprocess(self, messages: List[Dict]) -> List[Dict]:
"""refers to the spec of `super.preprocess()"""
images = self.collect_images(messages)
outputs = []
for image, params in images:
image = image.convert('RGB')
pixel_values = self.image_transform(image)
outputs.append(
dict(pixel_values=pixel_values,
image_size=image.size,
for message in messages:
if not isinstance(message['content'], List):
continue
images = [
x['image'] for x in message['content'] if x['type'] == 'image'
]
if len(images) > 1:
logger.warning(
f'glm4v does not support the input of multiple images'
f' in a single chat round, but got {len(images)} images.')
# we still pass all the images to the model and let the
# model decide what to do
images = [x.convert('RGB') for x in images]
pixel_values = [self.image_transform(x) for x in images]
outputs.extend([
dict(pixel_values=_2,
image_size=_1.size,
image_tokens=self.n_token_per_image,
image_token_id=0))
image_token_id=0) for _1, _2 in zip(images, pixel_values)
])
messages.append(dict(role='preprocess', content=outputs))
return messages

7 changes: 2 additions & 5 deletions lmdeploy/vl/model/internvl.py
@@ -175,10 +175,7 @@ def _forward_v1_5(self, inputs, max_batch_size):
pixel_values = [
x['pixel_values'] for x in inputs[idx:idx + max_batch_size]
]
split = [
x['pixel_values'].shape[0]
for x in inputs[idx:idx + max_batch_size]
]
split = [x.shape[0] for x in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
pixel_values = pixel_values.to(self.model.device,
dtype=torch.float16)
@@ -202,7 +199,7 @@ def _forward(self, inputs, max_batch_size):
pixel_values = [
x['pixel_values'] for x in inputs[idx:idx + max_batch_size]
]
pixel_values = torch.cat(outputs, dim=0)
pixel_values = torch.cat(pixel_values, dim=0)
pixel_values = pixel_values.to(self.model.device,
dtype=torch.float16)
logger.info(f'vision forward shape: {pixel_values.shape}')
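
The `_forward_v1_5` fix batches per-image tensors with `torch.cat` and records per-image tile counts in `split`; in isolation, the pattern it relies on is roughly:

```python
# Standalone sketch of the concat/split batching pattern; shapes are examples.
import torch

pixel_values = [torch.randn(7, 3, 448, 448),   # image 1 -> 7 tiles
                torch.randn(2, 3, 448, 448)]   # image 2 -> 2 tiles
split = [x.shape[0] for x in pixel_values]     # [7, 2]

batch = torch.cat(pixel_values, dim=0)         # (9, 3, 448, 448)
feats = batch.flatten(1)                       # stand-in for the vision tower
per_image = torch.split(feats, split, dim=0)   # one tensor per original image
assert [t.shape[0] for t in per_image] == split
```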
12 changes: 3 additions & 9 deletions lmdeploy/vl/model/llava_next.py
@@ -4,7 +4,6 @@
from typing import Dict, List

import torch
from transformers import AutoProcessor

from lmdeploy.utils import get_logger
from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
@@ -20,12 +19,7 @@ class LlavaNextVisionModel(LlavaHfVisionModel):
_arch = 'LlavaNextForConditionalGeneration'

def build_preprocessor(self):
processor = AutoProcessor.from_pretrained(self.model_path,
trust_remote_code=True)
if hasattr(processor, 'tokenizer'):
del processor.tokenizer
processor.prtokenizer = None
self.processor = processor.image_processor
super().build_preprocessor()
# build the model with empty weights. The model will be used in
# `preprocess` to get the image token number
from accelerate import init_empty_weights
@@ -94,10 +88,10 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
patch_size=self.hf_config.vision_config.image_size,
) for imsize in result['image_sizes']
]
# TODO(remove hardcode 576)

hidden_size = self.hf_config.text_config.hidden_size
fake_image_features = torch.zeros(
[image_num_patches[0], 576, hidden_size])
[image_num_patches[0], self.n_token_per_image, hidden_size])
image_sizes = result['image_sizes']
image_newline = torch.randn(self.hf_config.text_config.hidden_size)
strategy = self.hf_config.vision_feature_select_strategy
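
The hardcoded 576 that this change removes is the token count of one specific vision tower (a 336x336 CLIP ViT with 14x14 patches); `n_token_per_image` generalizes it. A back-of-the-envelope check, with the 336/14 figures assumed rather than read from this PR:

```python
# Why 576 only matched one configuration: 336x336 input, 14x14 patches.
image_size, patch_size = 336, 14
n_token_per_image = (image_size // patch_size) ** 2
print(n_token_per_image)  # 576
```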