
Commit

Merge branch 'refactor-vl' of github.com:InternLM/lmdeploy into refactor-vl
grimoire committed Dec 12, 2024
2 parents 1b6ea24 + 6a9342e commit d005bc8
Showing 15 changed files with 66 additions and 68 deletions.
8 changes: 3 additions & 5 deletions docs/en/supported_models/supported_models.md
@@ -57,7 +57,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes\* | Yes\* | Yes | Yes |
| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes |
| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - |
| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes |
@@ -72,7 +72,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes |
| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes |
| QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
| QWen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes | Yes |
| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
| QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No |
| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |
@@ -88,8 +88,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - |
| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - |
| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes |
| InternVL2 | 1B-40B | MLLM | Yes | Yes\* | Yes\* | - | - |
| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes |
| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - |
| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
@@ -103,7 +102,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine

```{note}
* Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
* When the head_dim of a model is not 128, such as llama3.2-1B, qwen2-0.5B and internvl2-1B, turbomind doesn't support its kv cache 4/8 bit quantization and inference
```
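As a point of reference for the head_dim constraint mentioned in the note above, head_dim can be read (or derived) from a model's Hugging Face config. A minimal sketch, assuming the standard `transformers` `AutoConfig` fields; the Hub IDs are illustrative examples from the table and some of them are gated:

```python
# Minimal sketch (not part of LMDeploy): derive head_dim from a Hugging Face
# config to see whether TurboMind's 4/8-bit kv-cache quantization applies
# (it requires head_dim == 128). Hub IDs below are illustrative only.
from transformers import AutoConfig

def head_dim_of(model_id: str) -> int:
    cfg = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    # Some configs expose head_dim directly; otherwise derive it.
    head_dim = getattr(cfg, 'head_dim', None)
    if head_dim is None:
        head_dim = cfg.hidden_size // cfg.num_attention_heads
    return head_dim

for model_id in ('meta-llama/Llama-3.2-1B', 'Qwen/Qwen2-0.5B'):
    d = head_dim_of(model_id)
    verdict = 'kv int4/int8 supported' if d == 128 else 'kv int4/int8 not supported'
    print(f'{model_id}: head_dim={d} -> {verdict}')
```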

## PyTorchEngine on Huawei Ascend Platform
3 changes: 1 addition & 2 deletions docs/zh_cn/supported_models/supported_models.md
@@ -18,7 +18,7 @@
| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | No | Yes |
| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes |
| Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes |
| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | No |
@@ -34,7 +34,6 @@
| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - |
| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/base.py
@@ -134,8 +134,8 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
if self.backend == 'turbomind':
raise NotImplementedError()

@classmethod
def collect_images(cls, messages):
@staticmethod
def collect_images(messages):
"""gather all images along with their respective parameters from the
messages and compile them into a single list. Each image is converted
to RGB color space.
@@ -157,8 +157,8 @@ def collect_images(cls, messages):
])
return images

@classmethod
def to_pytorch_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
@staticmethod
def to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start):
"""auxiliary function to pack the preprocessing results in a format
compatible with what is required by pytorch engine.
@@ -196,8 +196,8 @@ def to_pytorch_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,

return dict(prompt=prompt, input_ids=input_ids, multimodal=preps)

@classmethod
def to_turbomind_aux(cls, messages, prompt, IMAGE_TOKEN, tokenizer,
@staticmethod
def to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start):
"""auxiliary function to pack the forwarding results in a format
compatible with what is required by turbomind engine.
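The base.py hunks above change `collect_images`, `to_pytorch_aux`, and `to_turbomind_aux` from classmethods to staticmethods. A simplified, self-contained sketch of the pattern, using toy classes rather than LMDeploy's real ones, showing why subclasses can now call these helpers through `self` instead of `super()`:

```python
# Toy classes, not the real LMDeploy code: helpers that never use cls/self
# become @staticmethod, and subclasses keep calling them through `self`,
# because staticmethods are inherited unchanged.
class VisionModelBase:
    @staticmethod
    def collect_images(messages):
        """Gather (image, params) pairs from multimodal messages (toy version)."""
        images = []
        for msg in messages:
            content = msg.get('content', [])
            if isinstance(content, list):
                images.extend((item['image_url'], item.get('params', {}))
                              for item in content
                              if item.get('type') == 'image_url')
        return images

    @staticmethod
    def to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
        """Pack preprocessing results for the PyTorch engine (toy version)."""
        input_ids = tokenizer.encode(prompt) if tokenizer else []
        return dict(prompt=prompt, input_ids=input_ids, multimodal=messages)


class ToyVLModel(VisionModelBase):
    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
        prompt = '<IMAGE_TOKEN>\nDescribe the image.'
        # `self.to_pytorch_aux(...)` resolves to the inherited staticmethod,
        # so neither `super()` nor an implicit cls argument is involved.
        return self.to_pytorch_aux(messages, prompt, '<IMAGE_TOKEN>', tokenizer,
                                   sequence_start)


print(ToyVLModel().to_pytorch(messages=[], chat_template=None, tokenizer=None,
                              sequence_start=True))
```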
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/cogvlm.py
@@ -42,8 +42,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
messages.append(dict(role='preprocess', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
for message in messages:
@@ -83,5 +83,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/deepseek.py
@@ -138,8 +138,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
# apply chat template to get the prompt
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -182,11 +182,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
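Each adapter's `proc_messages` replaces image entries with an `<IMAGE_TOKEN>` placeholder and then lets the chat template render the final prompt. A rough sketch of that flow; the helper and the fake chat template below are hypothetical stand-ins, not LMDeploy's actual implementation:

```python
# Rough sketch of what the per-model proc_messages helpers do: replace each
# image entry with an <IMAGE_TOKEN> placeholder, then let the chat template
# render the prompt. All names here are illustrative.
IMAGE_TOKEN = '<IMAGE_TOKEN>'

class FakeChatTemplate:
    """Stand-in that only mimics a chat template's messages2prompt interface."""
    def messages2prompt(self, messages, sequence_start):
        return '\n'.join(f"{m['role']}: {m['content']}" for m in messages)

def proc_messages(messages, chat_template, sequence_start):
    prompt_messages = []
    for message in messages:
        content = message['content']
        if isinstance(content, str):          # plain text turn, keep as-is
            prompt_messages.append(message)
            continue
        n_images = sum(1 for item in content if item.get('type') == 'image_url')
        text = ' '.join(item['text'] for item in content
                        if item.get('type') == 'text')
        prompt = f'{IMAGE_TOKEN}\n' * n_images + text
        prompt_messages.append(dict(role=message['role'], content=prompt))
    prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
    return prompt, IMAGE_TOKEN

msgs = [dict(role='user',
             content=[dict(type='image_url', image_url={'url': 'demo.jpg'}),
                      dict(type='text', text='Describe the image.')])]
print(proc_messages(msgs, FakeChatTemplate(), sequence_start=True))
```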
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/glm_4v.py
@@ -63,8 +63,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
messages.append(dict(role='preprocess', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -85,5 +85,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/internvl.py
@@ -245,8 +245,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -279,11 +279,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/llava_hf.py
@@ -120,8 +120,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -145,11 +145,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
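For context, these `to_pytorch`/`to_turbomind` paths are exercised when a VLM is run through LMDeploy's high-level pipeline. A minimal usage sketch; the model ID and image URL are placeholders, and the exact API of your installed LMDeploy version may differ:

```python
# Minimal usage sketch of how these adapters are reached from the user-facing
# API. Model ID and image URL are placeholders; pick any supported VLM.
from lmdeploy import pipeline
from lmdeploy.vl import load_image

pipe = pipeline('llava-hf/llava-1.5-7b-hf')           # placeholder model ID
image = load_image('https://example.com/demo.jpg')    # placeholder URL
response = pipe(('Describe the image.', image))
print(response.text)
```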
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/mini_gemeni.py
@@ -347,8 +347,8 @@ def forward(self,
outputs = [x.squeeze() for x in outputs]
messages.append(dict(role='forward', cotent=outputs))

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -375,5 +375,5 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/minicpmv.py
@@ -280,11 +280,11 @@ def proc_messages(self, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/mllama.py
@@ -31,8 +31,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
messages.append(dict(role='preprocess', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<|image|>'
@@ -56,5 +56,5 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
3 changes: 2 additions & 1 deletion lmdeploy/vl/model/molmo.py
@@ -144,7 +144,8 @@ def forward(self,
embeddings=embeddings)))
return messages

def proc_messages(cls, messages):
@staticmethod
def proc_messages(messages):
prompt = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
for message in messages:
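Besides dropping the explicit `cls` parameter, the molmo.py hunk adds the previously missing `@staticmethod` decorator: the old `proc_messages(cls, messages)` only worked when called through an instance, because `self` happened to fill the `cls` slot. A toy illustration, not the real Molmo class:

```python
# Toy illustration of why the decorator matters.
class Before:
    def proc_messages(cls, messages):   # no decorator: `cls` is really `self`
        return len(messages)

class After:
    @staticmethod
    def proc_messages(messages):        # explicit staticmethod, no implicit arg
        return len(messages)

msgs = [dict(role='user', content='hi')]
assert Before().proc_messages(msgs) == 1   # works only through an instance
assert After().proc_messages(msgs) == 1    # works through an instance...
assert After.proc_messages(msgs) == 1      # ...and through the class itself
# Before.proc_messages(msgs) would bind msgs to `cls` and raise TypeError,
# because `messages` would then be missing.
```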
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/qwen.py
@@ -119,8 +119,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -150,11 +150,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
8 changes: 4 additions & 4 deletions lmdeploy/vl/model/qwen2.py
@@ -114,8 +114,8 @@ def forward(self,
"""
assert 0, 'TODO: support turbomind engine'

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -152,8 +152,8 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
"""return to the information needed by pytorch engine."""
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
assert 0, 'TODO: support turbomind engine'
12 changes: 6 additions & 6 deletions lmdeploy/vl/model/xcomposer2.py
@@ -275,8 +275,8 @@ def forward(self,
messages.append(dict(role='forward', content=outputs))
return messages

@classmethod
def proc_messages(cls, messages, chat_template, sequence_start):
@staticmethod
def proc_messages(messages, chat_template, sequence_start):
"""apply chat template to get the prompt."""
prompt_messages = []
IMAGE_TOKEN = '<IMAGE_TOKEN>'
@@ -300,11 +300,11 @@ def proc_messages(cls, messages, chat_template, sequence_start):
def to_pytorch(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)

def to_turbomind(self, messages, chat_template, tokenizer, sequence_start):
prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template,
sequence_start)
return super().to_turbomind_aux(messages, prompt, IMAGE_TOKEN,
tokenizer, sequence_start)
return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer,
sequence_start)
