
Commit

Merge branch 'InternLM:main' into tzy/fix_vl
jinminxi104 authored Dec 15, 2024
2 parents 0d494f8 + 96e82eb commit 2a2895f
Showing 68 changed files with 7,597 additions and 3,251 deletions.
10 changes: 8 additions & 2 deletions docs/en/multi_modal/llava.md
@@ -6,11 +6,17 @@ LMDeploy supports the following llava series of models, which are detailed in the table below:
| :----------------------------------: | :--: | :------------------------: |
| llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch |
| llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch |
| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch |
| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind, PyTorch |
| llava-hf/llava-v1.6-mistral-7b-hf | 7B | PyTorch |
| llava-hf/llava-v1.6-vicuna-7b-hf | 7B | PyTorch |
| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind |
| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind |

The next chapter demonstrates how to deploy a LLaVA model using LMDeploy, with [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example.

```{note}
The PyTorch engine removed support for the original llava models after v0.6.4. Please use the corresponding transformers models instead, which can be found at https://huggingface.co/llava-hf
```

## Installation

Please install LMDeploy by following the [installation guide](../get_started/installation.md).
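A minimal offline-inference sketch for one of the llava-hf checkpoints listed above. The image URL is a placeholder; `pipeline` and `load_image` are LMDeploy's standard VLM entry points, and the prompt/image tuple follows the usual pipeline usage.

```python
from lmdeploy import pipeline
from lmdeploy.vl import load_image

# Build a VLM pipeline from one of the llava-hf checkpoints in the table above.
pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf')

# Any reachable image URL or local path works; this one is a placeholder.
image = load_image('https://example.com/tiger.jpeg')

response = pipe(('describe this image', image))
print(response)
```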
2 changes: 1 addition & 1 deletion docs/en/multi_modal/qwen2_vl.md
@@ -4,7 +4,7 @@ LMDeploy supports the following Qwen-VL series of models, which are detailed in the table below:

| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, Pytorch |
| Qwen-VL-Chat | - | TurboMind |
| Qwen2-VL | 2B, 7B | PyTorch |

The next chapter demonstrates how to deploy a Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example.
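Since Qwen2-VL is listed as PyTorch-only, a minimal sketch that selects the PyTorch engine explicitly; `session_len=8192` is just an illustrative setting, not something this commit prescribes.

```python
from lmdeploy import PytorchEngineConfig, pipeline
from lmdeploy.vl import load_image

# Qwen2-VL is served by the PyTorch engine, so pin the backend explicitly.
pipe = pipeline('Qwen/Qwen2-VL-7B-Instruct',
                backend_config=PytorchEngineConfig(session_len=8192))

image = load_image('https://example.com/demo.jpeg')  # placeholder URL
print(pipe(('describe this image', image)))
```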
10 changes: 8 additions & 2 deletions docs/zh_cn/multi_modal/llava.md
@@ -6,11 +6,17 @@ LMDeploy supports the following LLaVA series of models, as shown in the table below:
| :----------------------------------: | :--: | :----------------: |
| llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch |
| llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch |
| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch |
| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind, PyTorch |
| llava-hf/llava-v1.6-mistral-7b-hf | 7B | PyTorch |
| llava-hf/llava-v1.6-vicuna-7b-hf | 7B | PyTorch |
| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind |
| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind |

The following sections demonstrate how to deploy a LLaVA model using LMDeploy, taking [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example.

```{note}
After v0.6.4, the PyTorch engine removed support for the original llava models. We recommend using the corresponding transformers-format models instead, which can be found at https://huggingface.co/llava-hf
```

## Installation

Please install LMDeploy by following the [installation guide](../get_started/installation.md).
2 changes: 1 addition & 1 deletion docs/zh_cn/multi_modal/qwen2_vl.md
@@ -4,7 +4,7 @@ LMDeploy supports the Qwen-VL series of models, as detailed below:

| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, Pytorch |
| Qwen-VL-Chat | - | TurboMind |
| Qwen2-VL | 2B, 7B | PyTorch |

This article takes [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example to demonstrate how to deploy the Qwen2-VL series of models with LMDeploy.
31 changes: 17 additions & 14 deletions lmdeploy/lite/apis/calibrate.py
@@ -239,20 +239,23 @@ def calibrate(model: str,

model_type, _ = get_task(model)
make_compatible_internvl_config(model)
if model_type == 'llm':
# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model,
trust_remote_code=True)

model = load_hf_from_pretrained(model,
torch_dtype=torch.float16,
trust_remote_code=True)
vl_model = None
elif model_type == 'vlm':
from lmdeploy.vl.model.builder import vl_model_with_tokenizer
vl_model, model, tokenizer = vl_model_with_tokenizer(model_path=model)

model.config.use_cache = False

# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)

model = load_hf_from_pretrained(model,
torch_dtype=torch.float16,
trust_remote_code=True)
vl_model = None
if model_type == 'vlm':
vl_model = model
if hasattr(model, 'language_model'):
model = model.language_model
if hasattr(model, 'llm'):
model = model.llm
model.config.use_cache = False
model = model.half().eval()

model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(
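The calibrate() change above unifies the LLM and VLM paths: the checkpoint is loaded once, the full model is kept as `vl_model`, and the inner text decoder is unwrapped before calibration. A rough standalone sketch of that unwrapping pattern follows; loading through `AutoModelForCausalLM` is an assumption for illustration, whereas the real code goes through LMDeploy's `load_hf_from_pretrained`.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_for_calibration(model_path: str):
    """Hypothetical helper mirroring the unwrapping logic in calibrate()."""
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 torch_dtype=torch.float16,
                                                 trust_remote_code=True)
    vl_model = model  # keep a handle on the full (possibly multimodal) model
    # Many VLM wrappers expose the text decoder as `.language_model` or `.llm`;
    # calibration statistics are collected on that inner module only.
    if hasattr(model, 'language_model'):
        model = model.language_model
    if hasattr(model, 'llm'):
        model = model.llm
    model.config.use_cache = False
    model = model.half().eval()
    return vl_model, model, tokenizer
```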
3 changes: 3 additions & 0 deletions lmdeploy/pytorch/backends/attention.py
@@ -34,6 +34,7 @@ def __init__(
alibi: bool = None,
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
**kwargs,
) -> None:
if scale is None:
@@ -53,6 +54,7 @@ def __init__(
self.alibi = alibi
self.sliding_window = sliding_window
self.logit_softcapping = logit_softcapping
self.causal = causal

@abstractmethod
def forward(
@@ -82,6 +84,7 @@ def build(
alibi: bool = False,
sliding_window: int = None,
logical_softcapping: float = None,
causal: bool = True,
**kwargs,
) -> AttentionImpl[T]:
"""build."""
3 changes: 2 additions & 1 deletion lmdeploy/pytorch/backends/base.py
@@ -12,7 +12,8 @@

class OpType(Enum):
"""Layer type enumerate."""
Attention = auto()
PagedAttention = auto()
FlashAttention = auto()
Linear = auto()
RotaryEmbedding = auto()
ApplyRotaryEmb = auto()
6 changes: 6 additions & 0 deletions lmdeploy/pytorch/backends/cuda/attention.py
@@ -41,6 +41,7 @@ def __init__(
alibi: bool = False,
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
**kwargs,
):
super().__init__(
@@ -52,8 +53,10 @@ def __init__(
alibi=alibi,
sliding_window=sliding_window,
logit_softcapping=logit_softcapping,
causal=causal,
**kwargs,
)
assert not (alibi and not causal)

from lmdeploy.pytorch.kernels.cuda import (alibi_paged_attention_fwd,
fill_kv_cache,
@@ -172,6 +175,7 @@ def forward(
window_size=self.sliding_window,
sm_scale=self.scale,
logit_softcapping=self.logit_softcapping,
causal=self.causal,
)
else:
self.alibi_paged_attention_fwd(
@@ -207,6 +211,7 @@ def build(
alibi: bool = False,
sliding_window: int = None,
logical_softcapping: float = None,
causal: bool = True,
**kwargs,
) -> TritonAttentionImpl:
"""build."""
@@ -218,4 +223,5 @@
alibi=alibi,
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
causal=causal,
**kwargs)
101 changes: 101 additions & 0 deletions lmdeploy/pytorch/backends/cuda/flash_attention.py
@@ -0,0 +1,101 @@
# Copyright (c) OpenMMLab. All rights reserved.
from torch import Tensor

from ..flash_attention import FlashAttentionBuilder, FlashAttentionImpl


class TritonFlashAttentionImpl(FlashAttentionImpl):
"""triton flash attention implementation."""

def __init__(
self,
num_heads: int,
head_dim: int,
scale: float = None,
num_kv_heads: int = None,
v_head_dim: int = None,
causal: bool = True,
sliding_window: int = None,
logical_softcapping: float = None,
):
if scale is None:
scale = 1.0 / (head_dim**0.5)

if num_kv_heads is None:
num_kv_heads = num_heads

if v_head_dim is None:
v_head_dim = head_dim

self.num_heads = num_heads
self.head_dim = head_dim
self.scale = scale
self.num_kv_heads = num_kv_heads
self.v_head_dim = v_head_dim
self.causal = causal
self.sliding_window = sliding_window
self.logical_softcapping = logical_softcapping

from lmdeploy.pytorch.kernels.cuda import flash_attention_fwd
self.flash_attention_fwd = flash_attention_fwd

def forward(self,
query: Tensor,
key: Tensor,
value: Tensor,
q_start_loc: Tensor,
q_seqlens: Tensor,
kv_start_loc: Tensor,
kv_seqlens: Tensor,
max_q_seqlen: int = None):
"""forward."""

q_shape = query.shape
o_shape = q_shape[:-1] + (self.v_head_dim, )
out = query.new_empty(o_shape)
self.flash_attention_fwd(
query,
key,
value,
out,
q_start_loc=q_start_loc,
q_seqlens=q_seqlens,
kv_start_loc=kv_start_loc,
kv_seqlens=kv_seqlens,
max_seqlen=max_q_seqlen,
window_size=self.sliding_window,
sm_scale=self.scale,
logit_softcapping=self.logical_softcapping,
causal=self.causal,
kv_layout='shd',
)

return out


class TritonFlashAttentionBuilder(FlashAttentionBuilder):
"""triton attention builder."""

@staticmethod
def build(
num_heads: int,
head_dim: int,
scale: float = None,
num_kv_heads: int = None,
v_head_dim: int = None,
causal: bool = True,
sliding_window: int = None,
logical_softcapping: float = None,
**kwargs,
) -> FlashAttentionImpl:
"""build."""
return TritonFlashAttentionImpl(
num_heads=num_heads,
head_dim=head_dim,
scale=scale,
num_kv_heads=num_kv_heads,
v_head_dim=v_head_dim,
causal=causal,
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
)
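A possible usage sketch for the new builder. The packed `(tokens, heads, head_dim)` layout for query/key/value is an assumption based on the `kv_layout='shd'` argument, and running it requires a CUDA device with Triton available.

```python
import torch

from lmdeploy.pytorch.backends.cuda.flash_attention import \
    TritonFlashAttentionBuilder

num_heads, num_kv_heads, head_dim = 8, 8, 128
attn = TritonFlashAttentionBuilder.build(num_heads=num_heads,
                                         head_dim=head_dim,
                                         num_kv_heads=num_kv_heads,
                                         causal=True)

# Two sequences of lengths 5 and 3, packed along the token dimension.
q_seqlens = torch.tensor([5, 3], device='cuda')
kv_seqlens = q_seqlens.clone()
q_start_loc = q_seqlens.cumsum(0) - q_seqlens  # exclusive prefix sum: [0, 5]
kv_start_loc = q_start_loc.clone()
total = int(q_seqlens.sum())

query = torch.randn(total, num_heads, head_dim,
                    device='cuda', dtype=torch.float16)
key = torch.randn(total, num_kv_heads, head_dim,
                  device='cuda', dtype=torch.float16)
value = torch.randn(total, num_kv_heads, head_dim,
                    device='cuda', dtype=torch.float16)

out = attn.forward(query, key, value,
                   q_start_loc=q_start_loc,
                   q_seqlens=q_seqlens,
                   kv_start_loc=kv_start_loc,
                   kv_seqlens=kv_seqlens,
                   max_q_seqlen=int(q_seqlens.max()))
print(out.shape)  # (total_tokens, num_heads, v_head_dim)
```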
51 changes: 27 additions & 24 deletions lmdeploy/pytorch/backends/cuda/op_backend.py
@@ -23,9 +23,12 @@ def get_name() -> str:
@classmethod
def get_layer_impl_builder(cls, layer_type: OpType):
"""get cuda layer builder."""
if layer_type == OpType.Attention:
if layer_type == OpType.PagedAttention:
from .attention import TritonAttentionBuilder
return TritonAttentionBuilder
elif layer_type == OpType.FlashAttention:
from .flash_attention import TritonFlashAttentionBuilder
return TritonFlashAttentionBuilder
elif layer_type == OpType.ApplyRotaryEmb:
from .apply_rotary_emb import TritonApplyRotaryEmbBuilder
return TritonApplyRotaryEmbBuilder
@@ -125,30 +128,30 @@ def update_step_context(cls, step_context):
quant_policy=step_context.kv_quant_policy,
)

cross_attn_metadata = None
fill_seqlens = None
if step_context.cross_attention_states is not None:
fill_seqlens = torch.zeros_like(q_seqlens)
for idx, state in enumerate(step_context.cross_attention_states):
if state is not None:
fill_seqlens[idx] = state.shape[-2]
cross_seqlens = step_context.cross_seqlens
cross_kv_seqlens = step_context.cross_kv_seqlens
cross_kv_start_loc = None
cross_kv_flatten_size = None
if not step_context.is_decoding and cross_kv_seqlens is not None:
cross_kv_start_loc = cross_kv_seqlens.cumsum(0) - cross_kv_seqlens
cross_kv_flatten_size = cross_kv_seqlens.sum().item()
cross_attn_metadata = attn_meta_cls(
step_context.is_decoding,
step_context.block_offsets,
q_start_loc=q_start_loc,
q_seqlens=q_seqlens,
kv_start_loc=cross_kv_start_loc,
kv_seqlens=cross_kv_seqlens,
kv_flatten_size=cross_kv_flatten_size,
fill_seqlens=fill_seqlens,
quant_policy=step_context.kv_quant_policy,
)
cross_attn_metadata = None
if cross_seqlens is not None:
fill_seqlens = cross_seqlens
if fill_seqlens.sum().item() == 0:
fill_seqlens = None
cross_kv_start_loc = None
cross_kv_flatten_size = None
if not step_context.is_decoding and cross_kv_seqlens is not None:
cross_kv_start_loc = cross_kv_seqlens.cumsum(
0) - cross_kv_seqlens
cross_kv_flatten_size = cross_kv_seqlens.sum().item()
cross_attn_metadata = attn_meta_cls(
step_context.is_decoding,
step_context.block_offsets,
q_start_loc=q_start_loc,
q_seqlens=q_seqlens,
kv_start_loc=cross_kv_start_loc,
kv_seqlens=cross_kv_seqlens,
kv_flatten_size=cross_kv_flatten_size,
fill_seqlens=fill_seqlens,
quant_policy=step_context.kv_quant_policy,
)

step_context.attn_metadata = attn_metadata
step_context.cross_attn_metadata = cross_attn_metadata
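A tiny standalone illustration of the start-offset bookkeeping used in the cross-attention branch above; the values are arbitrary and only show the exclusive cumulative sum.

```python
import torch

# Tokens contributed by each sequence's cross-attention KV states.
cross_kv_seqlens = torch.tensor([3, 0, 5])

# Exclusive prefix sum gives each sequence's start offset in the flattened KV.
cross_kv_start_loc = cross_kv_seqlens.cumsum(0) - cross_kv_seqlens
cross_kv_flatten_size = cross_kv_seqlens.sum().item()

print(cross_kv_start_loc.tolist())  # [0, 3, 3]
print(cross_kv_flatten_size)        # 8
```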
5 changes: 5 additions & 0 deletions lmdeploy/pytorch/backends/dlinfer/attention.py
@@ -31,8 +31,10 @@ def __init__(
alibi: bool = None,
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
**kwargs,
):
assert causal
super().__init__(
num_heads,
head_size,
@@ -42,6 +44,7 @@
alibi,
sliding_window,
logit_softcapping,
causal=causal,
**kwargs,
)

@@ -152,6 +155,7 @@ def build(
alibi_scale: float = None,
sliding_window: int = None,
logical_softcapping: float = None,
causal: bool = True,
**kwargs,
) -> DlinferAttentionImpl:
"""build."""
@@ -163,4 +167,5 @@
alibi_scale=alibi_scale,
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
causal=causal,
**kwargs)
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/backends/dlinfer/op_backend.py
@@ -22,7 +22,7 @@ def get_name() -> str:
@classmethod
def get_layer_impl_builder(cls, layer_type: OpType):
"""get dlinfer layer builder."""
if layer_type == OpType.Attention:
if layer_type == OpType.PagedAttention:
from .attention import DlinferAttentionBuilder
return DlinferAttentionBuilder
elif layer_type == OpType.ApplyRotaryEmb: