From 522108c64cd854f0e5d555f4eb7c761aa9eae377 Mon Sep 17 00:00:00 2001
From: Lyu Han
Date: Tue, 29 Oct 2024 14:40:39 +0800
Subject: [PATCH] Bump version to v0.6.2 (#2659)

* bump version to v0.6.2
* update supported-models list
* update news
* update
* update
* update
* upate
* merge main and fix lint
---
 README.md                                    |  2 +
 README_ja.md                                 |  1 +
 README_zh-CN.md                              |  2 +
 docs/en/get_started/installation.md          |  2 +-
 docs/en/supported_models/supported_models.md | 63 ++++++++++---------
 docs/zh_cn/get_started/installation.md       |  2 +-
 .../supported_models/supported_models.md     | 63 ++++++++++---------
 lmdeploy/version.py                          |  2 +-
 lmdeploy/vl/model/llava_hf.py                |  6 +-
 9 files changed, 76 insertions(+), 67 deletions(-)

diff --git a/README.md b/README.md
index 61c0eba45b..6ca5fadedd 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ ______________________________________________________________________
 2024
 
+- \[2024/10\] PyTorchEngine supports graph mode on ascend platform, doubling the inference speed
 - \[2024/09\] LMDeploy PyTorchEngine adds support for [Huawei Ascend](./docs/en/get_started/ascend/get_started.md). See supported models [here](docs/en/supported_models/supported_models.md)
 - \[2024/09\] LMDeploy PyTorchEngine achieves 1.3x faster on Llama3-8B inference by introducing CUDA graph
 - \[2024/08\] LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference
@@ -162,6 +163,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
 <li>Phi-3-vision (4.2B)</li>
 <li>Phi-3.5-vision (4.2B)</li>
 <li>GLM-4V (9B)</li>
+<li>Llama3.2-vision (11B, 90B)</li>
diff --git a/README_ja.md b/README_ja.md
index 999ebc9f0b..df4647d868 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -160,6 +160,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
 <li>Phi-3-vision (4.2B)</li>
 <li>Phi-3.5-vision (4.2B)</li>
 <li>GLM-4V (9B)</li>
+<li>Llama3.2-vision (11B, 90B)</li>
diff --git a/README_zh-CN.md b/README_zh-CN.md
index f002899c60..663b7b24ab 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -26,6 +26,7 @@ ______________________________________________________________________
 2024
 
+- \[2024/10\] PyTorchEngine 在 ascend 平台上支持了图模式,推理性能提高了 1 倍
 - \[2024/09\] LMDeploy PyTorchEngine 增加了对 [华为 Ascend](docs/zh_cn/get_started/ascend/get_started.md) 的支持。支持的模型请见[这里](docs/zh_cn/supported_models/supported_models.md)
 - \[2024/09\] 通过引入 CUDA Graph,LMDeploy PyTorchEngine 在 Llama3-8B 推理上实现了 1.3 倍的加速
 - \[2024/08\] LMDeploy现已集成至 [modelscope/swift](https://github.com/modelscope/swift),成为 VLMs 推理的默认加速引擎
@@ -163,6 +164,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
 <li>Phi-3-vision (4.2B)</li>
 <li>Phi-3.5-vision (4.2B)</li>
 <li>GLM-4V (9B)</li>
+<li>Llama3.2-vision (11B, 90B)</li>
diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md
index ab7ee0b30e..b7d03b28a6 100644
--- a/docs/en/get_started/installation.md
+++ b/docs/en/get_started/installation.md
@@ -23,7 +23,7 @@ pip install lmdeploy
 The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by:
 
 ```shell
-export LMDEPLOY_VERSION=0.6.1
+export LMDEPLOY_VERSION=0.6.2
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index 260120efe0..cd38a60025 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -4,36 +4,37 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 ## TurboMind on CUDA Platform
 
-| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 |
-| :-------------------: | :---------: | :--: | :-------: | :-----: | :-----: | :---: |
-| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes |
-| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes |
-| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
-| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
-| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes |
-| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
-| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
-| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes |
-| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes |
-| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
-| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
-| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
-| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
-| Mistral | 7B | LLM | Yes | Yes | Yes | - |
-| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
-| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
-| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes |
-| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes |
-| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No |
-| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes |
-| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
-| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes |
-| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes |
-| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
-| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
-| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes |
-| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes |
-| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - |
+| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 |
+| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: |
+| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes |
+| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes |
+| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
+| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
+| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes |
+| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
+| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
+| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes |
+| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes |
+| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
+| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
+| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
+| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
+| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
+| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
+| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
+| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
+| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes |
+| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes |
+| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No |
+| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes |
+| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
+| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
+| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes |
+| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
+| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
+| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes |
+| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes |
+| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - |
 
 "-" means not verified yet.
 
@@ -60,7 +61,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
 | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No |
 | YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes |
 | Mistral | 7B | LLM | Yes | Yes | Yes | No | No |
-| Mixtral | 8x7B | LLM | Yes | Yes | Yes | No | No |
+| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No |
 | QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes |
 | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes |
 | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md
index f7eedecfaa..3108d64815 100644
--- a/docs/zh_cn/get_started/installation.md
+++ b/docs/zh_cn/get_started/installation.md
@@ -23,7 +23,7 @@ pip install lmdeploy
 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy:
 
 ```shell
-export LMDEPLOY_VERSION=0.6.1
+export LMDEPLOY_VERSION=0.6.2
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 26930cf3ce..9bdbf0d45d 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -4,36 +4,37 @@
 ## TurboMind CUDA 平台
 
-| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 |
-| :-------------------: | :---------: | :--: | :-------: | :-----: | :-----: | :---: |
-| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes |
-| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes |
-| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
-| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
-| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes |
-| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
-| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
-| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes |
-| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes |
-| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
-| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
-| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
-| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
-| Mistral | 7B | LLM | Yes | Yes | Yes | - |
-| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
-| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
-| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes |
-| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes |
-| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No |
-| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes |
-| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
-| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes |
-| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes |
-| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
-| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
-| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes |
-| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes |
-| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - |
+| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 |
+| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: |
+| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes |
+| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes |
+| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
+| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes |
+| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes |
+| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
+| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes |
+| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes |
+| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes |
+| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes |
+| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
+| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
+| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
+| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
+| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
+| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
+| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
+| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes |
+| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes |
+| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No |
+| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes |
+| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes |
+| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes |
+| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes |
+| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
+| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
+| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes |
+| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes |
+| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - |
 
 “-” 表示还没有验证。
 
@@ -60,7 +61,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att
 | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No |
 | YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes |
 | Mistral | 7B | LLM | Yes | Yes | Yes | No | No |
-| Mixtral | 8x7B | LLM | Yes | Yes | Yes | No | No |
+| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No |
 | QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes |
 | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes |
 | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No |
diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 5237d5f859..b9f76b5761 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple
 
-__version__ = '0.6.1'
+__version__ = '0.6.2'
 short_version = __version__
diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py
index c2a0e4afa0..31be101ae8 100644
--- a/lmdeploy/vl/model/llava_hf.py
+++ b/lmdeploy/vl/model/llava_hf.py
@@ -32,14 +32,16 @@ def build_model(self):
         self.vl_model = model
 
         # fix for llava-hf/llava-interleave-qwen-7b-hf
-        setattr(model.config, "tie_word_embeddings", False)
+        setattr(model.config, 'tie_word_embeddings', False)
         with disable_logging():
             load_checkpoint_and_dispatch(
                 model=model,
                 max_memory=self.max_memory,
                 checkpoint=self.model_path,
                 device_map='auto' if not self.with_llm else {'': 'cpu'},
-                no_split_module_classes=['CLIPEncoderLayer', 'SiglipEncoderLayer'],
+                no_split_module_classes=[
+                    'CLIPEncoderLayer', 'SiglipEncoderLayer'
+                ],
                 dtype=torch.half)
         model.eval()
         self.model = model
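
A quick way to confirm the bump after installing the 0.6.2 wheel is to read the version string that `lmdeploy/version.py` exposes. The snippet below is an illustrative sketch, not part of the patch; it assumes nothing beyond the `__version__` and `short_version` attributes shown in the diff above.

```python
# Sanity check (illustrative, not part of the patch): verify the installed
# package reports the version bumped in lmdeploy/version.py.
from lmdeploy.version import __version__, short_version

assert __version__ == '0.6.2', f'unexpected lmdeploy version: {__version__}'
print(__version__, short_version)
```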