From 4b6f35da5e0773114ae19039242ac1afa217f2c4 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 1 Nov 2024 15:25:47 +0800 Subject: [PATCH] [torch.compile] Adding torch compile annotations to some models (#9876) Signed-off-by: youkaichao Co-authored-by: youkaichao --- docs/source/models/supported_models.rst | 2 +- tests/distributed/test_pipeline_parallel.py | 2 +- vllm/model_executor/models/falcon.py | 2 ++ vllm/model_executor/models/phi.py | 2 ++ vllm/model_executor/models/qwen.py | 2 ++ vllm/model_executor/models/qwen2.py | 2 ++ vllm/model_executor/models/qwen2_moe.py | 2 ++ 7 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3279e7a108232..e493cebf1e9f4 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -281,7 +281,7 @@ Text Generation - ✅︎ * - :code:`Qwen2ForCausalLM` - Qwen2 - - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ * - :code:`Qwen2MoeForCausalLM` diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index ed6360f9d6148..1489a60891761 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -166,7 +166,7 @@ def iter_params(self, model_name: str): "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), - "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), + "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 467a33505ee12..36c85e37783ab 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -27,6 +27,7 @@ from transformers import FalconConfig as HF_FalconConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -329,6 +330,7 @@ def forward( return output +@support_torch_compile class FalconModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index ec20cb249ba9b..497eae4e8905b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -42,6 +42,7 @@ from transformers import PhiConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -193,6 +194,7 @@ def forward( return hidden_states +@support_torch_compile class PhiModel(nn.Module): def __init__(self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 998016ea28c26..61665768eacf5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -20,6 +20,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, @@ -549,6 +550,7 @@ def forward( return hidden_states, residual +@support_torch_compile class QWenModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index db1029345a8ac..db7556b3b5f4b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -29,6 +29,7 @@ from transformers import Qwen2Config from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul @@ -237,6 +238,7 @@ def forward( return hidden_states, residual +@support_torch_compile class Qwen2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index d4475b7ca27af..dac85e35d369d 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -30,6 +30,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, @@ -312,6 +313,7 @@ def forward( return hidden_states, residual +@support_torch_compile class Qwen2MoeModel(nn.Module): def __init__(