diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 78b6ad2e51238..d0279f273db7a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,8 +26,7 @@ is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models import SupportsLoRA, supports_multimodal -from vllm.model_executor.models.module_mapping import (ModelComposeMethod, - MultiModelKeys) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer from vllm.utils import is_pin_memory_available @@ -578,29 +577,11 @@ def _filter_unsupported_mm_module(self, module_name: str) -> bool: language model. LoRA for other modules, such as the vision tower, will be filtered out. """ - module_mapping: MultiModelKeys = self.model.get_mm_mapping() - - def _verify_decoupled_model(): - """ - Suitable for MiniCPMV, InternVL, etc. - """ - prefix = module_name.split(".")[0] - return (prefix in module_mapping.connector - or prefix in module_mapping.tower_model) - - def _verify_coupled_model(): - """ - Suitable for QWenVL, GLM4V, etc. - """ + if self.supports_mm: + module_mapping: MultiModelKeys = self.model.get_mm_mapping() prefix_lst = module_mapping.connector + module_mapping.tower_model return any( [module_name.startswith(prefix) for prefix in prefix_lst]) - - if self.supports_mm: - if module_mapping.compose_type == ModelComposeMethod.Decoupled: - return _verify_decoupled_model() - else: - return _verify_coupled_model() return False def _register_packed_modules(self, module_full_name: str) -> None: diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 5410711d173f1..2ec51dc4647f5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -48,8 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.minicpm import MiniCPMModel -from vllm.model_executor.models.module_mapping import (ModelComposeMethod, - MultiModelKeys) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -636,11 +635,9 @@ def get_mm_mapping(self) -> MultiModelKeys: """ Get the module prefix in multimodal models """ - return MultiModelKeys.from_string_field( - language_model="llm", - connector="resampler", - tower_model="vpm", - compose_type=ModelComposeMethod.Decoupled) + return MultiModelKeys.from_string_field(language_model="llm", + connector="resampler", + tower_model="vpm") def init_llm( self, diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 269cdc640c8df..a9102a6073a2f 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -2,46 +2,9 @@ # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field -from enum import IntEnum from typing import List, Union -class ModelComposeMethod(IntEnum): - """ - `ModelComposeMethod` distinguishes between two architectural patterns in - multi-modal models, focusing on how vision model, language model, and - projector are implemented: - 1. Decoupled Architecture (like mllama, InternVL, miniCPMV), complete - independent implementation with its own layers, for example: - ``` - InternVLChatModel - ├── vision_model (visual encoder) - │ ├── embeddings - │ └── encoder - ├── language_model (language model) - │ ├── tok_embeddings - │ └── layers - └── mlp1 (projector) - ``` - 2. Coupled Architecture (like QWenVL, GLM4V), Integrated as a sub-module - with shared architectural patterns , for example: - - ``` - QWenVL - └── transformer - ├── wte - ├── h (language model) - ├── ln_f - └── visual (visual encoder) - ├── conv1 - ├── transformer - └── attn_pool (projector) - ``` - """ - Decoupled = 0 - Coupled = 1 - - @dataclass class ModelKeys: model_type: str = None @@ -78,8 +41,6 @@ class ModelKeys: output: str = None - compose_type: str = None - @dataclass class MultiModelKeys(ModelKeys): @@ -94,9 +55,7 @@ def from_string_field(language_model: Union[str, List[str]] = None, connector: Union[str, List[str]] = None, tower_model: Union[str, List[str]] = None, generator: Union[str, List[str]] = None, - compose_type: str = None, **kwargs) -> 'MultiModelKeys': - assert compose_type, "compose_type is not allowed to be None" def to_list(value): if value is None: @@ -107,5 +66,4 @@ def to_list(value): connector=to_list(connector), tower_model=to_list(tower_model), generator=to_list(generator), - compose_type=compose_type, **kwargs) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 04fdb27f42141..0a1b40927e9f9 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -40,8 +40,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.module_mapping import (ModelComposeMethod, - MultiModelKeys) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs @@ -1042,8 +1041,7 @@ def get_mm_mapping(self) -> MultiModelKeys: return MultiModelKeys.from_string_field( language_model="transformer.h", connector="transformer.visual.attn_pool", - tower_model="transformer.visual.transformer", - compose_type=ModelComposeMethod.Coupled) + tower_model="transformer.visual.transformer") @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)