From e1ff0f7930929576524c19cddd03c8ea9b141707 Mon Sep 17 00:00:00 2001
From: Flavia Beo
Date: Fri, 18 Oct 2024 11:12:48 -0300
Subject: [PATCH 01/10] Adds method to read the pooling types from model's
 files

Signed-off-by: Flavia Beo
---
 vllm/config.py                             |  9 +++
 vllm/engine/llm_engine.py                  |  2 +-
 vllm/model_executor/layers/pooler.py       | 25 ++++++++
 vllm/model_executor/model_loader/loader.py | 12 +++-
 vllm/model_executor/models/bert.py         |  8 ++-
 vllm/transformers_utils/config.py          | 75 +++++++++++++++++++---
 6 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 25f841231dedd..dd46bba7c2acf 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -10,11 +10,13 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.pooler import PoolingConfig
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (ConfigFormat, get_config,
                                             get_hf_image_processor_config,
+                                            get_pooling_config,
                                             get_hf_text_config)
 from vllm.utils import (GiB_bytes, cuda_device_count_stateless,
                         get_cpu_memory, is_hip, is_openvino,
                         print_warning_once)
@@ -173,6 +175,7 @@ def __init__(self,
                                    code_revision, rope_scaling, rope_theta,
                                    config_format)
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.pooling_config = self.get_pooling_config()
         self.hf_image_processor_config = get_hf_image_processor_config(
             self.model, revision)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
@@ -405,6 +408,12 @@ def _verify_bnb_config(self) -> None:
                 "fallback to the eager mode.")
             self.enforce_eager = True
 
+    def get_pooling_config(self) -> PoolingConfig:
+        pooling_config = get_pooling_config(self.model,
+                                            self.revision)
+        return PoolingConfig(pooling_config["pooling_type"],
+                             pooling_config["normalize"])
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1dd0f097c74ff..0c92875aaf62b 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -290,7 +290,7 @@ def __init__(
             model_config.use_async_output_proc,
             use_cached_outputs,
             model_config.chat_template_text_format,
-            model_config.mm_processor_kwargs,
+            model_config.mm_processor_kwargs
         )
         # TODO(woosuk): Print more configs in debug mode.
         self.model_config = model_config
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 3455a4ccf282f..d24b6452e7b6c 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -13,6 +13,31 @@ class PoolingType(IntEnum):
     LAST = 0
     ALL = 1
     CLS = 2
+    MEAN = 3
+    MAX = 4
+
+
+class PoolingConfig():
+    """A class that configures the pooling operation.
+
+    Attributes:
+        pooling_type (str): The type of pooling to use.
+        normalize (bool): Whether to normalize the pooled data.
+
+    Methods:
+        get_pooling_type(pooling_type_name): Returns the pooling
+        type enum value corresponding to the given string.
+    """
+
+    def __init__(self, pooling_type: str, normalize: bool):
+        self.pooling_type = self.get_pooling_type(pooling_type)
+        self.normalize = normalize
+
+    def get_pooling_type(self, pooling_type_name: str) -> PoolingType:
+        pooling_types = PoolingType.__dict__.items()
+        return PoolingType(next((value for key,
+                                 value in pooling_types if key.lower()
+                                 in pooling_type_name),
+                                2))
 
 
 class Pooler(nn.Module):
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 813f58339da37..bc319ea92354d 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -30,6 +30,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.pooler import PoolingConfig
 from vllm.model_executor.model_loader.tensorizer import (
     TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
     serialize_vllm_model, tensorizer_weights_iterator)
@@ -122,7 +123,8 @@ def _get_model_initialization_kwargs(
         model_class: Type[nn.Module],
         lora_config: Optional[LoRAConfig],
         multimodal_config: Optional[MultiModalConfig],
-        scheduler_config: Optional[SchedulerConfig] = None) -> Dict[str, Any]:
+        scheduler_config: Optional[SchedulerConfig] = None
+) -> Dict[str, Any]:
     """Get extra kwargs for model initialization."""
     extra_kwargs: Dict[str, Any] = {}
 
@@ -152,14 +154,17 @@ def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig,
                 cache_config: Optional[CacheConfig],
                 quant_config: Optional[QuantizationConfig], *,
                 lora_config: Optional[LoRAConfig],
                 multimodal_config: Optional[MultiModalConfig],
-                scheduler_config: Optional[SchedulerConfig]) -> nn.Module:
+                scheduler_config: Optional[SchedulerConfig],
+                pooling_config: Optional[PoolingConfig] = None) -> nn.Module:
     extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config,
                                                     multimodal_config,
-                                                    scheduler_config)
+                                                    scheduler_config
+                                                    )
 
     return model_class(config=hf_config,
                        cache_config=cache_config,
                        quant_config=quant_config,
+                       pooling_config=pooling_config,
                        **extra_kwargs)
@@ -180,6 +185,7 @@ def _initialize_model(
         lora_config=lora_config,
         multimodal_config=model_config.multimodal_config,
         scheduler_config=scheduler_config,
+        pooling_config=model_config.pooling_config
     )
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 4c0a0e303e655..b5df80c995451 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -12,7 +12,7 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import Pooler, PoolingConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -387,10 +387,14 @@ def __init__(
         config: BertConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
+        pooling_config: Optional[PoolingConfig] = None
     ) -> None:
         super().__init__()
         self.model = BertModel(config, cache_config, quant_config)
-        self._pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+        print(pooling_config.pooling_type)
+        print(pooling_config.normalize)
+        self._pooler = Pooler(pooling_config.pooling_type,
+                              pooling_config.normalize)
 
     def forward(
         self,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 9bd2531d7a15c..2801df0431968 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -231,6 +231,72 @@ def get_config(
     return config
 
 
+def get_hf_file_to_dict(file_name, model, revision):
+    """
+    Downloads a file from the Hugging Face Hub and returns
+    its contents as a dictionary.
+
+    Parameters:
+    - file_name (str): The name of the file to download.
+    - model (str): The name of the model on the Hugging Face Hub.
+    - revision (str): The specific version of the model.
+
+    Returns:
+    - config_dict (dict): A dictionary containing
+    the contents of the downloaded file.
+    """
+    file_path = Path(model) / file_name
+
+    if not file_path.is_file():
+        file_path = Path(
+            hf_hub_download(model, file_name, revision=revision))
+
+    with open(file_path, "r") as file:
+        config_dict = json.load(file)
+
+    return config_dict
+
+def get_pooling_config(model, revision='main'):
+    """
+    This function gets the pooling and normalize
+    config from the model.
+
+    Args:
+        model (str): The name of the Hugging Face model.
+        revision (str, optional): The specific version
+        of the model to use. Defaults to 'main'.
+
+    Returns:
+        dict: A dictionary containing the pooling
+        type and whether normalization is used.
+    """
+
+    modules_file_name = "modules.json"
+    modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
+
+    pooling = next((item for item in modules_dict if
+                    item["type"] == "sentence_transformers.models.Pooling"),
+                   None)
+    normalize = next((item for item in modules_dict if
+                      item["type"] ==
+                      "sentence_transformers.models.Normalize"),
+                     False)
+
+    if pooling:
+
+        pooling_file_name = "{}/config.json".format(pooling["path"])
+        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
+        pooling_type_name = next((item for item,
+                                  val in pooling_dict.items() if val is True),
+                                 None)
+
+        return {
+            "pooling_type": pooling_type_name,
+            "normalize": normalize
+        }
+
+    return None
+
 def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None:
     """Try to register HF model configuration class to serialize by value
@@ -300,14 +366,7 @@ def load_params_config(model, revision) -> PretrainedConfig:
 
     config_file_name = "params.json"
 
-    config_path = Path(model) / config_file_name
-
-    if not config_path.is_file():
-        config_path = Path(
-            hf_hub_download(model, config_file_name, revision=revision))
-
-    with open(config_path, "r") as file:
-        config_dict = json.load(file)
+    config_dict = get_hf_file_to_dict(config_file_name, model, revision)
 
     config_mapping = {
         "dim": "hidden_size",
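
A note on what PATCH 01 reads from the Hub: a minimal sketch of the
sentence-transformers layout and the resulting call, assuming a typical
checkpoint (the model id below is only illustrative):

    # modules.json at the repo root holds entries such as:
    #   {"idx": 1, "name": "1", "path": "1_Pooling",
    #    "type": "sentence_transformers.models.Pooling"}
    #   {"idx": 2, "name": "2", "path": "2_Normalize",
    #    "type": "sentence_transformers.models.Normalize"}
    # and 1_Pooling/config.json carries boolean flags such as
    # "pooling_mode_mean_tokens": true.
    from vllm.transformers_utils.config import get_pooling_config

    cfg = get_pooling_config("sentence-transformers/all-MiniLM-L6-v2")
    # e.g. {"pooling_type": "pooling_mode_mean_tokens", "normalize": True}
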
""" @@ -83,6 +82,17 @@ def forward( for prompt_len in prompt_lens: pooled_data.append(hidden_states[offset:offset + prompt_len]) offset += prompt_len + elif self.pooling_type == PoolingType.MEAN: + # Calculate mean pooling + cumsum = torch.cumsum(hidden_states, dim=0) + start_indices = torch.cat([ + torch.tensor([0], device=hidden_states.device), + torch.cumsum(prompt_lens[:-1], dim=0) + ]) + end_indices = torch.cumsum(prompt_lens, dim=0) + pooled_data = ( + cumsum[end_indices - 1] - cumsum[start_indices] + + hidden_states[start_indices]) / prompt_lens.unsqueeze(1) else: raise ValueError(f"Invalid pooling type: {self.pooling_type}") From 7119bb3e412f23d1bb39cfbfb8074d372a9bb136 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Mon, 21 Oct 2024 13:27:00 -0300 Subject: [PATCH 03/10] Make normalize variable return bool value Signed-off-by: Flavia Beo --- vllm/transformers_utils/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2801df0431968..416ddb46ff55e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -277,10 +277,10 @@ def get_pooling_config(model, revision='main'): pooling = next((item for item in modules_dict if item["type"] == "sentence_transformers.models.Pooling"), None) - normalize = next((item for item in modules_dict if + normalize = bool(next((item for item in modules_dict if item["type"] == "sentence_transformers.models.Normalize"), - False) + False)) if pooling: From 5b0a9f364e5eabe7efce4565339e2ef6b720ea7a Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Mon, 21 Oct 2024 14:40:38 -0300 Subject: [PATCH 04/10] Adds test for model loading with the params Signed-off-by: Flavia Beo --- .../test_model_load_with_params.py | 23 +++++++++++ tests/test_config.py | 20 ++++++++++ vllm/config.py | 13 +++---- vllm/engine/llm_engine.py | 13 ++----- vllm/model_executor/layers/pooler.py | 18 +++++---- vllm/model_executor/model_loader/loader.py | 35 +++++++++-------- vllm/model_executor/models/bert.py | 21 +++++----- vllm/transformers_utils/config.py | 38 +++++++++---------- 8 files changed, 108 insertions(+), 73 deletions(-) create mode 100644 tests/model_executor/test_model_load_with_params.py diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py new file mode 100644 index 0000000000000..a2a85ef13e1d1 --- /dev/null +++ b/tests/model_executor/test_model_load_with_params.py @@ -0,0 +1,23 @@ +import os + +import torch + +MAX_MODEL_LEN = 128 +MODEL_NAME = os.environ.get("MODEL_NAME", + "sentence-transformers/all-MiniLM-L12-v2") +REVISION = os.environ.get("REVISION", "main") +QUANTIZATION = os.environ.get("QUANTIZATION", "auto") + + +def test_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. 
+ """ + with vllm_runner(model_name=MODEL_NAME, + revision=REVISION, + dtype=torch.half if QUANTIZATION == "gptq" else "auto", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + print(output) + assert output diff --git a/tests/test_config.py b/tests/test_config.py index 69918b67607d9..18f26082eaeb3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,7 @@ import pytest from vllm.config import ModelConfig +from vllm.model_executor.layers.pooler import PoolingConfig, PoolingType @pytest.mark.parametrize(("model_id", "expected_task"), [ @@ -102,6 +103,25 @@ def test_get_sliding_window(): assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW +def test_get_pooling_config(): + minilm_model_config = ModelConfig( + "sentence-transformers/all-MiniLM-L12-v2", + "sentence-transformers/all-MiniLM-L12-v2", + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + minilm_pooling_config = minilm_model_config.get_pooling_config() + + assert isinstance(minilm_model_config.pooling_config, PoolingConfig) + assert minilm_pooling_config.normalize + assert isinstance(minilm_pooling_config.pooling_type, PoolingType) + assert minilm_pooling_config.pooling_type == PoolingType.MEAN + + def test_rope_customization(): TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 diff --git a/vllm/config.py b/vllm/config.py index dd46bba7c2acf..719181bc4229a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -5,18 +5,18 @@ Mapping, Optional, Set, Tuple, Type, Union) import torch -from transformers import PretrainedConfig import vllm.envs as envs +from transformers import PretrainedConfig from vllm.logger import init_logger +from vllm.model_executor.layers.pooler import PoolingConfig # noqa: F401 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.model_executor.layers.pooler import PoolingConfig from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform from vllm.tracing import is_otel_available, otel_import_error_traceback +from vllm.transformers_utils.config import get_pooling_config # noqa: F401 from vllm.transformers_utils.config import (ConfigFormat, get_config, get_hf_image_processor_config, - get_pooling_config, get_hf_text_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, is_hip, is_openvino, print_warning_once) @@ -175,7 +175,7 @@ def __init__(self, code_revision, rope_scaling, rope_theta, config_format) self.hf_text_config = get_hf_text_config(self.hf_config) - self.pooling_config = self.get_pooling_config() + self.pooling_config = self.get_pooling_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) @@ -409,9 +409,8 @@ def _verify_bnb_config(self) -> None: self.enforce_eager = True def get_pooling_config(self) -> PoolingConfig: - pooling_config = get_pooling_config(self.model, - self.revision) - return PoolingConfig(pooling_config["pooling_type"], + pooling_config = get_pooling_config(self.model, self.revision) + return PoolingConfig(pooling_config["pooling_type"], pooling_config["normalize"]) def verify_async_output_proc(self, parallel_config, speculative_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0c92875aaf62b..fc7b2d0914457 100644 --- 
a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -274,15 +274,10 @@ def __init__( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, + model_config.quantization, model_config.enforce_eager, + cache_config.cache_dtype, model_config.quantization_param_path, + device_config.device, decoding_config, observability_config, + model_config.seed, model_config.served_model_name, scheduler_config.num_scheduler_steps, scheduler_config.chunked_prefill_enabled, scheduler_config.multi_step_stream_outputs, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 8376915532087..d7cb111742836 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from enum import IntEnum import torch @@ -16,8 +17,11 @@ class PoolingType(IntEnum): MEAN = 3 -class PoolingConfig(): - """A class that configures the pooling operation. +@dataclass +class PoolingConfig: + """A class that configures the pooling operation which + only applies to sentence-transformers models. + More at: https://www.sbert.net/ Attributes: pooling_type (str): The type of pooling to use. @@ -27,16 +31,16 @@ class PoolingConfig(): get_pooling_type(pooling_type_name): Returns the pooling type enum value corresponding to the given string. """ + def __init__(self, pooling_type: str, normalize: bool): self.pooling_type = self.get_pooling_type(pooling_type) - self.normalize = normalize + self.normalize = normalize def get_pooling_type(self, pooling_type_name: str) -> PoolingType: pooling_types = PoolingType.__dict__.items() - return PoolingType(next((value for key, - value in pooling_types if key.lower() - in pooling_type_name), - 2)) + return PoolingType( + next((value for key, value in pooling_types + if key.lower() in pooling_type_name), 2)) class Pooler(nn.Module): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index bc319ea92354d..2a648e0a20b2d 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -18,9 +18,9 @@ import torch from huggingface_hub import HfApi, hf_hub_download from torch import nn -from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.utils import SAFE_WEIGHTS_INDEX_NAME +from transformers import AutoModelForCausalLM, PretrainedConfig from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, SchedulerConfig) @@ -28,9 +28,9 @@ get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger +from vllm.model_executor.layers.pooler import PoolingConfig # noqa: F401 from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.pooler import PoolingConfig from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) @@ -123,8 +123,7 @@ def _get_model_initialization_kwargs( model_class: Type[nn.Module], lora_config: Optional[LoRAConfig], multimodal_config: 
Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig] = None - ) -> Dict[str, Any]: + scheduler_config: Optional[SchedulerConfig] = None) -> Dict[str, Any]: """Get extra kwargs for model initialization.""" extra_kwargs: Dict[str, Any] = {} @@ -149,17 +148,18 @@ def _get_model_initialization_kwargs( return extra_kwargs -def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig, +def build_model(model_class: Type[nn.Module], + hf_config: PretrainedConfig, cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], *, + quant_config: Optional[QuantizationConfig], + *, lora_config: Optional[LoRAConfig], multimodal_config: Optional[MultiModalConfig], scheduler_config: Optional[SchedulerConfig], pooling_config: Optional[PoolingConfig] = None) -> nn.Module: extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, multimodal_config, - scheduler_config - ) + scheduler_config) return model_class(config=hf_config, cache_config=cache_config, @@ -177,16 +177,15 @@ def _initialize_model( """Initialize a model with the given configurations.""" model_class, _ = get_model_architecture(model_config) - return build_model( - model_class, - model_config.hf_config, - cache_config=cache_config, - quant_config=_get_quantization_config(model_config, load_config), - lora_config=lora_config, - multimodal_config=model_config.multimodal_config, - scheduler_config=scheduler_config, - pooling_config=model_config.pooling_config - ) + return build_model(model_class, + model_config.hf_config, + cache_config=cache_config, + quant_config=_get_quantization_config( + model_config, load_config), + lora_config=lora_config, + multimodal_config=model_config.multimodal_config, + scheduler_config=scheduler_config, + pooling_config=model_config.pooling_config) class BaseModelLoader(ABC): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index b5df80c995451..a0181bfd2a64f 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -2,8 +2,8 @@ import torch from torch import nn -from transformers import BertConfig +from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.xformers import XFormersImpl from vllm.config import CacheConfig @@ -12,7 +12,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingConfig +from vllm.model_executor.layers.pooler import (Pooler, # noqa: F401 + PoolingConfig) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -382,18 +383,14 @@ class BertEmbeddingModel(nn.Module): _pooler: An instance of Pooler used for pooling operations. 
""" - def __init__( - self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooling_config: Optional[PoolingConfig] = None - ) -> None: + def __init__(self, + config: BertConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + pooling_config: Optional[PoolingConfig] = None) -> None: super().__init__() self.model = BertModel(config, cache_config, quant_config) - print(pooling_config.pooling_type) - print(pooling_config.normalize) - self._pooler = Pooler(pooling_config.pooling_type, + self._pooler = Pooler(pooling_config.pooling_type, pooling_config.normalize) def forward( diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 416ddb46ff55e..fbf8e96817e61 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,13 +6,13 @@ import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, try_to_load_from_cache) -from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import ( get_image_processor_config) from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME +from transformers import GenerationConfig, PretrainedConfig from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger # yapf conflicts with isort for this block @@ -231,6 +231,7 @@ def get_config( return config + def get_hf_file_to_dict(file_name, model, revision): """ Downloads a file from the Hugging Face Hub and returns @@ -248,18 +249,19 @@ def get_hf_file_to_dict(file_name, model, revision): file_path = Path(model) / file_name if not file_path.is_file(): - file_path = Path( - hf_hub_download(model, file_name, revision=revision)) + file_path = Path(hf_hub_download(model, file_name, revision=revision)) with open(file_path, "r") as file: config_dict = json.load(file) return config_dict + def get_pooling_config(model, revision='main'): """ This function gets the pooling and normalize - config from the model. + config from the model - only applies to + sentence-transformers models. Args: model (str): The name of the Hugging Face model. 
@@ -274,26 +276,22 @@ def get_pooling_config(model, revision='main'): modules_file_name = "modules.json" modules_dict = get_hf_file_to_dict(modules_file_name, model, revision) - pooling = next((item for item in modules_dict if - item["type"] == "sentence_transformers.models.Pooling"), - None) - normalize = bool(next((item for item in modules_dict if - item["type"] == - "sentence_transformers.models.Normalize"), - False)) + pooling = next((item for item in modules_dict + if item["type"] == "sentence_transformers.models.Pooling"), + None) + normalize = bool( + next((item for item in modules_dict + if item["type"] == "sentence_transformers.models.Normalize"), + False)) - if pooling: + if pooling: pooling_file_name = "{}/config.json".format(pooling["path"]) pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision) - pooling_type_name = next((item for item, - val in pooling_dict.items() if val is True), - None) - - return { - "pooling_type": pooling_type_name, - "normalize": normalize - } + pooling_type_name = next( + (item for item, val in pooling_dict.items() if val is True), None) + + return {"pooling_type": pooling_type_name, "normalize": normalize} return None From d16eefd478e865f992070acd917f598d458ade16 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 22 Oct 2024 20:45:02 -0300 Subject: [PATCH 05/10] Adds method and attribute for bert sentence_transformer config files Signed-off-by: Flavia Beo --- .../test_model_load_with_params.py | 16 ++++---- tests/test_config.py | 35 +++++++++++++++++ vllm/config.py | 33 ++++++++++++---- vllm/engine/llm_engine.py | 16 ++++++-- vllm/model_executor/layers/pooler.py | 2 +- vllm/model_executor/model_loader/loader.py | 32 +++++++-------- vllm/model_executor/models/bert.py | 5 +-- vllm/transformers_utils/config.py | 39 ++++++++++++++++++- .../tokenizer_group/__init__.py | 3 ++ 9 files changed, 141 insertions(+), 40 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index a2a85ef13e1d1..a73667947bd11 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -1,12 +1,8 @@ import os -import torch - MAX_MODEL_LEN = 128 -MODEL_NAME = os.environ.get("MODEL_NAME", - "sentence-transformers/all-MiniLM-L12-v2") +MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") -QUANTIZATION = os.environ.get("QUANTIZATION", "auto") def test_model_loading_with_params(vllm_runner): @@ -15,9 +11,15 @@ def test_model_loading_with_params(vllm_runner): """ with vllm_runner(model_name=MODEL_NAME, revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", + dtype="float16", max_model_len=MAX_MODEL_LEN) as model: output = model.encode("Write a short story about a robot that" " dreams for the first time.\n") - print(output) + + model_config = model.model.llm_engine.model_config + + assert model_config.max_model_len == 512 + assert model_config.do_lower_case + assert model_config.pooling_config.pooling_type == 2 + assert model_config.pooling_config.normalize assert output diff --git a/tests/test_config.py b/tests/test_config.py index 18f26082eaeb3..65c532be1156d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -122,6 +122,41 @@ def test_get_pooling_config(): assert minilm_pooling_config.pooling_type == PoolingType.MEAN +def test_get_bert_sentence_transformer_config(): + bge_model_config = ModelConfig( + model="BAAI/bge-base-en-v1.5", 
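
The MEAN branch added in PATCH 02 derives every per-prompt mean from a single
cumulative sum over the packed hidden states. A self-contained sketch of the
same arithmetic (all names are local to this example):

    import torch

    hidden_states = torch.randn(10, 8)  # 10 tokens packed from two prompts
    prompt_lens = torch.tensor([4, 6])

    cumsum = torch.cumsum(hidden_states, dim=0)
    start = torch.cat(
        [torch.tensor([0]), torch.cumsum(prompt_lens[:-1], dim=0)])
    end = torch.cumsum(prompt_lens, dim=0)
    # The sum of rows start..end-1 equals
    # cumsum[end - 1] - cumsum[start] + row[start],
    # so one pass over the packed batch yields every prompt's mean.
    mean = (cumsum[end - 1] - cumsum[start] +
            hidden_states[start]) / prompt_lens.unsqueeze(1)

    assert torch.allclose(mean[0], hidden_states[:4].mean(dim=0), atol=1e-6)
    assert torch.allclose(mean[1], hidden_states[4:].mean(dim=0), atol=1e-6)
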
+ task="auto", + tokenizer="BAAI/bge-base-en-v1.5", + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + bert_bge_model_config = bge_model_config._get_bert_config() + + assert bert_bge_model_config["max_seq_length"] == 512 + assert bert_bge_model_config["do_lower_case"] + + +def test_get_tokenization_sentence_transformer_config(): + bge_model_config = ModelConfig( + model="BAAI/bge-base-en-v1.5", + task="auto", + tokenizer="BAAI/bge-base-en-v1.5", + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + bert_config = bge_model_config._get_bert_tokenization_config() + + assert bert_config + + def test_rope_customization(): TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 diff --git a/vllm/config.py b/vllm/config.py index 719181bc4229a..4655430dba511 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -5,19 +5,19 @@ Mapping, Optional, Set, Tuple, Type, Union) import torch +from transformers import PretrainedConfig import vllm.envs as envs -from transformers import PretrainedConfig from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import PoolingConfig # noqa: F401 +from vllm.model_executor.layers.pooler import PoolingConfig from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform from vllm.tracing import is_otel_available, otel_import_error_traceback -from vllm.transformers_utils.config import get_pooling_config # noqa: F401 -from vllm.transformers_utils.config import (ConfigFormat, get_config, - get_hf_image_processor_config, - get_hf_text_config) +from vllm.transformers_utils.config import ( + ConfigFormat, get_config, get_hf_image_processor_config, + get_hf_text_config, get_pooling_config, + get_sentence_transformer_bert_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, is_hip, is_openvino, print_warning_once) @@ -176,6 +176,8 @@ def __init__(self, config_format) self.hf_text_config = get_hf_text_config(self.hf_config) self.pooling_config = self.get_pooling_config() + self.bert_config = self._get_bert_config() + self.do_lower_case = self._get_bert_tokenization_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) @@ -208,7 +210,8 @@ def __init__(self, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window_len=self.get_hf_config_sliding_window(), - spec_target_max_model_len=spec_target_max_model_len) + spec_target_max_model_len=spec_target_max_model_len, + bert_config=self.bert_config) self.served_model_name = get_served_model_name(model, served_model_name) self.multimodal_config = self._init_multimodal_config( @@ -245,6 +248,17 @@ def _init_multimodal_config( return None + def _get_bert_config(self): + bert_config = get_sentence_transformer_bert_config( + self.model, self.revision) + if bert_config is not None: + return bert_config + return None + + def _get_bert_tokenization_config(self): + if self.bert_config: + return self.bert_config.get("do_lower_case") + def _init_attention_free(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_attention_free_model(architectures) @@ -1723,6 +1737,7 @@ def _get_and_verify_max_len( disable_sliding_window: bool, sliding_window_len: 
Optional[Union[int, List[Optional[int]]]], spec_target_max_model_len: Optional[int] = None, + bert_config: Optional[Any] = None, ) -> int: """Get and verify the model's maximum length.""" derived_max_model_len = float("inf") @@ -1805,6 +1820,9 @@ def _get_and_verify_max_len( "original_max_position_embeddings"] derived_max_model_len *= scaling_factor + if bert_config and "max_seq_lenght" in bert_config: + derived_max_model_len = bert_config["max_seq_length"] + # If the user specified a max length, make sure it is smaller than the # derived length from the HF model config. if max_model_len is None: @@ -1837,6 +1855,7 @@ def _get_and_verify_max_len( raise ValueError( f"{msg} To allow overriding this maximum, set " "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1") + return int(max_model_len) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fc7b2d0914457..8383682d977e9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -254,6 +254,7 @@ def __init__( "num_scheduler_steps=%d, chunked_prefill_enabled=%s " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, " + "pooling_config_type=%s, normalize=%s, " "chat_template_text_format=%s, mm_processor_kwargs=%s)", VLLM_VERSION, model_config.model, @@ -274,16 +275,23 @@ def __init__( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, parallel_config.disable_custom_all_reduce, - model_config.quantization, model_config.enforce_eager, - cache_config.cache_dtype, model_config.quantization_param_path, - device_config.device, decoding_config, observability_config, - model_config.seed, model_config.served_model_name, + model_config.quantization, + model_config.enforce_eager, + cache_config.cache_dtype, + model_config.quantization_param_path, + device_config.device, + decoding_config, + observability_config, + model_config.seed, + model_config.served_model_name, scheduler_config.num_scheduler_steps, scheduler_config.chunked_prefill_enabled, scheduler_config.multi_step_stream_outputs, cache_config.enable_prefix_caching, model_config.use_async_output_proc, use_cached_outputs, + model_config.pooling_config.pooling_type, + model_config.pooling_config.normalize, model_config.chat_template_text_format, model_config.mm_processor_kwargs ) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index d7cb111742836..221bb77434868 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -40,7 +40,7 @@ def get_pooling_type(self, pooling_type_name: str) -> PoolingType: pooling_types = PoolingType.__dict__.items() return PoolingType( next((value for key, value in pooling_types - if key.lower() in pooling_type_name), 2)) + if key.lower() in pooling_type_name), PoolingType.CLS)) class Pooler(nn.Module): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 2a648e0a20b2d..12a9fa0aee6ee 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -18,9 +18,9 @@ import torch from huggingface_hub import HfApi, hf_hub_download from torch import nn +from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from transformers import AutoModelForCausalLM, PretrainedConfig from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, SchedulerConfig) @@ -28,7 
+28,7 @@ get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import PoolingConfig # noqa: F401 +from vllm.model_executor.layers.pooler import PoolingConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader.tensorizer import ( @@ -148,18 +148,17 @@ def _get_model_initialization_kwargs( return extra_kwargs -def build_model(model_class: Type[nn.Module], - hf_config: PretrainedConfig, +def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig, cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - *, + quant_config: Optional[QuantizationConfig], *, lora_config: Optional[LoRAConfig], multimodal_config: Optional[MultiModalConfig], scheduler_config: Optional[SchedulerConfig], pooling_config: Optional[PoolingConfig] = None) -> nn.Module: extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, multimodal_config, - scheduler_config) + scheduler_config + ) return model_class(config=hf_config, cache_config=cache_config, @@ -177,15 +176,16 @@ def _initialize_model( """Initialize a model with the given configurations.""" model_class, _ = get_model_architecture(model_config) - return build_model(model_class, - model_config.hf_config, - cache_config=cache_config, - quant_config=_get_quantization_config( - model_config, load_config), - lora_config=lora_config, - multimodal_config=model_config.multimodal_config, - scheduler_config=scheduler_config, - pooling_config=model_config.pooling_config) + return build_model( + model_class, + model_config.hf_config, + cache_config=cache_config, + quant_config=_get_quantization_config(model_config, load_config), + lora_config=lora_config, + multimodal_config=model_config.multimodal_config, + scheduler_config=scheduler_config, + pooling_config=model_config.pooling_config + ) class BaseModelLoader(ABC): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index a0181bfd2a64f..54b85b05287fb 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -2,8 +2,8 @@ import torch from torch import nn - from transformers import BertConfig + from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.xformers import XFormersImpl from vllm.config import CacheConfig @@ -12,8 +12,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (Pooler, # noqa: F401 - PoolingConfig) +from vllm.model_executor.layers.pooler import Pooler, PoolingConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fbf8e96817e61..2ba3863f33e3d 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,13 +6,16 @@ import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, try_to_load_from_cache) +from huggingface_hub.utils import (RepositoryNotFoundError, + RevisionNotFoundError, EntryNotFoundError, + LocalEntryNotFoundError) +from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import ( get_image_processor_config) from transformers.models.auto.modeling_auto import ( 
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME -from transformers import GenerationConfig, PretrainedConfig from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger # yapf conflicts with isort for this block @@ -249,7 +252,13 @@ def get_hf_file_to_dict(file_name, model, revision): file_path = Path(model) / file_name if not file_path.is_file(): - file_path = Path(hf_hub_download(model, file_name, revision=revision)) + try: + hf_hub_file = hf_hub_download(model, file_name, revision=revision) + except (RepositoryNotFoundError, RevisionNotFoundError, + EntryNotFoundError, LocalEntryNotFoundError) as e: + logger.info("File or repository not found in hf_hub_download", e) + return None + file_path = Path(hf_hub_file) with open(file_path, "r") as file: config_dict = json.load(file) @@ -296,6 +305,32 @@ def get_pooling_config(model, revision='main'): return None +def get_sentence_transformer_bert_config(model, revision='main'): + """ + Returns the configuration dictionary for a + given Sentence Transformer BERT model. + + Parameters: + - model (str): The name of the Sentence Transformer + BERT model. + - revision (str, optional): The revision of the m + odel to use. Defaults to 'main'. + + Returns: + - dict: A dictionary containing the configuration parameters + for the Sentence Transformer BERT model. + """ + bert_dict = get_hf_file_to_dict("sentence_bert_config.json", model, + revision) + + if not bert_dict: + return None + + if all(k in bert_dict for k in ("max_seq_length", "do_lower_case")): + return bert_dict + return None + + def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None: """Try to register HF model configuration class to serialize by value diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 9a4149251d747..f653c782e1203 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -25,6 +25,9 @@ def init_tokenizer_from_configs(model_config: ModelConfig, trust_remote_code=model_config.trust_remote_code, revision=model_config.tokenizer_revision) + if model_config.do_lower_case is not None: + init_kwargs["do_lower_case"] = model_config.do_lower_case + return get_tokenizer_group(parallel_config.tokenizer_pool_config, **init_kwargs) From 6315c33bd3699cf0e05ad91a2174dd7be01b866d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1via=20B=C3=A9o?= <119421251+flaviabeo@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:36:18 -0300 Subject: [PATCH 06/10] Adds other file names for the bert models config - Asserts on the correct tokenizer loaded - Linting fixes Signed-off-by: Flavia Beo --- .../test_model_load_with_params.py | 20 +++++++++++-- tests/test_config.py | 19 +------------ vllm/config.py | 17 +++++------ vllm/model_executor/model_loader/loader.py | 28 +++++++++---------- vllm/transformers_utils/config.py | 26 +++++++++++------ .../tokenizer_group/__init__.py | 6 ++-- 6 files changed, 63 insertions(+), 53 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index a73667947bd11..e3480f0c8f6db 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -1,5 +1,7 @@ import os +from vllm.model_executor.layers.pooler import PoolingType + MAX_MODEL_LEN = 128 MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") 
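
For context on PATCH 05: sentence_bert_config.json is typically a two-key
file, so the new helper reduces to a guarded dictionary lookup. A sketch
(values are examples, not guaranteed for every checkpoint):

    # sentence_bert_config.json usually looks like:
    #   {"max_seq_length": 512, "do_lower_case": false}
    from vllm.transformers_utils.config import (
        get_sentence_transformer_bert_config)

    bert_cfg = get_sentence_transformer_bert_config("BAAI/bge-base-en-v1.5")
    if bert_cfg is not None:
        print(bert_cfg["max_seq_length"], bert_cfg["do_lower_case"])
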
REVISION = os.environ.get("REVISION", "main") @@ -18,8 +20,20 @@ def test_model_loading_with_params(vllm_runner): model_config = model.model.llm_engine.model_config - assert model_config.max_model_len == 512 - assert model_config.do_lower_case - assert model_config.pooling_config.pooling_type == 2 + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.bert_config["max_seq_length"] == 512 + assert model_config.bert_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooling_config.pooling_type == PoolingType.CLS assert model_config.pooling_config.normalize + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" + assert model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer.model_max_length == 512 + + # assert output assert output diff --git a/tests/test_config.py b/tests/test_config.py index 65c532be1156d..85202a136f478 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -122,7 +122,7 @@ def test_get_pooling_config(): assert minilm_pooling_config.pooling_type == PoolingType.MEAN -def test_get_bert_sentence_transformer_config(): +def test_get_bert_tokenization_sentence_transformer_config(): bge_model_config = ModelConfig( model="BAAI/bge-base-en-v1.5", task="auto", @@ -140,23 +140,6 @@ def test_get_bert_sentence_transformer_config(): assert bert_bge_model_config["do_lower_case"] -def test_get_tokenization_sentence_transformer_config(): - bge_model_config = ModelConfig( - model="BAAI/bge-base-en-v1.5", - task="auto", - tokenizer="BAAI/bge-base-en-v1.5", - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="float16", - revision=None, - ) - - bert_config = bge_model_config._get_bert_tokenization_config() - - assert bert_config - - def test_rope_customization(): TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 diff --git a/vllm/config.py b/vllm/config.py index 4655430dba511..88bde618a65d3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, - get_sentence_transformer_bert_config) + get_sentence_transformer_tokenizer_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, is_hip, is_openvino, print_warning_once) @@ -112,6 +112,10 @@ class ModelConfig: can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. + pooling_config: pooling and normalize config from the model - + only applies to sentence-transformers models. + bert_config: tokenizationconfiguration dictionary for a given + Sentence Transformer BERT model. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. 
""" @@ -177,7 +181,7 @@ def __init__(self, self.hf_text_config = get_hf_text_config(self.hf_config) self.pooling_config = self.get_pooling_config() self.bert_config = self._get_bert_config() - self.do_lower_case = self._get_bert_tokenization_config() + self.do_lower_case = self._get_bert_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) @@ -249,16 +253,12 @@ def _init_multimodal_config( return None def _get_bert_config(self): - bert_config = get_sentence_transformer_bert_config( + bert_config = get_sentence_transformer_tokenizer_config( self.model, self.revision) if bert_config is not None: return bert_config return None - def _get_bert_tokenization_config(self): - if self.bert_config: - return self.bert_config.get("do_lower_case") - def _init_attention_free(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) return ModelRegistry.is_attention_free_model(architectures) @@ -1757,6 +1757,7 @@ def _get_and_verify_max_len( "max_seq_length", "seq_len", ] + # Choose the smallest "max_length" from the possible keys. max_len_key = None for key in possible_keys: @@ -1820,7 +1821,7 @@ def _get_and_verify_max_len( "original_max_position_embeddings"] derived_max_model_len *= scaling_factor - if bert_config and "max_seq_lenght" in bert_config: + if bert_config and "max_seq_length" in bert_config: derived_max_model_len = bert_config["max_seq_length"] # If the user specified a max length, make sure it is smaller than the diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 12a9fa0aee6ee..133654e53c5d4 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -148,17 +148,18 @@ def _get_model_initialization_kwargs( return extra_kwargs -def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig, +def build_model(model_class: Type[nn.Module], + hf_config: PretrainedConfig, cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], *, + quant_config: Optional[QuantizationConfig], + *, lora_config: Optional[LoRAConfig], multimodal_config: Optional[MultiModalConfig], scheduler_config: Optional[SchedulerConfig], pooling_config: Optional[PoolingConfig] = None) -> nn.Module: extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, multimodal_config, - scheduler_config - ) + scheduler_config) return model_class(config=hf_config, cache_config=cache_config, @@ -176,16 +177,15 @@ def _initialize_model( """Initialize a model with the given configurations.""" model_class, _ = get_model_architecture(model_config) - return build_model( - model_class, - model_config.hf_config, - cache_config=cache_config, - quant_config=_get_quantization_config(model_config, load_config), - lora_config=lora_config, - multimodal_config=model_config.multimodal_config, - scheduler_config=scheduler_config, - pooling_config=model_config.pooling_config - ) + return build_model(model_class, + model_config.hf_config, + cache_config=cache_config, + quant_config=_get_quantization_config( + model_config, load_config), + lora_config=lora_config, + multimodal_config=model_config.multimodal_config, + scheduler_config=scheduler_config, + pooling_config=model_config.pooling_config) class BaseModelLoader(ABC): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2ba3863f33e3d..a92146eafffd9 100644 --- 
a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,16 +6,16 @@ import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, try_to_load_from_cache) -from huggingface_hub.utils import (RepositoryNotFoundError, - RevisionNotFoundError, EntryNotFoundError, - LocalEntryNotFoundError) -from transformers import GenerationConfig, PretrainedConfig +from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError) from transformers.models.auto.image_processing_auto import ( get_image_processor_config) from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME +from transformers import GenerationConfig, PretrainedConfig from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger # yapf conflicts with isort for this block @@ -305,9 +305,9 @@ def get_pooling_config(model, revision='main'): return None -def get_sentence_transformer_bert_config(model, revision='main'): +def get_sentence_transformer_tokenizer_config(model, revision='main'): """ - Returns the configuration dictionary for a + Returns the tokenization configuration dictionary for a given Sentence Transformer BERT model. Parameters: @@ -320,8 +320,18 @@ def get_sentence_transformer_bert_config(model, revision='main'): - dict: A dictionary containing the configuration parameters for the Sentence Transformer BERT model. """ - bert_dict = get_hf_file_to_dict("sentence_bert_config.json", model, - revision) + for config_name in [ + "sentence_bert_config.json", + "sentence_roberta_config.json", + "sentence_distilbert_config.json", + "sentence_camembert_config.json", + "sentence_albert_config.json", + "sentence_xlm-roberta_config.json", + "sentence_xlnet_config.json", + ]: + bert_dict = get_hf_file_to_dict(config_name, model, revision) + if bert_dict: + break if not bert_dict: return None diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index f653c782e1203..95ac1d4e6baf7 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -25,8 +25,10 @@ def init_tokenizer_from_configs(model_config: ModelConfig, trust_remote_code=model_config.trust_remote_code, revision=model_config.tokenizer_revision) - if model_config.do_lower_case is not None: - init_kwargs["do_lower_case"] = model_config.do_lower_case + if (model_config.bert_config is not None + and "do_lower_case" in model_config.bert_config): + init_kwargs["do_lower_case"] = model_config.bert_config[ + "do_lower_case"] return get_tokenizer_group(parallel_config.tokenizer_pool_config, **init_kwargs) From 1bcd3e8447db4784fa1c6f1ac3575fcab076abbc Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 24 Oct 2024 19:00:23 -0300 Subject: [PATCH 07/10] fix loading of non-bert models and fix tests Signed-off-by: Max de Bayser --- .../test_model_load_with_params.py | 6 +++ tests/test_config.py | 6 ++- vllm/config.py | 14 +++--- vllm/engine/llm_engine.py | 47 +++++++------------ vllm/model_executor/model_loader/loader.py | 10 ++-- vllm/transformers_utils/config.py | 7 ++- 6 files changed, 45 insertions(+), 45 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index e3480f0c8f6db..7eab521848ad6 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ 
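
PATCH 06 turns the single file name into an ordered fallback list, so
non-BERT sentence-transformers checkpoints resolve too. A sketch of the
renamed helper in use (the model id is illustrative):

    from vllm.transformers_utils.config import (
        get_sentence_transformer_tokenizer_config)

    # Tries sentence_bert_config.json, then sentence_roberta_config.json,
    # and so on, returning the first file that has both expected keys.
    cfg = get_sentence_transformer_tokenizer_config("BAAI/bge-base-en-v1.5")
    print(cfg)  # e.g. {"max_seq_length": 512, "do_lower_case": True}
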
b/tests/model_executor/test_model_load_with_params.py @@ -1,6 +1,7 @@ import os from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.models.bert import BertEmbeddingModel MAX_MODEL_LEN = 128 MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") @@ -35,5 +36,10 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize # assert output assert output diff --git a/tests/test_config.py b/tests/test_config.py index 85202a136f478..9c484dd4f4266 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -104,9 +104,11 @@ def test_get_sliding_window(): def test_get_pooling_config(): + model_id = "sentence-transformers/all-MiniLM-L12-v2" minilm_model_config = ModelConfig( - "sentence-transformers/all-MiniLM-L12-v2", - "sentence-transformers/all-MiniLM-L12-v2", + model_id, + task="auto", + tokenizer=model_id, tokenizer_mode="auto", trust_remote_code=False, seed=0, diff --git a/vllm/config.py b/vllm/config.py index 88bde618a65d3..b7be41517bbb7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -181,7 +181,6 @@ def __init__(self, self.hf_text_config = get_hf_text_config(self.hf_config) self.pooling_config = self.get_pooling_config() self.bert_config = self._get_bert_config() - self.do_lower_case = self._get_bert_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) @@ -253,11 +252,8 @@ def _init_multimodal_config( return None def _get_bert_config(self): - bert_config = get_sentence_transformer_tokenizer_config( + return get_sentence_transformer_tokenizer_config( self.model, self.revision) - if bert_config is not None: - return bert_config - return None def _init_attention_free(self) -> bool: architectures = getattr(self.hf_config, "architectures", []) @@ -422,10 +418,12 @@ def _verify_bnb_config(self) -> None: "fallback to the eager mode.") self.enforce_eager = True - def get_pooling_config(self) -> PoolingConfig: + def get_pooling_config(self) -> Optional[PoolingConfig]: pooling_config = get_pooling_config(self.model, self.revision) - return PoolingConfig(pooling_config["pooling_type"], - pooling_config["normalize"]) + if pooling_config is not None: + return PoolingConfig(pooling_config["pooling_type"], + pooling_config["normalize"]) + return None def verify_async_output_proc(self, parallel_config, speculative_config, device_config) -> None: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8383682d977e9..c636d50627ffb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -256,42 +256,29 @@ def __init__( "use_async_output_proc=%s, use_cached_outputs=%s, " "pooling_config_type=%s, normalize=%s, " "chat_template_text_format=%s, mm_processor_kwargs=%s)", - VLLM_VERSION, - model_config.model, - speculative_config, - model_config.tokenizer, - model_config.skip_tokenizer_init, - model_config.tokenizer_mode, - model_config.revision, - model_config.override_neuron_config, - model_config.rope_scaling, - model_config.rope_theta, - model_config.tokenizer_revision, - model_config.trust_remote_code, - model_config.dtype, - model_config.max_model_len, - load_config.download_dir, - 
load_config.load_format, - parallel_config.tensor_parallel_size, + VLLM_VERSION, model_config.model, speculative_config, + model_config.tokenizer, model_config.skip_tokenizer_init, + model_config.tokenizer_mode, model_config.revision, + model_config.override_neuron_config, model_config.rope_scaling, + model_config.rope_theta, model_config.tokenizer_revision, + model_config.trust_remote_code, model_config.dtype, + model_config.max_model_len, load_config.download_dir, + load_config.load_format, parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, + model_config.quantization, model_config.enforce_eager, + cache_config.cache_dtype, model_config.quantization_param_path, + device_config.device, decoding_config, observability_config, + model_config.seed, model_config.served_model_name, scheduler_config.num_scheduler_steps, scheduler_config.chunked_prefill_enabled, scheduler_config.multi_step_stream_outputs, cache_config.enable_prefix_caching, - model_config.use_async_output_proc, - use_cached_outputs, - model_config.pooling_config.pooling_type, - model_config.pooling_config.normalize, + model_config.use_async_output_proc, use_cached_outputs, + model_config.pooling_config.pooling_type + if model_config.pooling_config is not None else None, + model_config.pooling_config.normalize + if model_config.pooling_config is not None else None, model_config.chat_template_text_format, model_config.mm_processor_kwargs ) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 133654e53c5d4..91c7b870cf671 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -123,7 +123,8 @@ def _get_model_initialization_kwargs( model_class: Type[nn.Module], lora_config: Optional[LoRAConfig], multimodal_config: Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig] = None) -> Dict[str, Any]: + scheduler_config: Optional[SchedulerConfig] = None, + pooling_config: Optional[PoolingConfig] = None) -> Dict[str, Any]: """Get extra kwargs for model initialization.""" extra_kwargs: Dict[str, Any] = {} @@ -145,6 +146,9 @@ def _get_model_initialization_kwargs( if has_inner_state(model_class) and scheduler_config: extra_kwargs["scheduler_config"] = scheduler_config + if pooling_config is not None: + extra_kwargs["pooling_config"] = pooling_config + return extra_kwargs @@ -159,12 +163,12 @@ def build_model(model_class: Type[nn.Module], pooling_config: Optional[PoolingConfig] = None) -> nn.Module: extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, multimodal_config, - scheduler_config) + scheduler_config, + pooling_config) return model_class(config=hf_config, cache_config=cache_config, quant_config=quant_config, - pooling_config=pooling_config, **extra_kwargs) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index a92146eafffd9..513541acaeeff 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -9,13 +9,13 @@ from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) +from transformers import GenerationConfig, PretrainedConfig from 
 from transformers.models.auto.image_processing_auto import (
     get_image_processor_config)
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
-from transformers import GenerationConfig, PretrainedConfig
 
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
@@ -256,7 +256,7 @@ def get_hf_file_to_dict(file_name, model, revision):
             hf_hub_file = hf_hub_download(model, file_name, revision=revision)
         except (RepositoryNotFoundError, RevisionNotFoundError,
                 EntryNotFoundError, LocalEntryNotFoundError) as e:
-            logger.info("File or repository not found in hf_hub_download", e)
+            logger.debug("File or repository not found in hf_hub_download", e)
             return None
         file_path = Path(hf_hub_file)
 
@@ -285,6 +285,9 @@ def get_pooling_config(model, revision='main'):
 
     modules_file_name = "modules.json"
     modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
 
+    if modules_dict is None:
+        return None
+
     pooling = next((item for item in modules_dict
                     if item["type"] == "sentence_transformers.models.Pooling"),
                    None)

From 69222e4c2c6785682a558176139eb3a288428193 Mon Sep 17 00:00:00 2001
From: Flavia Beo
Date: Fri, 25 Oct 2024 11:02:36 -0300
Subject: [PATCH 08/10] Extra check for if the files exists

Signed-off-by: Flavia Beo
---
 vllm/engine/llm_engine.py         |  3 +-
 vllm/transformers_utils/config.py | 62 ++++++++++++++++++++-----------
 2 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c636d50627ffb..06e3cdad2d0cf 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -280,8 +280,7 @@ def __init__(
             model_config.pooling_config.normalize
             if model_config.pooling_config is not None else None,
             model_config.chat_template_text_format,
-            model_config.mm_processor_kwargs
-        )
+            model_config.mm_processor_kwargs)
         # TODO(woosuk): Print more configs in debug mode.
         self.model_config = model_config
         self.cache_config = cache_config
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 513541acaeeff..65a7930824fae 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -205,7 +205,7 @@ def get_config(
             raise e
 
     elif config_format == ConfigFormat.MISTRAL:
-        config = load_params_config(model, revision)
+        config = load_params_config(model, revision, token=kwargs.get("token"))
     else:
         raise ValueError(f"Unsupported config format: {config_format}")
 
@@ -235,7 +235,10 @@ def get_config(
     return config
 
 
-def get_hf_file_to_dict(file_name, model, revision):
+def get_hf_file_to_dict(file_name,
+                        model,
+                        revision,
+                        token: Optional[str] = None):
     """
     Downloads a file from the Hugging Face Hub and returns
     its contents as a dictionary.
@@ -244,6 +247,7 @@ def get_hf_file_to_dict(file_name, model, revision):
     - file_name (str): The name of the file to download.
    - model (str): The name of the model on the Hugging Face Hub.
    - revision (str): The specific version of the model.
+    - token (str): The Hugging Face authentication token.
 
     Returns:
     - config_dict (dict): A dictionary containing
@@ -251,22 +255,31 @@
     """
     file_path = Path(model) / file_name
 
-    if not file_path.is_file():
-        try:
-            hf_hub_file = hf_hub_download(model, file_name, revision=revision)
-        except (RepositoryNotFoundError, RevisionNotFoundError,
-                EntryNotFoundError, LocalEntryNotFoundError) as e:
-            logger.debug("File or repository not found in hf_hub_download", e)
-            return None
-        file_path = Path(hf_hub_file)
-
-    with open(file_path, "r") as file:
-        config_dict = json.load(file)
+    if file_or_path_exists(model=model,
+                           config_name=file_name,
+                           revision=revision,
+                           token=token):
 
-    return config_dict
+        if not file_path.is_file():
+            try:
+                hf_hub_file = hf_hub_download(model,
+                                              file_name,
+                                              revision=revision)
+            except (RepositoryNotFoundError, RevisionNotFoundError,
+                    EntryNotFoundError, LocalEntryNotFoundError) as e:
+                logger.debug("File or repository not found in hf_hub_download",
+                             e)
+                return None
+            file_path = Path(hf_hub_file)
+
+        with open(file_path, "r") as file:
+            config_dict = json.load(file)
+
+        return config_dict
+    return None
 
 
-def get_pooling_config(model, revision='main'):
+def get_pooling_config(model, revision='main', token: Optional[str] = None):
     """
     This function gets the pooling and normalize
     config from the model - only applies to
@@ -283,7 +296,8 @@
 
     modules_file_name = "modules.json"
-    modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
+    modules_dict = get_hf_file_to_dict(modules_file_name, model, revision,
+                                       token)
 
     if modules_dict is None:
         return None
 
@@ -299,7 +313,8 @@
 
     if pooling:
         pooling_file_name = "{}/config.json".format(pooling["path"])
-        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
+        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision,
+                                           token)
         pooling_type_name = next(
             (item for item, val in pooling_dict.items() if val is True), None)
 
@@ -308,7 +323,9 @@
     return None
 
 
-def get_sentence_transformer_tokenizer_config(model, revision='main'):
+def get_sentence_transformer_tokenizer_config(model,
+                                              revision='main',
+                                              token: Optional[str] = None):
     """
     Returns the tokenization configuration dictionary for a
     given Sentence Transformer BERT model.
@@ -318,6 +335,7 @@
     BERT model.
     - revision (str, optional): The revision of the model to use.
     Defaults to 'main'.
+    - token (str): A Hugging Face access token.
 
     Returns:
     - dict: A dictionary containing the configuration parameters
@@ -406,13 +424,15 @@ def _reduce_modelconfig(mc: ModelConfig):
                 exc_info=e)
 
 
-def load_params_config(model, revision) -> PretrainedConfig:
+def load_params_config(model,
+                       revision,
+                       token: Optional[str] = None) -> PretrainedConfig:
     # This function loads a params.json config which
     # should be used when loading models in mistral format
 
     config_file_name = "params.json"
 
-    config_dict = get_hf_file_to_dict(config_file_name, model, revision)
+    config_dict = get_hf_file_to_dict(config_file_name, model, revision, token)
 
     config_mapping = {
         "dim": "hidden_size",

From 32ee574a3b36d9b73234c2c00df0b0fb7f96545d Mon Sep 17 00:00:00 2001
From: Flavia Beo
Date: Fri, 25 Oct 2024 17:37:11 -0300
Subject: [PATCH 09/10] Reverts whitespaces

Signed-off-by: Flavia Beo
---
 vllm/config.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index b7be41517bbb7..adb084ca88aba 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1755,7 +1755,6 @@ def _get_and_verify_max_len(
         "max_seq_length",
         "seq_len",
     ]
-
     # Choose the smallest "max_length" from the possible keys.
     max_len_key = None
     for key in possible_keys:
@@ -1854,7 +1853,6 @@ def _get_and_verify_max_len(
             raise ValueError(
                 f"{msg} To allow overriding this maximum, set "
                 "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
-
     return int(max_model_len)
 
 

From 40256278281ae1fdd84e3c8f8ccbc1c6489c0027 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Mon, 28 Oct 2024 10:22:42 -0300
Subject: [PATCH 10/10] add pooling_config to models with a Pooler layer

Signed-off-by: Max de Bayser
---
 vllm/model_executor/models/gemma2.py     | 11 +++++++++--
 vllm/model_executor/models/llama.py      | 11 +++++++++--
 vllm/model_executor/models/llava_next.py | 13 ++++++++++---
 vllm/model_executor/models/phi3v.py      | 13 ++++++++++---
 vllm/model_executor/models/qwen2_rm.py   | 11 +++++++++--
 5 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index d79248f93f5ae..6e62ef28926fd 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -31,7 +31,8 @@
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import (Pooler, PoolingConfig,
+                                               PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
@@ -473,12 +474,18 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP):
 
     def __init__(
         self,
+        pooling_config: Optional[PoolingConfig] = None,
         **kwargs,
     ) -> None:
         super().__init__()
 
         self.model = Gemma2Model(**kwargs)
-        self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        if pooling_config is not None:
+            self._pooler = Pooler(pooling_config.pooling_type,
+                                  pooling_config.normalize)
+        else:
+            self._pooler = Pooler(pooling_type=PoolingType.LAST,
+                                  normalize=True)
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index c346e3e808e3f..c3c992cf2e17f 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -38,7 +38,8 @@
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import (Pooler, PoolingConfig,
+                                               PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     get_compressed_tensors_cache_scale)
@@ -627,12 +628,18 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP):
 
     def __init__(
         self,
+        pooling_config: Optional[PoolingConfig] = None,
         **kwargs,
     ) -> None:
         super().__init__()
 
         self.model = LlamaModel(**kwargs)
-        self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        if pooling_config is not None:
+            self._pooler = Pooler(pooling_config.pooling_type,
+                                  pooling_config.normalize)
+        else:
+            self._pooler = Pooler(pooling_type=PoolingType.LAST,
+                                  normalize=True)
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 2a582deeaa2c9..7cb719f5c57aa 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -13,7 +13,8 @@
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
 from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import (Pooler, PoolingConfig,
+                                               PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.pooling_metadata import PoolingMetadata
@@ -285,7 +286,8 @@ def __init__(self,
                  config: LlavaNextConfig,
                  multimodal_config: MultiModalConfig,
                  cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 quant_config: Optional[QuantizationConfig] = None,
+                 pooling_config: Optional[PoolingConfig] = None) -> None:
         super().__init__()
 
         self.config = config
@@ -306,7 +308,12 @@ def __init__(self,
 
         # The same model class supports both language generation and embedding
         # because the architecture name is the same
-        self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        if pooling_config is not None:
+            self._pooler = Pooler(pooling_config.pooling_type,
+                                  pooling_config.normalize)
+        else:
+            self._pooler = Pooler(pooling_type=PoolingType.LAST,
+                                  normalize=True)
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 855a9b17585a4..6e8b323e89fb4 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -30,7 +30,8 @@
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
                          token_inputs)
 from vllm.logger import init_logger
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import (Pooler, PoolingConfig,
+                                               PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -525,7 +526,8 @@ def __init__(self,
                  config: PretrainedConfig,
                  multimodal_config: MultiModalConfig,
                  cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None) -> None:
+                 quant_config: Optional[QuantizationConfig] = None,
+                 pooling_config: Optional[PoolingConfig] = None) -> None:
         super().__init__()
 
         self.config = config
@@ -547,7 +549,12 @@ def __init__(self,
 
         # The same model class supports both language generation and embedding
         # because the architecture name is the same
-        self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        if pooling_config is not None:
+            self._pooler = Pooler(pooling_config.pooling_type,
+                                  pooling_config.normalize)
+        else:
+            self._pooler = Pooler(pooling_type=PoolingType.LAST,
+                                  normalize=True)
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index ee0eeb9db3808..3493aeffd2062 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -14,7 +14,8 @@
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
+from vllm.model_executor.layers.pooler import (Pooler, PoolingConfig,
+                                               PoolingType)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.sequence import IntermediateTensors, PoolerOutput
@@ -64,6 +65,7 @@ def __init__(
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
+        pooling_config: Optional[PoolingConfig] = None,
     ) -> None:
         # TODO (@robertgshaw2): see if this can be moved out
         if (cache_config.sliding_window is not None
@@ -93,7 +95,12 @@ def __init__(
             RowParallelLinear(config.hidden_size,
                               1,
                               quant_config=quant_config),
         )
-        self._pooler = Pooler(pooling_type=PoolingType.ALL, normalize=False)
+        if pooling_config is not None:
+            self._pooler = Pooler(pooling_config.pooling_type,
+                                  pooling_config.normalize)
+        else:
+            self._pooler = Pooler(pooling_type=PoolingType.ALL,
+                                  normalize=False)
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
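
Taken together, the series replaces hard-coded pooling with configuration discovered from the Sentence Transformers metadata shipped alongside a checkpoint. The following is a minimal standalone sketch, not code from the patches, of that discovery step for a locally downloaded repo. Assumptions: the hf_hub_download fallback and the revision/token plumbing added above are omitted, and deriving the normalize flag from a sentence_transformers.models.Normalize entry is inferred behavior that this excerpt does not show.

# --- Illustrative sketch, not part of the patch series ---
# Mirrors what get_pooling_config() derives from a Sentence Transformers
# repo, restricted to a local directory: no hf_hub_download fallback and no
# revision/token handling. The Normalize-stage check is an assumption; the
# patches above do not show how "normalize" is derived.
import json
from pathlib import Path
from typing import Optional, Tuple


def read_st_pooling_settings(model_dir: str) -> Optional[Tuple[str, bool]]:
    modules_path = Path(model_dir) / "modules.json"
    if not modules_path.is_file():
        return None  # same effect as the "if modules_dict is None" guard
    modules = json.loads(modules_path.read_text())

    # modules.json lists the pipeline stages; the Pooling stage points at a
    # subdirectory (conventionally "1_Pooling") holding its own config.json.
    pooling = next((m for m in modules
                    if m["type"] == "sentence_transformers.models.Pooling"),
                   None)
    if pooling is None:
        return None
    pooling_cfg = json.loads(
        (Path(model_dir) / pooling["path"] / "config.json").read_text())

    # Exactly one pooling_mode_* flag is expected to be True.
    mode = next((k for k, v in pooling_cfg.items() if v is True), None)
    # Assumption: normalization is signaled by a Normalize pipeline stage.
    normalize = any(m["type"] == "sentence_transformers.models.Normalize"
                    for m in modules)
    return mode, normalize

For a local checkout of sentence-transformers/all-MiniLM-L12-v2, the model exercised in tests/test_config.py, this should yield ("pooling_mode_mean_tokens", True).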
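
The consumption half is the pattern patch 10 repeats across gemma2, llama, llava_next, phi3v and qwen2_rm: prefer a discovered PoolingConfig, otherwise keep the model's previous hard-coded default. Below is a behavioral sketch under two stated simplifications: Pooler is a stand-in dataclass rather than vLLM's real layer, and get_pooling_type() is rewritten as a plain iteration that matches the patch's __dict__-based lookup for the Sentence Transformers flag names.

# --- Illustrative sketch, not part of the patch series ---
from dataclasses import dataclass
from enum import IntEnum
from typing import Optional


class PoolingType(IntEnum):
    # MEAN and MAX are the members added by this series.
    LAST = 0
    ALL = 1
    CLS = 2
    MEAN = 3
    MAX = 4


class PoolingConfig:
    # Same observable behavior as the patch's class: match an enum name
    # (lower-cased) inside the flag name, defaulting to CLS (enum value 2).
    def __init__(self, pooling_type: str, normalize: bool):
        self.pooling_type = self.get_pooling_type(pooling_type)
        self.normalize = normalize

    @staticmethod
    def get_pooling_type(pooling_type_name: str) -> PoolingType:
        return next((m for m in PoolingType
                     if m.name.lower() in pooling_type_name),
                    PoolingType.CLS)


@dataclass
class Pooler:
    # Stand-in for vllm.model_executor.layers.pooler.Pooler.
    pooling_type: PoolingType
    normalize: bool


def build_pooler(pooling_config: Optional[PoolingConfig],
                 default_type: PoolingType,
                 default_normalize: bool) -> Pooler:
    # The fallback repeated in each embedding model touched by patch 10.
    if pooling_config is not None:
        return Pooler(pooling_config.pooling_type, pooling_config.normalize)
    return Pooler(default_type, default_normalize)


# e.g. all-MiniLM-L12-v2 advertises "pooling_mode_mean_tokens":
cfg = PoolingConfig("pooling_mode_mean_tokens", normalize=True)
assert build_pooler(cfg, PoolingType.LAST, True).pooling_type == PoolingType.MEAN
# A model without Sentence Transformers metadata keeps its old default:
assert build_pooler(None, PoolingType.LAST, True).pooling_type == PoolingType.LAST

Keeping the fallback inside each model rather than centralizing it preserves the previous defaults exactly: LAST pooling for the decoder-based embedding models, and ALL without normalization for the Qwen2 reward model.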