
Commit

Merge branch 'vllm-project:main' into hidden_states_fix
abhigoyal1997 authored Aug 18, 2024
2 parents 08b3cd5 + ab7165f commit 5815ccc
Showing 32 changed files with 256 additions and 286 deletions.
8 changes: 6 additions & 2 deletions tests/entrypoints/openai/test_audio.py
@@ -86,8 +86,12 @@ def forward(

ModelRegistry.register_model("OPTForCausalLM", FakeAudioModel)

with patch("vllm.entrypoints.chat_utils._mm_token_str",
lambda *_, **__: "_"):
with patch(
"vllm.entrypoints.chat_utils._mm_token_str",
lambda *_, **__: "_"), patch(
"vllm.model_executor.models.ModelRegistry.is_multimodal_model"
) as mock:
mock.return_value = True
sys.argv = ["placeholder.py"] + \
(f"--model {MODEL_NAME} --gpu-memory-utilization 0.10 "
"--dtype bfloat16 --enforce-eager --api-key token-abc123 "
18 changes: 9 additions & 9 deletions tests/multimodal/test_mapper.py
@@ -4,7 +4,7 @@
import pytest
from transformers import CLIPImageProcessor, LlavaNextImageProcessor

-from vllm.config import ModelConfig, MultiModalConfig
+from vllm.config import ModelConfig
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size

@@ -30,10 +30,10 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
seed=0,
dtype=dtype,
revision=None,
+        limit_mm_per_prompt={"image": 1},
    )
-    mm_config = MultiModalConfig(limit_per_prompt={"image": 1})

-    mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
+    mm_registry.init_mm_limits_per_prompt(model_config)

for asset in image_assets:
image = rescale_image_size(asset.pil_image, size_factor)
@@ -73,10 +73,10 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
seed=0,
dtype=dtype,
revision=None,
+        limit_mm_per_prompt={"image": 1},
    )
-    mm_config = MultiModalConfig(limit_per_prompt={"image": 1})

-    mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
+    mm_registry.init_mm_limits_per_prompt(model_config)

for asset in image_assets:
image = rescale_image_size(asset.pil_image, size_factor)
@@ -115,10 +115,10 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
seed=0,
dtype="half",
revision=None,
+        limit_mm_per_prompt={"image": limit},
    )
-    mm_config = MultiModalConfig(limit_per_prompt={"image": limit})

-    mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
+    mm_registry.init_mm_limits_per_prompt(model_config)

image = image_assets[0].pil_image
if num_images == 0:
@@ -145,10 +145,10 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
seed=0,
dtype="half",
revision=None,
+        limit_mm_per_prompt={"image": num_images},
    )
-    mm_config = MultiModalConfig(limit_per_prompt={"image": num_images})

-    mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
+    mm_registry.init_mm_limits_per_prompt(model_config)

image = image_assets[0].pil_image
mm_inputs = {"image": [image] * num_images}
2 changes: 1 addition & 1 deletion tests/utils.py
@@ -56,7 +56,7 @@ def _nvml():

class RemoteOpenAIServer:
DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
-    MAX_START_WAIT_S = 120  # wait for server to start for 120 seconds
+    MAX_START_WAIT_S = 240  # wait for server to start for 240 seconds

def __init__(
self,
39 changes: 34 additions & 5 deletions vllm/config.py
@@ -36,6 +36,7 @@
"AquilaForCausalLM",
"DeepseekV2ForCausalLM",
"InternLMForCausalLM",
"JAISLMHeadModel",
"LlamaForCausalLM",
"LLaMAForCausalLM",
"MistralForCausalLM",
@@ -108,6 +109,8 @@ class ModelConfig:
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
+        limit_mm_per_prompt: Maximum number of data instances per modality
+            per prompt. Only applicable for multimodal models.
"""

def __init__(
Expand All @@ -133,7 +136,7 @@ def __init__(
disable_sliding_window: bool = False,
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, List[str]]] = None,
-        multimodal_config: Optional["MultiModalConfig"] = None,
+        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
) -> None:
self.model = model
self.tokenizer = tokenizer
@@ -210,14 +213,29 @@ def __init__(
sliding_window_len=self.get_hf_config_sliding_window())
self.served_model_name = get_served_model_name(model,
served_model_name)
-        self.multimodal_config = multimodal_config

+        self.multimodal_config = self._init_multimodal_config(
+            limit_mm_per_prompt)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()

+    def _init_multimodal_config(
+        self, limit_mm_per_prompt: Optional[Mapping[str, int]]
+    ) -> Optional["MultiModalConfig"]:
+        architectures = getattr(self.hf_config, "architectures", [])
+        if any(
+                ModelRegistry.is_multimodal_model(arch)
+                for arch in architectures):
+            return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
+        else:
+            if limit_mm_per_prompt:
+                raise ValueError(
+                    "limit_mm_per_prompt is only supported for multimodal "
+                    "models.")
+            return None

def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
if tokenizer_mode not in ["auto", "slow"]:
@@ -466,6 +484,18 @@ def _get_num_seqlen_agnostic_layers(
if t != "attention"
])

+    def get_multimodal_config(self) -> "MultiModalConfig":
+        """
+        Get the multimodal configuration of the model.
+        Raises:
+            ValueError: If the model is not multimodal.
+        """
+        if self.multimodal_config is None:
+            raise ValueError("The model is not multimodal.")
+
+        return self.multimodal_config

@property
def is_encoder_decoder_model(self) -> bool:
"""Extract the HF encoder/decoder model flag."""
@@ -1449,7 +1479,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
class MultiModalConfig:
"""Controls the behavior of multimodal models."""

-    limit_per_prompt: Mapping[str, int]
+    limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
"""
The maximum number of multi-modal input instances allowed per prompt
for each :class:`~vllm.multimodal.MultiModalPlugin`.
@@ -1709,7 +1739,6 @@ class EngineConfig:
device_config: DeviceConfig
load_config: LoadConfig
lora_config: Optional[LoRAConfig]
-    multimodal_config: Optional[MultiModalConfig]
speculative_config: Optional[SpeculativeConfig]
decoding_config: Optional[DecodingConfig]
observability_config: Optional[ObservabilityConfig]
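
Taken together, the config.py changes move ownership of the multimodal settings into ModelConfig: _init_multimodal_config builds a MultiModalConfig only when the architecture is registered as multimodal, rejects limit_mm_per_prompt for text-only models, and get_multimodal_config gives callers a checked accessor. A behavior sketch under those rules (model names are illustrative only, not taken from this diff):

    from vllm.config import ModelConfig

    # Multimodal architecture: ModelConfig builds the MultiModalConfig itself.
    mm = ModelConfig("llava-hf/llava-1.5-7b-hf", tokenizer="llava-hf/llava-1.5-7b-hf",
                     tokenizer_mode="auto", trust_remote_code=False,
                     dtype="half", seed=0, limit_mm_per_prompt={"image": 2})
    assert mm.get_multimodal_config().limit_per_prompt == {"image": 2}

    # Text-only architecture: no MultiModalConfig is created, and passing
    # limit_mm_per_prompt here would raise ValueError.
    text_only = ModelConfig("facebook/opt-125m", tokenizer="facebook/opt-125m",
                            tokenizer_mode="auto", trust_remote_code=False,
                            dtype="half", seed=0)
    assert text_only.multimodal_config is None
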
9 changes: 3 additions & 6 deletions vllm/engine/arg_utils.py
@@ -7,7 +7,7 @@

from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         MultiModalConfig, ObservabilityConfig, ParallelConfig,
+                         ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig, TokenizerPoolConfig)
from vllm.executor.executor_base import ExecutorBase
@@ -765,9 +765,6 @@ def create_engine_config(self, ) -> EngineConfig:
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")

-        multimodal_config = MultiModalConfig(
-            limit_per_prompt=self.limit_mm_per_prompt or {})
-
device_config = DeviceConfig(device=self.device)
model_config = ModelConfig(
model=self.model,
@@ -791,7 +788,8 @@
disable_sliding_window=self.disable_sliding_window,
skip_tokenizer_init=self.skip_tokenizer_init,
served_model_name=self.served_model_name,
-            multimodal_config=multimodal_config)
+            limit_mm_per_prompt=self.limit_mm_per_prompt,
+        )
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
@@ -970,7 +968,6 @@ def create_engine_config(self, ) -> EngineConfig:
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
-            multimodal_config=multimodal_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
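
The user-facing knob is unchanged: EngineArgs still accepts limit_mm_per_prompt and now simply forwards the mapping into ModelConfig instead of wrapping it in a MultiModalConfig itself. A hedged usage sketch, assuming the usual LLM entry point that passes keyword arguments through EngineArgs:

    from vllm import LLM

    # limit_mm_per_prompt flows LLM -> EngineArgs -> ModelConfig; ModelConfig
    # decides whether a MultiModalConfig is warranted for the chosen model.
    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
              limit_mm_per_prompt={"image": 2})
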
7 changes: 1 addition & 6 deletions vllm/engine/llm_engine.py
@@ -10,7 +10,7 @@
import vllm.envs as envs
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         MultiModalConfig, ObservabilityConfig, ParallelConfig,
+                         ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
@@ -100,8 +100,6 @@ class LLMEngine:
scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
lora_config (Optional): The configuration related to serving multi-LoRA.
-        multimodal_config (Optional): The configuration related to multimodal
-            models.
speculative_config (Optional): The configuration related to speculative
decoding.
executor_class: The model executor class for managing distributed
@@ -172,7 +170,6 @@ def __init__(
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
speculative_config: Optional[SpeculativeConfig],
decoding_config: Optional[DecodingConfig],
observability_config: Optional[ObservabilityConfig],
@@ -235,7 +232,6 @@ def __init__(
self.model_config = model_config
self.cache_config = cache_config
self.lora_config = lora_config
-        self.multimodal_config = multimodal_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
@@ -278,7 +274,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
-            multimodal_config=multimodal_config,
speculative_config=speculative_config,
load_config=load_config,
prompt_adapter_config=prompt_adapter_config,
1 change: 0 additions & 1 deletion vllm/executor/cpu_executor.py
@@ -141,7 +141,6 @@ def _create_worker(
rank=rank,
distributed_init_method=self.distributed_init_method,
lora_config=self.lora_config,
-            multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=rank == 0,
6 changes: 2 additions & 4 deletions vllm/executor/executor_base.py
@@ -2,8 +2,8 @@
from typing import List, Optional, Set, Tuple

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, MultiModalConfig, ObservabilityConfig,
-                         ParallelConfig, PromptAdapterConfig, SchedulerConfig,
+                         ModelConfig, ObservabilityConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -29,7 +29,6 @@ def __init__(
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
speculative_config: Optional[SpeculativeConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
observability_config: Optional[ObservabilityConfig],
@@ -41,7 +40,6 @@
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
-        self.multimodal_config = multimodal_config
self.speculative_config = speculative_config
self.prompt_adapter_config = prompt_adapter_config
self.observability_config = observability_config
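
With the standalone argument removed from the executor base class (and from the concrete executors below), executor or worker code that still needs the limits can read them from the model config it already holds. A hypothetical helper, not part of this diff, illustrating the lookup:

    from typing import Mapping

    from vllm.config import ModelConfig

    def mm_limits_from_model_config(model_config: ModelConfig) -> Mapping[str, int]:
        # Hypothetical convenience function: pull per-prompt multimodal limits
        # off ModelConfig now that executors no longer receive a MultiModalConfig.
        mm_config = model_config.multimodal_config
        return mm_config.limit_per_prompt if mm_config is not None else {}
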
1 change: 0 additions & 1 deletion vllm/executor/gpu_executor.py
@@ -55,7 +55,6 @@ def _get_worker_kwargs(
rank=rank,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
-            multimodal_config=self.multimodal_config,
speculative_config=self.speculative_config,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=(not self.parallel_config)
1 change: 0 additions & 1 deletion vllm/executor/openvino_executor.py
@@ -49,7 +49,6 @@ def _init_worker(self):
rank=0,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
-            multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=True,
)
8 changes: 2 additions & 6 deletions vllm/executor/ray_xpu_executor.py
@@ -7,9 +7,8 @@

import vllm.envs as envs
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, MultiModalConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+                         ModelConfig, ParallelConfig, PromptAdapterConfig,
+                         SchedulerConfig, SpeculativeConfig)
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
DistributedGPUExecutor, DistributedGPUExecutorAsync)
from vllm.executor.ray_utils import RayWorkerWrapper, ray
@@ -46,7 +45,6 @@ def __init__(
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
@@ -61,7 +59,6 @@ def __init__(
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
-        self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config

placement_group = self.parallel_config.placement_group
@@ -203,7 +200,6 @@ def collect_arg_helper_func(**kwargs):
rank=rank,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
-                multimodal_config=self.multimodal_config,
is_driver_worker=rank == 0,
))
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
1 change: 0 additions & 1 deletion vllm/executor/tpu_executor.py
@@ -52,7 +52,6 @@ def _get_worker_kwargs(
local_rank=local_rank,
rank=rank,
distributed_init_method=distributed_init_method,
-            multimodal_config=self.multimodal_config,
is_driver_worker=rank == 0,
)

7 changes: 2 additions & 5 deletions vllm/executor/xpu_executor.py
@@ -3,9 +3,8 @@
import torch

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, MultiModalConfig, ParallelConfig,
-                         PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig)
+                         ModelConfig, ParallelConfig, PromptAdapterConfig,
+                         SchedulerConfig, SpeculativeConfig)
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
@@ -29,7 +28,6 @@ def __init__(
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
@@ -46,7 +44,6 @@
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
-        self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config
self.speculative_config = None

(Diffs for the remaining changed files are not shown here.)
