test #18848

Closed (wants to merge 2 commits)
76 changes: 46 additions & 30 deletions vllm/config.py
@@ -25,51 +25,62 @@
model_validator)
from pydantic.dataclasses import dataclass
from torch.distributed import ProcessGroup, ReduceOp
from transformers import PretrainedConfig
from typing_extensions import deprecated, runtime_checkable

import vllm.envs as envs
from vllm import version
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
QuantizationMethods,
get_quantization_config)
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.tracing import is_otel_available, otel_import_error_traceback
from vllm.transformers_utils.config import (
ConfigFormat, get_config, get_hf_image_processor_config,
get_hf_text_config, get_pooling_config,
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
try_get_generation_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
# yapf conflicts with isort for this block
# yapf: disable
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, get_open_port, is_torch_equal_or_newer,
random_uuid, resolve_obj_by_qualname)
LayerBlockType, LazyLoader,
cuda_device_count_stateless, get_cpu_memory,
get_open_port, is_torch_equal_or_newer, random_uuid,
resolve_obj_by_qualname)

# yapf: enable

if TYPE_CHECKING:
from _typeshed import DataclassInstance
from ray.util.placement_group import PlacementGroup
from transformers.configuration_utils import PretrainedConfig

import vllm.model_executor.layers.quantization as me_quant
import vllm.model_executor.models as me_models
from vllm.executor.executor_base import ExecutorBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader import BaseModelLoader
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

ConfigType = type[DataclassInstance]
HfOverrides = Union[dict, Callable[[type], type]]
else:
PlacementGroup = Any
ExecutorBase = Any
QuantizationConfig = Any
QuantizationMethods = Any
BaseModelLoader = Any
TensorizerConfig = Any
ConfigType = type
HfOverrides = Union[dict[str, Any], Callable[[type], type]]

me_quant = LazyLoader("model_executor", globals(),
"vllm.model_executor.layers.quantization")
me_models = LazyLoader("model_executor", globals(),
"vllm.model_executor.models")

logger = init_logger(__name__)
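
The hunk above moves `PretrainedConfig`, `QuantizationMethods`, `ModelRegistry`, and related names out of module-level imports: type checkers still see them under `TYPE_CHECKING`, while at runtime the quantization and model-registry packages are loaded on first use through `LazyLoader`, and names that only appear in annotations fall back to `Any`. A minimal, self-contained sketch of that pattern follows; the `_LazyModule` proxy and the use of `json` as the stand-in heavy module are illustrative assumptions, not vLLM's actual `LazyLoader` implementation.

```python
# Minimal sketch of the deferred-import pattern used in this diff; the
# _LazyModule class and the choice of `json` as the "heavy" module are
# illustrative, not vLLM's actual LazyLoader.
import importlib
import types
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    # Seen only by type checkers; never imported at runtime.
    from json import JSONEncoder
else:
    JSONEncoder = Any  # runtime placeholder so annotations still resolve


class _LazyModule(types.ModuleType):
    """Proxy that imports the wrapped module on first attribute access."""

    def __init__(self, name: str):
        super().__init__(name)
        self._name = name
        self._module = None

    def __getattr__(self, item: str):
        # Called only for attributes not found on the proxy itself.
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, item)


heavy = _LazyModule("json")  # nothing imported yet


def encode(obj, encoder: Optional["JSONEncoder"] = None) -> str:
    # The real import happens here, on first attribute access.
    return heavy.dumps(obj, cls=encoder)


print(encode({"lazy": True}))
```

Because the real classes only exist for the type checker, annotations that reference them are written as strings (forward references), which is why several annotations later in this diff gain quotes, e.g. `Optional["QuantizationMethods"]`.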

@@ -96,9 +107,6 @@
for task in tasks
}

HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
PretrainedConfig]]


@runtime_checkable
class SupportsHash(Protocol):
@@ -304,7 +312,7 @@ class ModelConfig:
- 25.6k -> 25,600"""
spec_target_max_model_len: Optional[int] = None
"""Specify the maximum length for spec decoding draft models."""
quantization: Optional[QuantizationMethods] = None
quantization: Optional["QuantizationMethods"] = None
"""Method used to quantize the weights. If `None`, we first check the
`quantization_config` attribute in the model config file. If that is
`None`, we assume the model weights are not quantized and use `dtype` to
@@ -629,7 +637,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":

@property
def registry(self):
return ModelRegistry
return me_models.ModelRegistry

@property
def architectures(self) -> list[str]:
@@ -638,7 +646,7 @@ def architectures(self) -> list[str]:
def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from S3 to temporary directory when needed.

Args:
model: Model name or path
tokenizer: Tokenizer name or path
@@ -841,14 +849,15 @@ def _parse_quant_hf_config(self):
return quant_cfg

def _verify_quantization(self) -> None:
supported_quantization = QUANTIZATION_METHODS
supported_quantization = me_quant.QUANTIZATION_METHODS
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
]
if self.quantization is not None:
self.quantization = cast(QuantizationMethods, self.quantization)
self.quantization = cast(me_quant.QuantizationMethods,
self.quantization)

# Parse quantization method from the HF model config, if available.
quant_cfg = self._parse_quant_hf_config()
@@ -882,14 +891,14 @@ def _verify_quantization(self) -> None:

# Detect which checkpoint it is
for name in quantization_methods:
method = get_quantization_config(name)
method = me_quant.get_quantization_config(name)
quantization_override = method.override_quantization_method(
quant_cfg, self.quantization)
if quantization_override is not None:
# Raise error if the override is not custom (custom would
# be in QUANTIZATION_METHODS but not QuantizationMethods)
# and hasn't been added to the overrides list.
if (name in get_args(QuantizationMethods)
if (name in get_args(me_quant.QuantizationMethods)
and name not in overrides):
raise ValueError(
f"Quantization method {name} is an override but "
@@ -924,6 +933,8 @@ def _verify_quantization(self) -> None:
"non-quantized models.", self.quantization)

def _verify_cuda_graph(self) -> None:
from vllm.platforms import current_platform

self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len)
# CUDAGraph capture not supported for enc-dec models and mllama on ROCm
@@ -1389,7 +1400,7 @@ def runner_type(self) -> RunnerType:
@property
def is_v1_compatible(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.is_v1_compatible(architectures)
return me_models.ModelRegistry.is_v1_compatible(architectures)

@property
def is_matryoshka(self) -> bool:
@@ -1806,7 +1817,7 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
return dp_group

@staticmethod
def has_unfinished_dp(dp_group: "ProcessGroup",
def has_unfinished_dp(dp_group: ProcessGroup,
has_unfinished: bool) -> bool:
tensor = torch.tensor([has_unfinished],
dtype=torch.int32,
@@ -2233,9 +2244,9 @@ class DeviceConfig:

device: Union[Device, torch.device] = "auto"
"""Device type for vLLM execution.
This parameter is deprecated and will be
removed in a future release.
It will now be set automatically based
This parameter is deprecated and will be
removed in a future release.
It will now be set automatically based
on the current platform."""
device_type: str = field(init=False)
"""Device type from the current platform. This is set in
@@ -2327,7 +2338,7 @@ class SpeculativeConfig:
according to the log probability settings in SamplingParams."""

# Draft model configuration
quantization: Optional[QuantizationMethods] = None
quantization: Optional[me_quant.QuantizationMethods] = None
"""Quantization method that was used to quantize the draft model weights.
If `None`, we assume the model weights are not quantized. Note that it only
takes effect when using the draft model-based speculative method."""
@@ -2418,7 +2429,8 @@ def from_dict(cls, dict_value: dict) -> "SpeculativeConfig":
return cls(**dict_value)

@staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
def hf_config_override(
hf_config: "PretrainedConfig") -> "PretrainedConfig":
if hf_config.model_type == "deepseek_v3":
hf_config.model_type = "deepseek_mtp"
if hf_config.model_type == "deepseek_mtp":
@@ -2661,7 +2673,7 @@ def _maybe_override_draft_max_model_len(
def _verify_and_get_draft_tp(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: Optional[int],
draft_hf_config: PretrainedConfig) -> int:
draft_hf_config: "PretrainedConfig") -> int:
"""
Verifies and adjusts the tensor parallel size for a draft model
specified using speculative_draft_tensor_parallel_size.
@@ -3068,7 +3080,7 @@ def compute_hash(self) -> str:


def _get_and_verify_dtype(
config: PretrainedConfig,
config: "PretrainedConfig",
dtype: Union[str, torch.dtype],
) -> torch.dtype:
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
@@ -3159,14 +3171,16 @@ def _get_and_verify_dtype(


def _get_and_verify_max_len(
hf_config: PretrainedConfig,
hf_config: "PretrainedConfig",
max_model_len: Optional[int],
disable_sliding_window: bool,
sliding_window_len: Optional[Union[int, list[Optional[int]]]],
spec_target_max_model_len: Optional[int] = None,
encoder_config: Optional[Any] = None,
) -> int:
"""Get and verify the model's maximum length."""
from vllm.platforms import current_platform

derived_max_model_len = float("inf")
possible_keys = [
# OPT
@@ -3498,6 +3512,8 @@ def compute_hash(self) -> str:
return hash_str

def __post_init__(self):
from vllm.tracing import is_otel_available, otel_import_error_traceback

if (self.collect_detailed_traces is not None
and len(self.collect_detailed_traces) == 1
and "," in self.collect_detailed_traces[0]):
@@ -4259,7 +4275,7 @@ def get_quantization_config(

def with_hf_config(
self,
hf_config: PretrainedConfig,
hf_config: "PretrainedConfig",
architectures: Optional[list[str]] = None,
) -> "VllmConfig":
if architectures is not None:
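
Beyond the module-level changes, several call sites in this diff (`_verify_cuda_graph`, `_get_and_verify_max_len`, `__post_init__`) pull imports such as `vllm.platforms.current_platform` and the tracing helpers into the function body, so their cost is only paid when those code paths actually run. One rough way to gauge the effect of this kind of change is to time a cold import; the helper below is a hedged sketch (the module names passed to it are placeholders, not part of this PR), and CPython's built-in `python -X importtime` gives a finer per-module breakdown.

```python
# Rough sketch for measuring cold-import cost; module names are placeholders.
import importlib
import sys
import time


def time_cold_import(module_name: str) -> float:
    """Return seconds spent importing `module_name` with no cached copy."""
    sys.modules.pop(module_name, None)  # drop any cached module first
    start = time.perf_counter()
    importlib.import_module(module_name)
    return time.perf_counter() - start


if __name__ == "__main__":
    # For a real comparison one might import the config module before and
    # after this PR, e.g. `python -X importtime -c "import vllm.config"`.
    print(f"json: {time_cold_import('json') * 1e3:.2f} ms")
```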