test #18848

Closed (wants to merge 2 commits)
76 changes: 46 additions & 30 deletions vllm/config.py
@@ -25,51 +25,62 @@
model_validator)
from pydantic.dataclasses import dataclass
from torch.distributed import ProcessGroup, ReduceOp
from transformers import PretrainedConfig
from typing_extensions import deprecated, runtime_checkable

import vllm.envs as envs
from vllm import version
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
QuantizationMethods,
get_quantization_config)
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.tracing import is_otel_available, otel_import_error_traceback
from vllm.transformers_utils.config import (
ConfigFormat, get_config, get_hf_image_processor_config,
get_hf_text_config, get_pooling_config,
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
try_get_generation_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
# yapf conflicts with isort for this block
# yapf: disable
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, get_open_port, is_torch_equal_or_newer,
random_uuid, resolve_obj_by_qualname)
LayerBlockType, LazyLoader,
cuda_device_count_stateless, get_cpu_memory,
get_open_port, is_torch_equal_or_newer, random_uuid,
resolve_obj_by_qualname)

# yapf: enable

if TYPE_CHECKING:
from _typeshed import DataclassInstance
from ray.util.placement_group import PlacementGroup
from transformers.configuration_utils import PretrainedConfig

import vllm.model_executor.layers.quantization as me_quant
import vllm.model_executor.models as me_models
from vllm.executor.executor_base import ExecutorBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader import BaseModelLoader
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

ConfigType = type[DataclassInstance]
HfOverrides = Union[dict, Callable[[type], type]]
else:
PlacementGroup = Any
ExecutorBase = Any
QuantizationConfig = Any
QuantizationMethods = Any
BaseModelLoader = Any
TensorizerConfig = Any
ConfigType = type
HfOverrides = Union[dict[str, Any], Callable[[type], type]]

me_quant = LazyLoader("model_executor", globals(),
"vllm.model_executor.layers.quantization")
me_models = LazyLoader("model_executor", globals(),
"vllm.model_executor.models")

logger = init_logger(__name__)
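
The hunk above moves `PretrainedConfig`, `QuantizationMethods`, `ModelRegistry`, and related names out of module-level imports: type checkers still see them under `TYPE_CHECKING`, while at runtime the quantization and model-registry packages are loaded on first use through `LazyLoader`, and names that only appear in annotations fall back to `Any`. A minimal, self-contained sketch of that pattern follows; the `_LazyModule` proxy and the use of `json` as the stand-in heavy module are illustrative assumptions, not vLLM's actual `LazyLoader` implementation.

```python
# Minimal sketch of the deferred-import pattern used in this diff; the
# _LazyModule class and the choice of `json` as the "heavy" module are
# illustrative, not vLLM's actual LazyLoader.
import importlib
import types
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    # Seen only by type checkers; never imported at runtime.
    from json import JSONEncoder
else:
    JSONEncoder = Any  # runtime placeholder so annotations still resolve


class _LazyModule(types.ModuleType):
    """Proxy that imports the wrapped module on first attribute access."""

    def __init__(self, name: str):
        super().__init__(name)
        self._name = name
        self._module = None

    def __getattr__(self, item: str):
        # Called only for attributes not found on the proxy itself.
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, item)


heavy = _LazyModule("json")  # nothing imported yet


def encode(obj, encoder: Optional["JSONEncoder"] = None) -> str:
    # The real import happens here, on first attribute access.
    return heavy.dumps(obj, cls=encoder)


print(encode({"lazy": True}))
```

Because the real classes only exist for the type checker, annotations that reference them are written as strings (forward references), which is why several annotations later in this diff gain quotes, e.g. `Optional["QuantizationMethods"]`.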

@@ -96,9 +107,6 @@
for task in tasks
}

HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
PretrainedConfig]]


@runtime_checkable
class SupportsHash(Protocol):
@@ -304,7 +312,7 @@ class ModelConfig:
- 25.6k -> 25,600"""
spec_target_max_model_len: Optional[int] = None
"""Specify the maximum length for spec decoding draft models."""
quantization: Optional[QuantizationMethods] = None
quantization: Optional["QuantizationMethods"] = None
"""Method used to quantize the weights. If `None`, we first check the
`quantization_config` attribute in the model config file. If that is
`None`, we assume the model weights are not quantized and use `dtype` to
@@ -629,7 +637,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":

@property
def registry(self):
return ModelRegistry
return me_models.ModelRegistry

@property
def architectures(self) -> list[str]:
@@ -638,7 +646,7 @@ def architectures(self) -> list[str]:
def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from S3 to temporary directory when needed.

Args:
model: Model name or path
tokenizer: Tokenizer name or path
@@ -841,14 +849,15 @@ def _parse_quant_hf_config(self):
return quant_cfg

def _verify_quantization(self) -> None:
supported_quantization = QUANTIZATION_METHODS
supported_quantization = me_quant.QUANTIZATION_METHODS
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
]
if self.quantization is not None:
self.quantization = cast(QuantizationMethods, self.quantization)
self.quantization = cast(me_quant.QuantizationMethods,
self.quantization)

# Parse quantization method from the HF model config, if available.
quant_cfg = self._parse_quant_hf_config()
@@ -882,14 +891,14 @@ def _verify_quantization(self) -> None:

# Detect which checkpoint it is
for name in quantization_methods:
method = get_quantization_config(name)
method = me_quant.get_quantization_config(name)
quantization_override = method.override_quantization_method(
quant_cfg, self.quantization)
if quantization_override is not None:
# Raise error if the override is not custom (custom would
# be in QUANTIZATION_METHODS but not QuantizationMethods)
# and hasn't been added to the overrides list.
if (name in get_args(QuantizationMethods)
if (name in get_args(me_quant.QuantizationMethods)
and name not in overrides):
raise ValueError(
f"Quantization method {name} is an override but "
@@ -924,6 +933,8 @@ def _verify_quantization(self) -> None:
"non-quantized models.", self.quantization)

def _verify_cuda_graph(self) -> None:
from vllm.platforms import current_platform

self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len)
# CUDAGraph capture not supported for enc-dec models and mllama on ROCm
@@ -1389,7 +1400,7 @@ def runner_type(self) -> RunnerType:
@property
def is_v1_compatible(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.is_v1_compatible(architectures)
return me_models.ModelRegistry.is_v1_compatible(architectures)

@property
def is_matryoshka(self) -> bool:
@@ -1806,7 +1817,7 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
return dp_group

@staticmethod
def has_unfinished_dp(dp_group: "ProcessGroup",
def has_unfinished_dp(dp_group: ProcessGroup,
has_unfinished: bool) -> bool:
tensor = torch.tensor([has_unfinished],
dtype=torch.int32,
@@ -2233,9 +2244,9 @@ class DeviceConfig:

device: Union[Device, torch.device] = "auto"
"""Device type for vLLM execution.
This parameter is deprecated and will be
removed in a future release.
It will now be set automatically based
This parameter is deprecated and will be
removed in a future release.
It will now be set automatically based
on the current platform."""
device_type: str = field(init=False)
"""Device type from the current platform. This is set in
@@ -2327,7 +2338,7 @@ class SpeculativeConfig:
according to the log probability settings in SamplingParams."""

# Draft model configuration
quantization: Optional[QuantizationMethods] = None
quantization: Optional[me_quant.QuantizationMethods] = None
"""Quantization method that was used to quantize the draft model weights.
If `None`, we assume the model weights are not quantized. Note that it only
takes effect when using the draft model-based speculative method."""
@@ -2418,7 +2429,8 @@ def from_dict(cls, dict_value: dict) -> "SpeculativeConfig":
return cls(**dict_value)

@staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
def hf_config_override(
hf_config: "PretrainedConfig") -> "PretrainedConfig":
if hf_config.model_type == "deepseek_v3":
hf_config.model_type = "deepseek_mtp"
if hf_config.model_type == "deepseek_mtp":
@@ -2661,7 +2673,7 @@ def _maybe_override_draft_max_model_len(
def _verify_and_get_draft_tp(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: Optional[int],
draft_hf_config: PretrainedConfig) -> int:
draft_hf_config: "PretrainedConfig") -> int:
"""
Verifies and adjusts the tensor parallel size for a draft model
specified using speculative_draft_tensor_parallel_size.
@@ -3068,7 +3080,7 @@ def compute_hash(self) -> str:


def _get_and_verify_dtype(
config: PretrainedConfig,
config: "PretrainedConfig",
dtype: Union[str, torch.dtype],
) -> torch.dtype:
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
@@ -3159,14 +3171,16 @@ def _get_and_verify_dtype(


def _get_and_verify_max_len(
hf_config: PretrainedConfig,
hf_config: "PretrainedConfig",
max_model_len: Optional[int],
disable_sliding_window: bool,
sliding_window_len: Optional[Union[int, list[Optional[int]]]],
spec_target_max_model_len: Optional[int] = None,
encoder_config: Optional[Any] = None,
) -> int:
"""Get and verify the model's maximum length."""
from vllm.platforms import current_platform

derived_max_model_len = float("inf")
possible_keys = [
# OPT
@@ -3498,6 +3512,8 @@ def compute_hash(self) -> str:
return hash_str

def __post_init__(self):
from vllm.tracing import is_otel_available, otel_import_error_traceback

if (self.collect_detailed_traces is not None
and len(self.collect_detailed_traces) == 1
and "," in self.collect_detailed_traces[0]):
@@ -4259,7 +4275,7 @@ def get_quantization_config(

def with_hf_config(
self,
hf_config: PretrainedConfig,
hf_config: "PretrainedConfig",
architectures: Optional[list[str]] = None,
) -> "VllmConfig":
if architectures is not None:
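
Beyond the module-level changes, several call sites in this diff (`_verify_cuda_graph`, `_get_and_verify_max_len`, `__post_init__`) pull imports such as `vllm.platforms.current_platform` and the tracing helpers into the function body, so their cost is only paid when those code paths actually run. One rough way to gauge the effect of this kind of change is to time a cold import; the helper below is a hedged sketch (the module names passed to it are placeholders, not part of this PR), and CPython's built-in `python -X importtime` gives a finer per-module breakdown.

```python
# Rough sketch for measuring cold-import cost; module names are placeholders.
import importlib
import sys
import time


def time_cold_import(module_name: str) -> float:
    """Return seconds spent importing `module_name` with no cached copy."""
    sys.modules.pop(module_name, None)  # drop any cached module first
    start = time.perf_counter()
    importlib.import_module(module_name)
    return time.perf_counter() - start


if __name__ == "__main__":
    # For a real comparison one might import the config module before and
    # after this PR, e.g. `python -X importtime -c "import vllm.config"`.
    print(f"json: {time_cold_import('json') * 1e3:.2f} ms")
```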