From b75e314fff29bdc94b2fb1dd78519e92f9520e65 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Fri, 26 Jul 2024 00:42:49 +0800 Subject: [PATCH 01/45] [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787) Co-authored-by: hezhihui Co-authored-by: Cyrus Leung --- examples/minicpmv_example.py | 2 ++ vllm/entrypoints/chat_utils.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 52366a7030ad0..bf20a7ea04ad4 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -4,6 +4,8 @@ from vllm.assets.image import ImageAsset # 2.0 +# The official repo doesn't work yet, so we need to use a fork for now +# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # MODEL_NAME = "HwwwH/MiniCPM-V-2" # 2.5 MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index dca4523d1a27d..1f6d77b828459 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -100,7 +100,9 @@ def _image_token_str(model_config: ModelConfig, if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer return "<|image_1|>" - if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", "paligemma"): + if model_type == "minicpmv": + return "(./)" + if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): From 889da130e747b1382268ed428352f2e73e51a30b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:46:04 -0700 Subject: [PATCH 02/45] [ Misc ] `fp8-marlin` channelwise via `compressed-tensors` (#6524) Co-authored-by: mgoin --- .../configs/Qwen2-1.5B-Instruct-FP8W8.yaml | 11 ++ .../lm-eval-harness/configs/models-small.txt | 1 + .../compressed_tensors/compressed_tensors.py | 61 ++++++++-- .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_scheme.py | 3 +- .../schemes/compressed_tensors_unquantized.py | 3 +- .../schemes/compressed_tensors_w4a16_24.py | 3 +- .../schemes/compressed_tensors_w8a16_fp8.py | 105 ++++++++++++++++++ .../schemes/compressed_tensors_w8a8_fp8.py | 10 +- .../schemes/compressed_tensors_w8a8_int8.py | 19 ++-- .../schemes/compressed_tensors_wNa16.py | 3 +- .../model_executor/layers/quantization/fp8.py | 33 ++++-- .../quantization/utils/marlin_utils_fp8.py | 14 +-- 13 files changed, 219 insertions(+), 49 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml new file mode 100644 index 0000000000000..42936fbfbe7d4 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.578 + - name: "exact_match,flexible-extract" + value: 0.585 +limit: 1000 +num_fewshot: 5 diff --git 
a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 1d1b0ed38671d..109692395acf6 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -5,3 +5,4 @@ Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c4d0c9cb981da..39d00bd5733ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -10,7 +10,8 @@ W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsUnquantized, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsWNA16) + CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -100,14 +101,18 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": def get_config_filenames(cls) -> List[str]: return [] - def _check_scheme_supported(self, min_capability: int): + def _check_scheme_supported(self, + min_capability: int, + error: bool = True) -> bool: capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - if capability < min_capability: + supported = capability >= min_capability + if error and not supported: raise RuntimeError( "Quantization scheme is not supported for ", f"the current GPU. Min capability: {min_capability}. ", f"Current capability: {capability}.") + return supported def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -170,6 +175,29 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, # All conditions satisfied. return True + def _is_fp8_w8a16(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + # Confirm weights quantized. + if weight_quant is None: + return False + + # Confirm we have floating points. + if weight_quant.type != QuantizationType.FLOAT: + return False + + # Confirm weight scheme is supported. + is_symmetric_weight = weight_quant.symmetric + is_static_weight = not weight_quant.dynamic + is_per_tensor_or_channel_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + ]) + if not (is_symmetric_weight and is_static_weight + and is_per_tensor_or_channel_weight): + return False + + # All conditions satisfied. + return True + def _is_wNa16_group_channel(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: input_quant_none = input_quant is None @@ -204,9 +232,23 @@ def _get_scheme_from_parts( # Detect If Activation Quantization. 
if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Fp8( + is_fp8_w8a8_supported = self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + if is_fp8_w8a8_supported: + return CompressedTensorsW8A8Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(not input_quant.dynamic)) + else: + return CompressedTensorsW8A16Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) + + if self._is_fp8_w8a16(weight_quant, input_quant): + return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(not input_quant.dynamic)) + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( @@ -257,11 +299,10 @@ def get_scheme( targets=self.target_scheme_map.keys()) # Find the quant_scheme - scheme = self.target_scheme_map[matched_target] - - return self._get_scheme_from_parts( - weight_quant=scheme["weights"], - input_quant=scheme["input_activations"]) + scheme_dict = self.target_scheme_map[matched_target] + scheme = self._get_scheme_from_parts( + weight_quant=scheme_dict["weights"], + input_quant=scheme_dict["input_activations"]) # Raise error if device does not support the scheme # (e.g. fp8 needs ada lovelace) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dd94c49827f62..ca9e286ce5b2d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -4,6 +4,7 @@ CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 +from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, CompressedTensorsWNA16) @@ -11,6 +12,7 @@ "CompressedTensorsScheme", "CompressedTensorsUnquantized", "CompressedTensorsWNA16", + "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index d5f37b47bb87e..b4bab33e1fb1d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -12,8 +12,9 @@ class CompressedTensorsScheme(ABC): of different quantization schemes supported by CompressedTensors. """ + @classmethod @abstractmethod - def get_min_capability(self) -> int: + def get_min_capability(cls) -> int: """ Get minimum device capability. 
""" diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 6203f02d25e90..b7ba29ddc9840 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -18,7 +18,8 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme): in a linear transformation. """ - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # volta and up return 70 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index eec523d00372c..b8ffb22d7a89d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -29,7 +29,8 @@ def __init__(self, raise ValueError( "group_size must be given when using strategy group") - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere + up return 80 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py new file mode 100644 index 0000000000000..eeb7c042e1d1f --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Optional + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, create_per_channel_scale_param, + create_per_tensor_scale_param) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A16Fp8"] + +SUPPORTED_STRATEGIES = [ + QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR +] + + +class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): + + def __init__(self, strategy: str, is_static_input_scheme: bool): + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + + @classmethod + def get_min_capability(cls) -> int: + # ampere and up + return 80 + + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. + # So if we have a fused module (QKV, MLP) with per tensor scales, + # we expand each scale to its shard's channels. 
+ def process_weights_after_loading(self, layer) -> None: + if self.strategy == QuantizationStrategy.TENSOR: + ws_channelwise = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + layer.weight_scale = torch.nn.Parameter(ws_channelwise, + requires_grad=False) + + # Weights must be transposed for marlin + layer.weight = torch.nn.Parameter(layer.weight.t(), + requires_grad=False) + + prepare_fp8_layer_for_marlin(layer, strategy="channel") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + + # WEIGHT + weight = torch.nn.Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + }) + + # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) + elif self.strategy == QuantizationStrategy.TENSOR: + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + else: + raise ValueError( + f"Unsupported weight strategy={self.strategy}, " + f"supported strategies are {SUPPORTED_STRATEGIES}") + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE (to deal with converted checkpoints) + if self.is_static_input_scheme: + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_marlin_linear(input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 51156a3bc07af..cc9d71db140c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -23,7 +23,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.is_static_input_scheme = is_static_input_scheme self.cutlass_fp8_supported = cutlass_fp8_supported() - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # lovelace and up return 89 @@ -77,19 +78,20 @@ def create_weights(self, layer: torch.nn.Module, }) # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: weight_scale = create_per_channel_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR weight_scale = 
create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: input_scale = create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index e81496c89ac7f..3a80863d3abbe 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -19,7 +19,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # turing and up return 75 @@ -68,19 +69,19 @@ def create_weights(self, layer: torch.nn.Module, # WEIGHT SCALE layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - scale = create_per_channel_scale_param(output_partition_sizes, - **layer_kwargs) + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("weight_scale", scale) + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("input_scale", scale) + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index e4cf0c0b5d95b..996cba315c556 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -42,7 +42,8 @@ def __init__(self, group_size=self.group_size, is_sym=True) - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere and up return 80 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3a4f2a49a3497..6649b317ca838 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -18,8 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - all_close_1d, apply_fp8_linear, create_per_tensor_scale_param, - cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale) + all_close_1d, 
apply_fp8_linear, convert_to_channelwise, + create_per_tensor_scale_param, cutlass_fp8_supported, + per_tensor_dequantize, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import print_warning_once @@ -179,19 +180,29 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.input_scale = None - # If checkpoint is fp8, requantize the separately quantized logical - # weights into a single fp8 weight with a single weight scale. + # If checkpoint is fp8, handle that there are N scales for N + # shards in a fused module else: - # Dequant -> Quant with max scale. - max_w_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths, - ) + # If using marlin (w8a16), kernel uses channelwise weights, + # so extend the weight scales to be channelwise. + if self.use_marlin: + weight = layer.weight + weight_scale = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + else: + # Dequant -> Quant with max scale so we can run per tensor. + weight_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) # Update layer with new values. layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) if self.quant_config.activation_scheme == "static": layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index c878939580f10..5f9d8658a342f 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -46,7 +46,8 @@ def apply_fp8_marlin_linear( return output.reshape(out_shape) -def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: +def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, + strategy: str = "tensor") -> None: print_warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. 
Weight-only FP8 compression will " @@ -74,16 +75,7 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) # WEIGHT SCALES - # Currently Marlin doesn't support per-tensor scales, so we - # expand it to channelwise - is_channelwise = (len(layer.weight_scale.shape) > 0 - and layer.weight_scale.shape[0] == part_size_n) - if is_channelwise: - scales = layer.weight_scale - else: - scales = layer.weight_scale.repeat(1, part_size_n) - scales = scales.to(layer.orig_dtype).to(device) - + scales = layer.weight_scale.to(layer.orig_dtype) # Permute scales marlin_scales = marlin_permute_scales(s=scales, size_k=part_size_k, From 65b1f121c885f169da210946eddb0d52524677f1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 25 Jul 2024 12:46:15 -0400 Subject: [PATCH 03/45] [Bugfix] Fix `kv_cache_dtype=fp8` without scales for FP8 checkpoints (#6761) --- tests/quantization/test_fp8.py | 12 ++++++++++-- vllm/model_executor/layers/quantization/kv_cache.py | 6 ++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 0602fedf0b8e3..ad92f1f189f65 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") -def test_load_fp16_model(vllm_runner) -> None: - with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) +def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: + with vllm_runner("facebook/opt-125m", + quantization="fp8", + kv_cache_dtype=kv_cache_dtype) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index c1495711447fa..d79536d196b92 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -46,10 +46,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: elif layer.k_scale < 0.0 and layer.v_scale < 0.0: # If no scales were loaded (both scales are invalid negative # values), use the default value of 1.0 - k_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) - v_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) + k_scale = 1.0 + v_scale = 1.0 else: # If we find a single kv_scale in the checkpoint, we remap # kv_scale to k_scale during weight loading, and duplicate From 95db75de64bec34f4d80acff92c62d1cdfa94688 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 25 Jul 2024 13:40:01 -0400 Subject: [PATCH 04/45] [Bugfix] Add synchronize to prevent possible data race (#6788) Co-authored-by: Lucas Wilkinson --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py 
index 128096c88a8b1..e9c6fc3a255e4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -243,6 +243,13 @@ def graph_capture( ca_comm = self.ca_comm maybe_ca_context = nullcontext( ) if ca_comm is None else ca_comm.capture() + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.cuda.current_stream() + if curr_stream != stream: + stream.wait_stream(curr_stream) + with torch.cuda.stream(stream), maybe_ca_context: # In graph mode, we have to be very careful about the collective # operations. The current status is: From 6a1e25b1514a25d3da96d0d78c4568f6e581e242 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 25 Jul 2024 11:57:16 -0700 Subject: [PATCH 05/45] [Doc] Add documentations for nightly benchmarks (#6412) --- .buildkite/nightly-benchmarks/README.md | 80 +++++++++++++++---- README.md | 2 +- docs/source/index.rst | 6 ++ .../performance_benchmark/benchmarks.rst | 23 ++++++ 4 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 docs/source/performance_benchmark/benchmarks.rst diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index c84e150934306..c1aebaf5b3bbe 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -3,30 +3,51 @@ ## Introduction -This directory contains the performance benchmarking CI for vllm. -The goal is to help developers know the impact of their PRs on the performance of vllm. +This directory contains two sets of benchmark for vllm. +- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance +- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -This benchmark will be *triggered* upon: + +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. + + +## Performance benchmark quick overview + +**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. + + +## Nightly benchmark quick overview + +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. + +**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. + +**Benchmarking Duration**: about 3.5hrs. + + + +## Trigger the benchmark + +Performance benchmark will be triggered when: - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label. -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is comming later), with different models. +Nightly benchmark will be triggered when: +- Every commit for those PRs with `nightly-benchmarks` label. -**Benchmarking Duration**: about 1hr. -**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. 
-## Configuring the workload +## Performance benchmark details -The benchmarking workload contains three parts: -- Latency tests in `latency-tests.json`. -- Throughput tests in `throughput-tests.json`. -- Serving tests in `serving-tests.json`. +See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. -See [descriptions.md](tests/descriptions.md) for detailed descriptions. -### Latency test +#### Latency test Here is an example of one test inside `latency-tests.json`: @@ -54,12 +75,12 @@ Note that the performance numbers are highly sensitive to the value of the param WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. -### Throughput test +#### Throughput test The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. -### Serving test +#### Serving test We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: ``` @@ -96,9 +117,36 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. -## Visualizing the results +#### Visualizing the results The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. + + + +## Nightly test details + +See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. + + +#### Workflow + +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. +- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. +- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. + +#### Nightly tests + +In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. 
+ +#### Docker containers + +The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. + +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. + +WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). + diff --git a/README.md b/README.md index 8e508195cdceb..a9215f4c7e1c5 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ vLLM is fast with: - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels -**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). vLLM is flexible and easy to use with: diff --git a/docs/source/index.rst b/docs/source/index.rst index ded9a424ee68c..8f06f2f2e5469 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -117,6 +117,12 @@ Documentation automatic_prefix_caching/apc automatic_prefix_caching/details +.. toctree:: + :maxdepth: 1 + :caption: Performance benchmarks + + performance_benchmark/benchmarks + .. toctree:: :maxdepth: 2 :caption: Developer Documentation diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst new file mode 100644 index 0000000000000..9a23aab10d03d --- /dev/null +++ b/docs/source/performance_benchmark/benchmarks.rst @@ -0,0 +1,23 @@ +.. _benchmarks: + +Benchmark suites of vLLM +======================== + + + +vLLM contains two sets of benchmarks: + ++ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. + ++ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. + + +Trigger a benchmark +------------------- + +The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. + + +.. note:: + + Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. 
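The JSON test specifications documented in PATCH 05 are essentially serialized command lines: each entry carries a set of parameters that the CI scripts feed forward as flags to `benchmark_latency.py`, `benchmark_throughput.py`, or `benchmark_serving.py`. The sketch below illustrates that mechanism only; it is not the actual CI runner, and the entry shape (`test_name` plus a `parameters` dict) and helper names are assumptions made for illustration.

```python
# Hedged sketch: expand a JSON benchmark spec into command-line invocations.
# The entry layout ("test_name" + "parameters") and the helper names here are
# assumed for illustration; the real CI scripts may differ.
import json
import shlex
from typing import Dict, List


def params_to_cli_args(parameters: Dict[str, object]) -> List[str]:
    """Turn {"num_iters": 15, "enforce_eager": True} into
    ["--num-iters", "15", "--enforce-eager"]."""
    args: List[str] = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        if isinstance(value, bool):
            if value:  # booleans become bare flags when true
                args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args


def build_commands(spec_path: str, script: str) -> List[str]:
    with open(spec_path) as f:
        test_cases = json.load(f)
    return [
        shlex.join(["python3", script] +
                   params_to_cli_args(case["parameters"]))
        for case in test_cases
    ]


if __name__ == "__main__":
    for cmd in build_commands("latency-tests.json", "benchmark_latency.py"):
        print(cmd)
```

This also shows why the docs above warn against putting `--output-json` or `--save-results` into the spec files: any key placed in `parameters` ends up on the command line, and the benchmarking scripts already save JSON results on their own.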
From cd7edc4e8726d4b87e121f9ec671ecb6dd0c45d6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 25 Jul 2024 18:05:09 -0400 Subject: [PATCH 06/45] [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) --- .../schemes/compressed_tensors_wNa16.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 996cba315c556..a41962ccd66d8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -55,7 +55,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. - group_size = input_size if self.group_size == -1 else self.group_size + channelwise = (self.group_size == -1) + group_size = input_size if channelwise else self.group_size + row_parallel = (input_size != input_size_per_partition) + # In the case of channelwise quantization, we need to replicate the + # scales across all gpus. + partition_scales = (row_parallel and not channelwise) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition, @@ -66,8 +71,8 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, weight_scale_dim = None scales_and_zp_size = input_size // group_size - if (input_size != input_size_per_partition - and self.group_size is not None): + if partition_scales: + assert input_size_per_partition % group_size == 0 weight_scale_dim = 1 scales_and_zp_size = input_size_per_partition // group_size From f3ff63c3f45974986f13f60647a258b09913c420 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 15:38:32 -0700 Subject: [PATCH 07/45] [doc][distributed] improve multinode serving doc (#6804) --- docs/source/serving/distributed_serving.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 4fc36a680084c..5f14fd2b0ee0a 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -79,7 +79,7 @@ On the rest of the worker nodes, run the following command: $ --worker \ $ /path/to/the/huggingface/home/in/this/node -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. +Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. 
@@ -101,7 +101,7 @@ You can also use tensor parallel without pipeline parallel, just set the tensor To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. .. warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. .. warning:: From b7215de2c5fcdf8af96cf941556d63934ea8f353 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 25 Jul 2024 16:47:55 -0700 Subject: [PATCH 08/45] [Docs] Publish 5th meetup slides (#6799) --- README.md | 10 +--------- docs/source/community/meetups.rst | 1 + 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a9215f4c7e1c5..5f23f0813f606 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)** - -We are excited to announce our fifth vLLM Meetup! -Join us to hear the vLLM's recent updates and the upcoming roadmap. -Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM. -Register now [here](https://lu.ma/lp0gyjqr) and be part of the event! - ---- - *Latest News* 🔥 +- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! 
Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index 0fde31ef9b059..3b01b109ebf2c 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ - `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ From 1adddb14bf0e1a603581bca49e8d29e8bfb337dc Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Thu, 25 Jul 2024 16:53:25 -0700 Subject: [PATCH 09/45] [Core] Fix ray forward_dag error mssg (#6792) --- vllm/executor/ray_gpu_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e4aaeaa24c1bc..564fa79acfd40 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -29,6 +29,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: + self.forward_dag: Optional["ray.dag.CompiledDAG"] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. @@ -60,8 +61,6 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None - def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: # If nsight profiling is enabled, we need to set the profiling From 443c7cf4cf891e6957d4b31655e58cabceb5a2a7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 17:44:09 -0700 Subject: [PATCH 10/45] [ci][distributed] fix flaky tests (#6806) --- tests/distributed/test_pipeline_parallel.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index d666b8a1d44bd..5ff39ddfbf996 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,3 +1,10 @@ +""" +WARNING: This test runs in both single-node (4 GPUs) and multi-node + (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is + important to set the distributed backend to "mp" to avoid Ray scheduling + all workers in a node other than the head node, which can cause the test + to fail. +""" import os import pytest @@ -78,7 +85,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): "--pipeline-parallel-size", str(PP_SIZE), "--distributed-executor-backend", - "ray", + "mp", ] os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND From 2eb9f4ff262bb39859baebf8d2109abcdadee860 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Thu, 25 Jul 2024 18:08:33 -0700 Subject: [PATCH 11/45] [ci] Mark tensorizer as soft fail and separate from grouped test (#6810) [ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e7dd1fdb2e660..633bc5ca95bf9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -17,11 +17,10 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker -- label: Tensorizer, Metrics, Tracing Test +- label: Metrics, Tracing Test fast_check: true fast_check_only: true commands: - - apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer - pytest -v -s metrics # Metrics - "pip install \ opentelemetry-sdk \ @@ -221,6 +220,8 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] + soft_fail: true + fast_check: true commands: - apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn From 062a1d0fab111723ab768f94bdd48a6adc054007 Mon Sep 17 00:00:00 2001 From: QQSong Date: Thu, 25 Jul 2024 19:24:58 -0700 Subject: [PATCH 12/45] Fix ReplicatedLinear weight loading (#6793) --- vllm/model_executor/layers/linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0e0a2b72f93d4..b6e280ae65049 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,12 +199,16 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, + weight_loader=self.weight_loader, prefix=prefix) if bias: self.bias = Parameter( torch.empty(self.output_size, dtype=self.params_dtype)) - set_weight_attrs(self.bias, {"output_dim": 0}) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) From 084a01fd3544557990f8af8af6fd3c1185bae848 Mon Sep 17 00:00:00 2001 From: Anthony Platanios Date: Fri, 26 Jul 2024 00:25:35 -0400 Subject: [PATCH 13/45] [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. 
(#6770) --- vllm/executor/multiproc_gpu_executor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 9811fc2a55199..19f7a497cdd9f 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -1,6 +1,7 @@ import asyncio import os import signal +import threading import weakref from functools import partial from typing import Any, List, Optional @@ -115,8 +116,9 @@ def shutdown(signum, frame): if executor := ref(): executor.shutdown() - signal.signal(signal.SIGINT, shutdown) - signal.signal(signal.SIGTERM, shutdown) + if threading.current_thread() is threading.main_thread(): + signal.signal(signal.SIGINT, shutdown) + signal.signal(signal.SIGTERM, shutdown) self.driver_worker = self._create_worker( distributed_init_method=distributed_init_method) From 89a84b0bb7b30706a02836234a94493ea8f780bf Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 26 Jul 2024 12:31:31 +0800 Subject: [PATCH 14/45] [Core] Use array to speedup padding (#6779) --- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/sampling_metadata.py | 21 ++++++++++++--------- vllm/sequence.py | 23 ++++++++++++++++------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c376797a054f..121458f8156a1 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -220,7 +220,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids) < min_tokens: + if len(seq_data.output_token_ids_array) < min_tokens: seqs_to_penalize.append(j) if seqs_to_penalize: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 390b5d173ebcd..27b37a9d53470 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,4 +1,5 @@ import random +from array import array from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -329,8 +330,8 @@ def from_sampling_metadata( user-defined seed for each sequence. extra_entropy: extra entropy to use when generating seeds. 
""" - prompt_tokens: List[List[int]] = [] - output_tokens: List[List[int]] = [] + prompt_tokens: List[array] = [] + output_tokens: List[array] = [] top_ks: List[int] = [] temperatures: List[float] = [] top_ps: List[float] = [] @@ -432,13 +433,15 @@ def from_sampling_metadata( if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) + prompt_tokens.extend( + array('l') for _ in range(prefill_len)) + output_tokens.extend( + array('l') for _ in range(prefill_len)) if seq_group.do_sample: for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(list(seq_data.prompt_token_ids)) - output_tokens.append(list(seq_data.output_token_ids)) + prompt_tokens.append(seq_data.prompt_token_ids_array) + output_tokens.append(seq_data.output_token_ids_array) sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, @@ -454,9 +457,9 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], frequency_penalties: List[float], repetition_penalties: List[float], sampling_seeds: List[int], sample_indices: List[int], - prompt_tokens: List[List[int]], - output_tokens: List[List[int]], vocab_size: int, - extra_seeds_to_generate: int, device: torch.device, + prompt_tokens: List[array], output_tokens: List[array], + vocab_size: int, extra_seeds_to_generate: int, + device: torch.device, dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. diff --git a/vllm/sequence.py b/vllm/sequence.py index 0cd4c7e71d78d..72821ecea0f47 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -3,6 +3,7 @@ import enum import math from abc import ABC, abstractmethod +from array import array from collections import defaultdict from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, @@ -119,10 +120,10 @@ def __init__( prompt_token_ids: List[int], output_token_ids: Optional[List[int]] = None, ) -> None: - self._prompt_token_ids: List[int] = list(prompt_token_ids) + self._prompt_token_ids = array('l', prompt_token_ids) self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) - self._output_token_ids: List[int] = ( - list(output_token_ids) if output_token_ids is not None else []) + self._output_token_ids = array( + 'l', output_token_ids if output_token_ids is not None else []) self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). 
@@ -132,8 +133,8 @@ def __init__( self._update_cached_all_tokens() def _update_cached_all_tokens(self): - self._cached_all_token_ids: List[int] = (self._prompt_token_ids + - self._output_token_ids) + self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._output_token_ids) @property def prompt_token_ids(self) -> Tuple[int, ...]: @@ -141,19 +142,27 @@ def prompt_token_ids(self) -> Tuple[int, ...]: @prompt_token_ids.setter def prompt_token_ids(self, new_prompt_token_ids) -> None: - self._prompt_token_ids = list(new_prompt_token_ids) + self._prompt_token_ids = array('l', new_prompt_token_ids) self._prompt_token_ids_tuple = tuple(new_prompt_token_ids) self._update_cached_all_tokens() + @property + def prompt_token_ids_array(self) -> array: + return self._prompt_token_ids + @property def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter def output_token_ids(self, new_output_token_ids) -> None: - self._output_token_ids = list(new_output_token_ids) + self._output_token_ids = array('l', new_output_token_ids) self._update_cached_all_tokens() + @property + def output_token_ids_array(self) -> array: + return self._output_token_ids + def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) self._cached_all_token_ids.append(token_id) From 85ad7e2d012edd87de9e84e93ed3204c80599695 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 21:48:05 -0700 Subject: [PATCH 15/45] [doc][debugging] add known issues for hangs (#6816) --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 2aa52e79888a3..d7066f2325b3a 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -65,6 +65,10 @@ Here are some common issues that can cause hangs: If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +Some known issues: + +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . + .. warning:: After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. 
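PATCH 14 above switches the storage of prompt and output token ids from Python lists to `array('l')`, so the sampler's padding step concatenates flat C buffers of machine integers instead of lists of boxed Python `int` objects. A small standalone sketch of that difference follows; it is illustrative only (not vLLM code, and the timings are not a vLLM benchmark).

```python
# Standalone illustration of why array('l') is cheaper than a list for
# token ids: a flat C buffer is smaller, and concatenating two arrays is a
# raw buffer copy, while list concatenation also touches every element's
# refcount. Numbers are illustrative only.
import sys
import timeit
from array import array

ids = list(range(4096))             # stand-in for one sequence's token ids
as_list = list(ids)
as_array = array('l', ids)

# Memory: the array stores raw C longs; the list stores pointers to int objects.
list_bytes = sys.getsizeof(as_list) + sum(sys.getsizeof(i) for i in as_list)
print(f"list : {list_bytes} bytes, array: {sys.getsizeof(as_array)} bytes")

# Padding: append a block of pad tokens to the sequence.
pad_list = [0] * 512
pad_array = array('l', pad_list)
print("list  pad:", timeit.timeit(lambda: as_list + pad_list, number=20_000))
print("array pad:", timeit.timeit(lambda: as_array + pad_array, number=20_000))
```

The same reasoning is behind the new `prompt_token_ids_array` and `output_token_ids_array` properties in that patch: callers that only hand the ids to tensor construction can take the underlying buffer directly and skip materializing tuples or lists.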
From 07278c37ddd898d842bbddc382e4f67ac08dae35 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 14:33:42 -0400 Subject: [PATCH 16/45] [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) --- .../configs/Minitron-4B-Base.yaml | 11 + .../lm-eval-harness/configs/models-small.txt | 1 + vllm/model_executor/layers/activation.py | 16 + .../model_executor/layers/rotary_embedding.py | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/nemotron.py | 531 ++++++++++++++++++ vllm/transformers_utils/config.py | 3 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nemotron.py | 209 +++++++ 9 files changed, 776 insertions(+), 1 deletion(-) create mode 100644 .buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml create mode 100644 vllm/model_executor/models/nemotron.py create mode 100644 vllm/transformers_utils/configs/nemotron.py diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml new file mode 100644 index 0000000000000..a0466748ea71e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1 +model_name: "nvidia/Minitron-4B-Base" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.252 + - name: "exact_match,flexible-extract" + value: 0.252 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 109692395acf6..e4df4b547aa5e 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,5 +4,6 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +Minitron-4B-Base.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5bfdba67b443d..6578193a31597 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -159,6 +159,21 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: +class ReLUSquaredActivation(CustomOp): + """ + Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 + """ + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + relu_applied = nn.functional.relu(x) + squared = torch.square(relu_applied) + return squared + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
@@ -207,6 +222,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "relu2": ReLUSquaredActivation(), "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 60ba4623edc38..aecba0ae74911 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -774,6 +774,7 @@ def get_rope( is_neox_style: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, + rotary_percent: float = 1.0, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() @@ -786,6 +787,8 @@ def get_rope( rope_scaling_args = tuple(rope_scaling_tuple.items()) else: rope_scaling_args = None + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) key = (head_size, rotary_dim, max_position, base, is_neox_style, rope_scaling_args, dtype) if key in _ROPE_DICT: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 7df5b8fa64710..ead64c0e92553 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -51,6 +51,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPMV": ("minicpmv", "MiniCPMV"), + "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py new file mode 100644 index 0000000000000..bb85f20ab9802 --- /dev/null +++ b/vllm/model_executor/models/nemotron.py @@ -0,0 +1,531 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Nemotron model compatible with HuggingFace weights.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.transformers_utils.configs import NemotronConfig + +from .interfaces import SupportsLoRA +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + +# The architecture is pretty similar to Llama, with these changes: +# - There is no gate_proj, just up_proj +# - Normal LayerNorm (with a +1 to the weights) instead of RMSNorm +# - Squared ReLU instead of SwiGLU +# - Adds a rotary_percent to RoPE + + +def _cast_if_autocast_enabled(*args): + if not torch.is_autocast_enabled(): + return args + else: + return torch.cuda.amp.autocast_mode._cast( + args, torch.get_autocast_gpu_dtype()) + + +class NemotronLayerNorm1P(nn.LayerNorm): + + def __init__(self, + normalized_shape: Union[int, List[int], torch.Size], + eps: float = 1e-5, + elementwise_affine: bool = True, + bias: bool = True, + device=None, + dtype=None): + super().__init__(normalized_shape, eps, elementwise_affine, bias, + device, dtype) + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if residual is not None: + x = x + residual + residual = x + args = _cast_if_autocast_enabled(x, self.normalized_shape, + self.weight + 1, self.bias, self.eps) + with torch.cuda.amp.autocast(enabled=False): + x = torch.nn.functional.layer_norm(*args) + return x if residual is None else (x, residual) + + +class NemotronMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.up_proj = ColumnParallelLinear(input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = get_act_fn(hidden_act) + + def forward(self, x): + up, _ = self.up_proj(x) + x = self.act_fn(up) + x, _ = self.down_proj(x) + return x + + +class NemotronAttention(nn.Module): + + def __init__( + self, + config: NemotronConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 
8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", + self.hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.rotary_percent = config.rope_percent + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + rotary_percent=self.rotary_percent, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class NemotronDecoderLayer(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + self.self_attn = NemotronAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + 
num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = NemotronMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + self.post_attention_layernorm = NemotronLayerNorm1P( + config.hidden_size, eps=config.norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class NemotronModel(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: NemotronDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers") + if get_pp_group().is_last_rank: + self.norm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + else: + self.norm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + 
hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class NemotronForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "up_proj", "down_proj", "embed_tokens", "lm_head" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + } + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + assert isinstance(config, NemotronConfig) + + self.config = config + self.lora_config = lora_config + + self.model = NemotronModel(config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model") + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + return model_output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", 
".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 652505a892142..3ba2e01985598 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -8,7 +8,7 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MedusaConfig, MLPSpeculatorConfig, MPTConfig, - RWConfig) + NemotronConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +26,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "nemotron": NemotronConfig, } for name, cls in _CONFIG_REGISTRY.items(): diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 51de11ca3e42a..1750950b3c38b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -8,6 +8,7 @@ from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.nemotron import NemotronConfig __all__ = [ "ChatGLMConfig", @@ -17,4 +18,5 @@ "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", + "NemotronConfig", ] diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py new file mode 100644 index 0000000000000..a22a9f475dda9 --- /dev/null +++ b/vllm/transformers_utils/configs/nemotron.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Nemotron model configuration""" + +from transformers import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class NemotronConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`NemotronModel`]. It is used to instantiate an Nemotron model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Nemotron-8B. + + Configuration objects inherit from [`PretrainedConfig`] and can be + used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Nemotron model. Defines the number of + different tokens that can be represented by the + `inputs_ids` passed when calling [`NemotronModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the + Transformer decoder. + head_dim (`int`, *optional*, defaults to None): + Projection weights dimension in multi-head attention. Set to + hidden_size // num_attention_heads if None + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use + Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention + (MQA) otherwise GQA is used. When converting a multi-head + checkpoint to a GQA checkpoint, each group key and value + head should be constructed by meanpooling all the original + heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it + is not specified, will default to `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the + decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used + with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE + embeddings. Currently supports two scaling strategies: linear + and dynamic. 
Their scaling factor must be a float greater than 1. + The expected format is `{"type": strategy name, + "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output + projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj and down_proj layers in the MLP + layers. + + ```python + >>> from transformers import NemotronModel, NemotronConfig + + >>> # Initializing a Nemotron nemotron-15b style configuration + >>> configuration = NemotronConfig() + + >>> # Initializing a model from the nemotron-15b style configuration + >>> model = NemotronModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "nemotron" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=6144, + intermediate_size=24576, + num_hidden_layers=32, + num_attention_heads=48, + head_dim=None, + num_key_value_heads=None, + hidden_act="relu2", + max_position_embeddings=4096, + initializer_range=0.0134, + norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=2, + eos_token_id=3, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + rope_percent=0.5, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + head_dim = head_dim or kwargs.get("kv_channels", None) + self.head_dim = head_dim if head_dim is not None else ( + hidden_size // num_attention_heads) + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_eps = norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + rope_percent = rope_percent or kwargs.get("rope_percentage", None) + self.rope_percent = rope_percent + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, + dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, " + f"`type` and `factor`, got {self.rope_scaling}") + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in [ + "linear", "dynamic" + ]: + raise ValueError( + "`rope_scaling`'s type field must be one of ['linear', " + f"'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance( + rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError( + "`rope_scaling`'s factor field must be a float > 1, got " + f"{rope_scaling_factor}") From 50704f52c4643777fb0e5dc99f6c048dd9f54f2d Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 26 Jul 2024 14:41:04 -0400 Subject: [PATCH 17/45] [Bugfix][Kernel] Promote another index to int64_t (#6838) --- csrc/quantization/fp8/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 090f95d1bda71..6dae32b25f9c4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -48,7 +48,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { __shared__ float cache[1024]; - int i = blockDim.x * blockIdx.x + threadIdx.x; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by // the current thread in cache[threadIdx.x] From 71734f1bf263ed4877e928d7d9c4522d12e9c61f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:28:32 -0700 Subject: [PATCH 18/45] [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) --- Dockerfile.rocm | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index ff39791456398..7b4c0166a04bd 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -53,9 +53,9 @@ RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(whic # Install torch == 2.5.0 on ROCm RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ - python3 -m pip uninstall -y torch torchaudio torchvision \ + python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ + torch==2.5.0.dev20240710 \ torchvision==0.20.0.dev20240710 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac @@ -127,13 +127,6 @@ FROM base AS final # Import the vLLM development directory from the build context COPY . . -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
-# Manually remove it so that later steps of numpy upgrade can continue -RUN case "$(which python3)" in \ - *"/opt/conda/envs/py_3.9"*) \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ - *) ;; esac - # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] From aa4867791ecd73a5f55b7bad4d9372954e661fe4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:39:49 -0700 Subject: [PATCH 19/45] [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) --- vllm/executor/ray_utils.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fcbfa30d7a38a..58b864070f727 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -93,32 +93,38 @@ def initialize_ray_cluster( # Placement group is already set. return + device_str = "GPU" if not is_tpu() else "TPU" # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: # We are in a placement group bundles = current_placement_group.bundle_specs # Verify that we can use the placement group. - gpu_bundles = 0 + device_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: + bundle_devices = bundle.get(device_str, 0) + if bundle_devices > 1: raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if parallel_config.world_size > gpu_bundles: + "Placement group bundle cannot have more than 1 " + f"{device_str}.") + if bundle_devices: + device_bundles += 1 + if parallel_config.world_size > device_bundles: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group." + f"Required number of devices: {parallel_config.world_size}. 
" + f"Total number of devices: {device_bundles}.") else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if parallel_config.world_size > num_gpus_in_cluster: + num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + if parallel_config.world_size > num_devices_in_cluster: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group.") # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) + placement_group_specs = ([{ + device_str: 1 + }] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all From 3bbb4936dc5aa7737750410ab4b4647817dcf9a3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 27 Jul 2024 04:50:10 +0800 Subject: [PATCH 20/45] [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) --- .buildkite/run-cpu-test.sh | 28 +- Dockerfile.cpu | 9 +- cmake/cpu_extension.cmake | 4 + csrc/cpu/torch_bindings.cpp | 7 + csrc/cpu/utils.cpp | 65 +++++ .../getting_started/cpu-installation.rst | 55 +++- requirements-cpu.txt | 4 +- vllm/distributed/parallel_state.py | 3 + vllm/engine/async_llm_engine.py | 2 - vllm/envs.py | 8 +- vllm/executor/cpu_executor.py | 264 +++++++++++++++--- vllm/utils.py | 21 -- vllm/worker/cpu_model_runner.py | 7 +- vllm/worker/cpu_worker.py | 17 +- 14 files changed, 404 insertions(+), 90 deletions(-) create mode 100644 csrc/cpu/utils.cpp diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a7678aae54644..21deec2bba973 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -3,26 +3,38 @@ set -ex # Try building the docker image -docker build -t cpu-test -f Dockerfile.cpu . -docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image +# Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 # offline inference -docker exec cpu-test bash -c "python3 examples/offline_inference.py" docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test -docker exec cpu-test bash -c "cd tests; +docker exec cpu-test bash -c " pip install pytest Pillow protobuf - cd ../ pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + +# online inference +docker exec cpu-test bash -c " + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=48-92 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f95d748f1e4be..c473ba431e680 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -2,8 +2,8 @@ FROM ubuntu:22.04 AS cpu-test-1 -RUN apt-get update -y \ - && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ +RUN apt-get update -y \ + && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html @@ -13,8 +13,9 @@ RUN pip install intel-openmp ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" +RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install --upgrade pip \ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy @@ -25,7 +26,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
ARG VLLM_CPU_DISABLE_AVX512 diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 690559ee265e9..118f9b28e0ae3 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -83,6 +83,8 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +list(APPEND LIBS "numa") + # # Define extension targets @@ -95,6 +97,7 @@ set(VLLM_EXT_SRC "csrc/cpu/activation.cpp" "csrc/cpu/attention.cpp" "csrc/cpu/cache.cpp" + "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/pos_encoding.cpp" "csrc/cpu/torch_bindings.cpp") @@ -104,6 +107,7 @@ define_gpu_extension_target( DESTINATION vllm LANGUAGE CXX SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS} USE_SABI 3 WITH_SOABI diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5be0e9810b5b9..7d549e271a30d 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -4,6 +4,8 @@ #include +void init_cpu_threads_env(const std::string& cpu_ids); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -107,4 +109,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { + // CPU utils + utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp new file mode 100644 index 0000000000000..5782580baa861 --- /dev/null +++ b/csrc/cpu/utils.cpp @@ -0,0 +1,65 @@ +#include +#include +#include +#include + +#include "cpu_types.hpp" + +void init_cpu_threads_env(const std::string& cpu_ids) { + bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask->size > 0); + std::vector omp_cpu_ids; + omp_cpu_ids.reserve(omp_cpu_mask->size); + + constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp); + + for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) { + unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size]; + int i = 0; + while (group_mask) { + if (group_mask & 1) { + omp_cpu_ids.emplace_back(offset + i); + } + ++i; + group_mask >>= 1; + } + } + + // Memory node binding + if (numa_available() != -1) { + int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); + bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); + bitmask* src_mask = numa_get_membind(); + + int pid = getpid(); + + // move all existing pages to the specified numa node. + *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); + int page_num = numa_migrate_pages(pid, src_mask, mask); + if (page_num == -1) { + TORCH_CHECK(false, + "numa_migrate_pages failed. errno: " + std::to_string(errno)); + } + + // restrict memory allocation node. 
+ numa_set_membind(mask); + numa_set_strict(1); + } + + // OMP threads binding + omp_set_num_threads((int)omp_cpu_ids.size()); + torch::set_num_threads((int)omp_cpu_ids.size()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); +#pragma omp parallel for schedule(static, 1) + for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { + cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size); + size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size); + CPU_ZERO_S(size, mask); + CPU_SET_S(omp_cpu_ids[i], size, mask); + sched_setaffinity(0, sizeof(cpu_set_t), mask); + CPU_FREE(mask); + } + + numa_free_nodemask(omp_cpu_mask); +} diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 1c97515dbecd9..7fc469e06844f 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -10,6 +10,7 @@ Table of contents: #. :ref:`Requirements ` #. :ref:`Quick start using Dockerfile ` #. :ref:`Build from source ` +#. :ref:`Related runtime environment variables ` #. :ref:`Intel Extension for PyTorch ` #. :ref:`Performance tips ` @@ -47,7 +48,7 @@ Build from source .. code-block:: console $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 + $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - Second, install Python packages for vLLM CPU backend building: @@ -71,6 +72,15 @@ Build from source - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +.. _env_intro: + +Related runtime environment variables +------------------------------------- + +- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + .. _ipex_guidance: Intel Extension for PyTorch @@ -78,15 +88,11 @@ Intel Extension for PyTorch - `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. -- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed. - .. _cpu_backend_performance_tips: Performance tips ----------------- -- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: .. 
code-block:: console @@ -96,11 +102,44 @@ Performance tips $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD $ python examples/offline_inference.py # run vLLM -- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription. +- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: -- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading. +.. code-block:: console + + $ export VLLM_CPU_KVCACHE_SPACE=40 + $ export VLLM_CPU_OMP_THREADS_BIND=0-29 + $ vllm serve facebook/opt-125m + +- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: + +.. code-block:: console -- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores and memory nodes, to avoid the remote memory node access. ``numactl`` is an useful tool for CPU core and memory binding on NUMA platform. Besides, ``--cpuset-cpus`` and ``--cpuset-mems`` arguments of ``docker run`` are also useful. + $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores + + # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. + CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ + 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 + 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 + 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 + 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 + 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 + 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 + 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 + 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + + # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 + $ export VLLM_CPU_OMP_THREADS_BIND=0-7 + $ python examples/offline_inference.py + +- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. 
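Combining the variables documented above, a 2-way tensor-parallel CPU deployment on a hypothetical dual-socket machine could be launched roughly as follows; the core ranges and topology are illustrative assumptions, not values taken from the documentation.

.. code-block:: console

   # Assumed topology: physical cores 0-15 on NUMA node 0 and 16-31 on node 1.
   # One OpenMP core group per tensor-parallel rank, separated by '|'; the last two
   # cores of each socket are left unbound for the serving framework.
   $ export VLLM_CPU_KVCACHE_SPACE=40
   $ export VLLM_CPU_OMP_THREADS_BIND="0-13|16-29"
   $ vllm serve facebook/opt-125m --tensor-parallel-size 2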
diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 754070df21c0a..a8ce104d83290 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,6 +2,6 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.3.1+cpu; platform_machine != "ppc64le" -torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +torch == 2.4.0; platform_machine != "ppc64le" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e9c6fc3a255e4..58cae46d9af27 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -296,6 +296,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: pynccl_comm = self.pynccl_comm if (pynccl_comm is not None and not pynccl_comm.disabled): pynccl_comm.all_reduce(input_) + elif input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) else: torch.distributed.all_reduce(input_, group=self.device_group) return input_ diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 16b7bc64a2849..93cc319f11c42 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -410,8 +410,6 @@ def _get_executor_cls( from vllm.executor.tpu_executor import TPUExecutorAsync executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": - assert distributed_executor_backend is None, ( - "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "openvino": diff --git a/vllm/envs.py b/vllm/envs.py index 595992e51db87..f06b6d66ea6f4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,6 +29,7 @@ VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_OPENVINO_KVCACHE_SPACE: int = 0 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False @@ -241,11 +242,16 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - # CPU key-value cache space + # (CPU backend only) CPU key-value cache space. # default is 4GB "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", + # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. 
+ "VLLM_CPU_OMP_THREADS_BIND": + lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + # OpenVINO key-value cache space # default is 4GB "VLLM_OPENVINO_KVCACHE_SPACE": diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 23e429dac7232..3229e5ad20afa 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -1,16 +1,21 @@ -from typing import List, Set, Tuple +import os +from functools import partial +from typing import Any, Awaitable, List, Optional, Set, Tuple, Union import torch import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, + ResultHandler, WorkerMonitor) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) +from vllm.utils import (get_distributed_init_method, get_open_port, + get_vllm_instance_id, make_async) +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -22,46 +27,173 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" assert self.lora_config is None, "cpu backend doesn't support LoRA" + + # + # Environment variables for CPU executor + # + + # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers + os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() + + # Disable torch async compiling which won't work with daemonic processes + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + + # Intel OpenMP setting + ld_prealod_str = os.getenv("LD_PRELOAD", "") + if "libiomp5.so" in ld_prealod_str: + # The time(milliseconds) that a thread should wait after + # completing the execution of a parallel region, before sleeping. + os.environ['KMP_BLOCKTIME'] = "1" + # Prevents the CPU to run into low performance state + os.environ['KMP_TPAUSE'] = "0" + # Provides fine granularity parallelism + os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" + + # To hint IPEX uses shared memory based AllReduce + os.environ["LOCAL_WORLD_SIZE"] = str( + self.parallel_config.tensor_parallel_size) + self.model_config = _verify_and_get_model_config(self.model_config) self.cache_config = _verify_and_get_cache_config(self.cache_config) self.scheduler_config = _verify_and_get_scheduler_config( self.scheduler_config) - # Instantiate the worker and load the model to CPU. - self._init_worker() - - def _init_worker(self): - from vllm.worker.cpu_worker import CPUWorker + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. 
+ ip = "127.0.0.1" + port = get_open_port() + self.distributed_init_method = get_distributed_init_method(ip, port) + + is_async = isinstance(self, CPUExecutorAsync) + + world_size = self.parallel_config.tensor_parallel_size + result_handler = ResultHandler() + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + self.workers = [] + + if is_async: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(0, world_size) + ] + self.driver_worker = self.workers[0] + self.workers = self.workers[1:] + self.driver_method_invoker = _async_driver_method_invoker + else: + self.driver_worker = self._create_worker() + self.driver_method_invoker = _driver_method_invoker + + if world_size != 1: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(1, world_size) + ] + + if world_size != 1 or is_async: + if is_async: + async_worker_list = self.workers + [self.driver_worker] + else: + async_worker_list = self.workers + self.worker_monitor = WorkerMonitor(async_worker_list, + result_handler) + result_handler.start() + self.worker_monitor.start() + + self._run_workers("init_device") + self._run_workers("load_model") + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + ): + worker_module_name = "vllm.worker.cpu_worker" + worker_class_name = "CPUWorker" + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) - assert self.parallel_config.world_size == 1, ( - "CPUExecutor only supports single CPU socket currently.") + assert self.distributed_init_method is not None - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = CPUWorker( + kwargs = dict( model_config=self.model_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, load_config=self.load_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, + local_rank=local_rank, + rank=rank, + distributed_init_method=self.distributed_init_method, lora_config=self.lora_config, multimodal_config=self.multimodal_config, kv_cache_dtype=self.cache_config.cache_dtype, prompt_adapter_config=self.prompt_adapter_config, - is_driver_worker=True, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() + wrapper.init_worker(**kwargs) + + return wrapper.worker + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. + + Args: + async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than + blocking on the results. + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + # Start the workers first. + worker_outputs = [ + worker.execute_method(method, *args, **kwargs) + for worker in self.workers + ] + + if async_run_remote_workers_only: + # Just return futures + return worker_outputs + + driver_worker_output = self.driver_method_invoker( + self.driver_worker, method, *args, **kwargs) + + # Get the results of the workers. 
+ return [driver_worker_output + ] + [output.get() for output in worker_outputs] def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ - return self.driver_worker.determine_num_available_blocks() + return self.driver_method_invoker(self.driver_worker, + "determine_num_available_blocks") def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: @@ -74,43 +206,95 @@ def initialize_cache(self, num_gpu_blocks: int, # referred as `gpu block`. Because we want to reuse the existing block # management procedure. logger.info("# CPU blocks: %d", num_gpu_blocks) - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = self.driver_worker.execute_model(execute_model_req) + if (self.parallel_config.tensor_parallel_size > 1 + and self.parallel_worker_tasks is None): + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + ) + output = self.driver_method_invoker(self.driver_worker, + "execute_model", execute_model_req) return output + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + """ + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + self.driver_method_invoker(self.driver_worker, "execute_model", None) + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + def add_lora(self, lora_request: LoRARequest) -> bool: - return self.driver_worker.add_lora(lora_request) + return all(self._run_workers("add_lora", lora_request)) def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_id) + return all(self._run_workers("remove_lora", lora_id)) def pin_lora(self, lora_id: int) -> bool: - return self.driver_worker.pin_lora(lora_id) + assert lora_id > 0, "lora_id must be greater than 0." + return all(self._run_workers( + "pin_lora", + lora_id=lora_id, + )) def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() + return self.driver_method_invoker(self.driver_worker, "list_loras") def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.driver_worker.add_prompt_adapter(prompt_adapter_request) + return all( + self._run_workers( + "add_prompt_adapter", + prompt_adapter_request, + )) def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.remove_prompt_adapter(prompt_adapter_id) + return all( + self._run_workers( + "remove_prompt_adapter", + prompt_adapter_id, + )) def list_prompt_adapters(self) -> Set[int]: - return self.driver_worker.list_prompt_adapters() + return self.driver_method_invoker(self.driver_worker, + "list_prompt_adapters") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.pin_prompt_adapter(prompt_adapter_id) + return all(self._run_workers( + "pin_prompt_adapter", + prompt_adapter_id, + )) def check_health(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. 
- return + """Raises an error if engine is unhealthy.""" + if self.worker_monitor is not None and not self.worker_monitor.is_alive( + ): + raise RuntimeError("Worker processes are not running") + + def shutdown(self): + if (worker_monitor := getattr(self, "worker_monitor", + None)) is not None: + worker_monitor.close() + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + for result in parallel_worker_tasks: + result.get() class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): @@ -118,14 +302,12 @@ class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model + output = await make_async(self.execute_model )(execute_model_req=execute_model_req, ) return output async def check_health_async(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. - return + self.check_health() def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: @@ -170,3 +352,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: f" {kv_cache_space}, expect a positive integer value.") return config + + +def _driver_method_invoker(driver, method: str, *args, **kwargs): + return getattr(driver, method)(*args, **kwargs) + + +def _async_driver_method_invoker(driver, method: str, *args, **kwargs): + return driver.execute_method(method, *args, **kwargs).get() diff --git a/vllm/utils.py b/vllm/utils.py index 876c3bf90b02c..90be09fc7b967 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -404,27 +404,6 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def init_kmp_env(): - if not is_cpu(): - return - - ld_prealod_str = os.getenv("LD_PRELOAD", "") - if "libiomp5.so" not in ld_prealod_str: - return - - # The time(milliseconds) that a thread should wait after completing the - # execution of a parallel region, before sleeping. 
- os.environ['KMP_BLOCKTIME'] = "1" - # dump settings on start up - os.environ['KMP_SETTINGS'] = "1" - # Prevents the CPU to run into low performance state - os.environ['KMP_TPAUSE'] = "0" - # Provides fine granularity parallelism - os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" - - def chunk_list(lst: List[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 83f4ba69fb728..71763c08ec45f 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -42,6 +42,7 @@ class CPUModelInput(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None sampling_metadata: Optional["SamplingMetadata"] = None multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None + virtual_engine: Optional[int] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -204,8 +205,8 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, seq_lens=seq_lens, - seq_lens_tensor=None, - max_decode_seq_len=None, + seq_lens_tensor=torch.tensor([]), + max_decode_seq_len=0, num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, @@ -345,7 +346,7 @@ def prepare_model_input( multi_modal_kwargs=multi_modal_kwargs, ) - @torch.inference_mode() + @torch.no_grad() def execute_model( self, model_input: CPUModelInput, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3c22c73267b7f..735d48c908d61 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -4,6 +4,7 @@ import torch import torch.distributed +import vllm.envs as envs from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, @@ -13,7 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, init_kmp_env +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerInput) @@ -152,13 +153,18 @@ def __init__( if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." - # try to initialize intel openmp optimized tunings - init_kmp_env() - if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() + + # Setup OpenMP threads affinity. + omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + if omp_cpuids == "all": + self.local_omp_cpuid = "all" + else: + self.local_omp_cpuid = omp_cpuids.split("|")[rank] + self.model_runner: CPUModelRunner = CPUModelRunner( model_config, parallel_config, @@ -177,6 +183,9 @@ def __init__( self.cpu_cache: List[List[torch.Tensor]] def init_device(self) -> None: + if self.local_omp_cpuid != "all": + torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) + self.init_distributed_environment() # Set random seed. 
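The new `VLLM_CPU_OMP_THREADS_BIND` handling in `cpu_worker.py` implies a simple format: either the literal string `all`, or one CPU list per tensor-parallel rank joined by `|`. A hedged sketch of the per-rank resolution follows; the helper name and the example CPU ranges are assumptions, only the `split("|")[rank]` behavior comes from the patch.

```python
def resolve_omp_cpuid(bind_spec: str, rank: int) -> str:
    """Resolve this rank's CPU binding from VLLM_CPU_OMP_THREADS_BIND."""
    if bind_spec == "all":
        # No explicit binding: let OpenMP use every core.
        return "all"
    # One CPU list per tensor-parallel rank, separated by "|".
    return bind_spec.split("|")[rank]


assert resolve_omp_cpuid("all", 0) == "all"
assert resolve_omp_cpuid("0-31|32-63", 1) == "32-63"
```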
set_random_seed(self.model_config.seed) From 281977bd6eccade50be461f5a22cc51b74006976 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 17:32:44 -0400 Subject: [PATCH 21/45] [Doc] Add Nemotron to supported model docs (#6843) --- docs/source/models/supported_models.rst | 4 ++++ vllm/model_executor/layers/activation.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dc8bd6fb245df..483f552bba238 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -113,6 +113,10 @@ Decoder-only Language Models - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - + * - :code:`NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 6578193a31597..4c14fe476ee4a 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -166,9 +166,7 @@ class ReLUSquaredActivation(CustomOp): def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" - relu_applied = nn.functional.relu(x) - squared = torch.square(relu_applied) - return squared + return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: return self.forward_native(x) From 150a1ffbfd3d0429d30fa5ab841f53903a0a8a62 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 26 Jul 2024 17:39:10 -0400 Subject: [PATCH 22/45] [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) --- docs/source/serving/run_on_sky.rst | 430 ++++++++++++++++------------- 1 file changed, 243 insertions(+), 187 deletions(-) diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst index bd33c76cec3de..674b14a879bc3 100644 --- a/docs/source/serving/run_on_sky.rst +++ b/docs/source/serving/run_on_sky.rst @@ -5,9 +5,9 @@ Deploying and scaling up with SkyPilot .. raw:: html -

- vLLM -

+

+ vLLM +

vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. @@ -21,8 +21,8 @@ Prerequisites .. code-block:: console - pip install skypilot-nightly - sky check + pip install skypilot-nightly + sky check Run on a single instance @@ -32,64 +32,64 @@ See the vLLM SkyPilot YAML for serving, `serving.yaml # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): .. code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. .. code-block:: console - (task, pid=7431) Running on public URL: https://.gradio.live + (task, pid=7431) Running on public URL: https://.gradio.live **Optional**: Serve the 70B model instead of the default 8B and use more GPU: .. 
code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct Scale up to multiple replicas @@ -99,151 +99,212 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_tokens: 1 - + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + .. raw:: html -
- Click to see the full recipe YAML +
+ Click to see the full recipe YAML .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? max_tokens: 1 - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log .. raw:: html -
+
Start the serving the Llama-3 8B model on multiple replicas: .. code-block:: console - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN Wait until the service is ready: .. code-block:: console - watch -n10 sky serve status vllm + watch -n10 sky serve status vllm .. raw:: html -
- Example outputs: +
+ Example outputs: .. code-block:: console - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4 + Service Replicas + SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION + vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 + vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 .. raw:: html - -
+ +
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: .. code-block:: console - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could specify additional configs in `services`: + ENDPOINT=$(sky serve status --endpoint 8081 vllm) + curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' + +To enable autoscaling, you could replace the `replicas` with the following configs in `service`: .. code-block:: yaml - services: - replica_policy: - min_replicas: 0 - max_replicas: 3 - target_qps_per_replica: 2 + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 This will scale the service up to when the QPS exceeds 2 for each replica. + +.. raw:: html + +
+ Click to see the full recipe YAML + + +.. code-block:: yaml + + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + + +.. raw:: html + +
+ +To update the service with the new config: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN + + +To stop the service: + +.. code-block:: console + + sky serve down vllm + **Optional**: Connect a GUI to the endpoint ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -253,58 +314,53 @@ It is also possible to access the Llama-3 service with a separate GUI frontend, .. raw:: html -
- Click to see the full GUI YAML +
+ Click to see the full GUI YAML .. code-block:: yaml - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda activate vllm - if [ $? -ne 0 ]; then - conda create -n vllm python=3.10 -y - conda activate vllm - fi - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - WORKER_IP=$(hostname -I | cut -d' ' -f1) - CONTROLLER_PORT=21001 - WORKER_PORT=21002 - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + + resources: + cpus: 2 + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + + run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log + .. raw:: html - -
+ +
1. Start the chat web UI: .. code-block:: console - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) 2. Then, we can access the GUI at the returned gradio link: .. code-block:: console - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live From b5f49ee55beac7bb314fd8880bdb718ed117dacb Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:26:45 -0400 Subject: [PATCH 23/45] Update README.md (#6847) --- examples/fp8/quantizer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md index 8f89a74a6a367..0b6944f688b49 100644 --- a/examples/fp8/quantizer/README.md +++ b/examples/fp8/quantizer/README.md @@ -16,7 +16,7 @@ #### Run on H100 system for speed if FP8; number of GPUs depends on the model size #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: -`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` +`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) ``` From bb5494676f5f57f1cf7cf72598de5434a2a22865 Mon Sep 17 00:00:00 2001 From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:32:20 -0700 Subject: [PATCH 24/45] enforce eager mode with bnb quantization temporarily (#6846) --- vllm/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 6403a53f86281..92fde449b43fd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -282,6 +282,10 @@ def verify_with_parallel_config( raise ValueError( "BitAndBytes quantization with TP or PP is not supported yet.") + if self.quantization == "bitsandbytes" and self.enforce_eager is False: + raise ValueError( + "BitAndBytes with enforce_eager = False is not supported yet.") + def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" From d09b94ca588c6de1e627194264357e14460ae2eb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 18:45:57 -0700 Subject: [PATCH 25/45] [TPU] Support collective communications in XLA devices (#6813) --- .../device_communicators/tpu_communicator.py | 30 +++++++++++++++++++ vllm/distributed/parallel_state.py | 22 ++++++++++++++ vllm/lora/layers.py | 4 +++ .../model_executor/layers/logits_processor.py | 16 ++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 vllm/distributed/device_communicators/tpu_communicator.py diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py new file mode 100644 index 0000000000000..69a9a516f3ebe --- /dev/null +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -0,0 +1,30 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_tpu(): + import torch_xla.core.xla_model as xm + from torch_xla._internal 
import pjrt + + +class TpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_tpu(): + self.disabled = True + return + self.disabled = False + + local_rank = dist.get_rank(group) + world_size = dist.get_world_size(group) + pjrt.initialize_multiprocess(local_rank, world_size) + xm._init_world_size_ordinal() + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + return xm.all_reduce(xm.REDUCE_SUM, x) + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + assert dim == -1, "TPUs only support dim=-1 for all-gather." + return xm.all_gather(x, dim=dim) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 58cae46d9af27..4116b1729d188 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -133,6 +133,7 @@ def __init__( torch_distributed_backend: Union[str, Backend], use_pynccl: bool, use_custom_allreduce: bool, + use_tpu_communicator: bool, use_message_queue_broadcaster: bool = False, ): @@ -164,6 +165,7 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce + self.use_tpu_communicator = use_tpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -190,6 +192,12 @@ def __init__( else: self.ca_comm = None + from vllm.distributed.device_communicators.tpu_communicator import ( + TpuCommunicator) + self.tpu_communicator: Optional[TpuCommunicator] + if use_tpu_communicator and self.world_size > 1: + self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -289,6 +297,12 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # Bypass the function if we are using only 1 GPU. if self.world_size == 1: return input_ + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_reduce(input_) + if ca_comm is not None: out = ca_comm.custom_all_reduce(input_) if out is not None: @@ -310,6 +324,12 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: return input_ assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. 
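The dispatch order in `GroupCoordinator.all_reduce` is: skip communication for a single-rank group, prefer an enabled device-specific communicator, and otherwise fall back to the regular path. A simplified sketch of that decision, with a duck-typed communicator standing in for `TpuCommunicator`; the fallback branch is deliberately omitted and the function name is invented.

```python
from typing import Optional, Protocol

import torch


class DeviceCommunicator(Protocol):
    disabled: bool

    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
        ...


def group_all_reduce(x: torch.Tensor,
                     world_size: int,
                     tpu_comm: Optional[DeviceCommunicator] = None
                     ) -> torch.Tensor:
    # A single-rank group never needs to communicate.
    if world_size == 1:
        return x
    # Prefer the platform-specific communicator when it is usable.
    if tpu_comm is not None and not tpu_comm.disabled:
        return tpu_comm.all_reduce(x)
    # Otherwise fall back to the regular torch.distributed path (omitted here).
    raise NotImplementedError("torch.distributed fallback not shown")


print(group_all_reduce(torch.ones(4), world_size=1))
```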
dim += input_.dim() @@ -727,6 +747,7 @@ def init_world_group(ranks: List[int], local_rank: int, torch_distributed_backend=backend, use_pynccl=False, use_custom_allreduce=False, + use_tpu_communicator=False, ) @@ -745,6 +766,7 @@ def init_model_parallel_group( torch_distributed_backend=backend, use_pynccl=True, use_custom_allreduce=use_custom_allreduce, + use_tpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 40de134c0a5ee..87de285a373a2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1067,6 +1067,10 @@ def scale(self): def soft_cap(self): return self.base_layer.soft_cap + @property + def use_gather(self): + return self.base_layer.use_gather + @property def org_vocab_size(self): return self.base_layer.org_vocab_size diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index f6fcf49ef464b..bd3e7e114204f 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -5,10 +5,12 @@ import torch import torch.nn as nn -from vllm.distributed import tensor_model_parallel_gather +from vllm.distributed import (tensor_model_parallel_all_gather, + tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform class LogitsProcessor(nn.Module): @@ -39,6 +41,8 @@ def __init__(self, self.org_vocab_size = org_vocab_size or vocab_size # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap + # Whether to use gather or all-gather to gather the logits. + self.use_gather = not current_platform.is_tpu() def forward( self, @@ -76,7 +80,15 @@ def _get_logits(self, hidden_states: torch.Tensor, logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - logits = tensor_model_parallel_gather(logits) + if self.use_gather: + logits = tensor_model_parallel_gather(logits) + else: + # Gather is not supported for some devices such as TPUs. + # Use all-gather instead. + # NOTE(woosuk): Here, the outputs of every device should not be None + # because XLA requires strict SPMD among all devices. Every device + # should execute the same operations after gathering the logits. + logits = tensor_model_parallel_all_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[:, :self.org_vocab_size] From 981b0d567355063d5453e382a85970cae083c615 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 09:58:25 +0800 Subject: [PATCH 26/45] [Frontend] Factor out code for running uvicorn (#6828) --- vllm/entrypoints/api_server.py | 74 ++++++++++++++++++--------- vllm/entrypoints/openai/api_server.py | 72 ++++++++------------------ vllm/server/__init__.py | 3 ++ vllm/server/launch.py | 42 +++++++++++++++ 4 files changed, 116 insertions(+), 75 deletions(-) create mode 100644 vllm/server/__init__.py create mode 100644 vllm/server/launch.py diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 66941442c8c9c..3476357658522 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -5,12 +5,12 @@ We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. 
""" - +import asyncio import json import ssl -from typing import AsyncGenerator +from argparse import Namespace +from typing import Any, AsyncGenerator, Optional -import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -18,8 +18,10 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -81,6 +83,50 @@ async def stream_results() -> AsyncGenerator[bytes, None]: return JSONResponse(ret) +def build_app(args: Namespace) -> FastAPI: + global app + + app.root_path = args.root_path + return app + + +async def init_app( + args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, +) -> FastAPI: + app = build_app(args) + + global engine + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = (llm_engine + if llm_engine is not None else AsyncLLMEngine.from_engine_args( + engine_args, usage_context=UsageContext.API_SERVER)) + + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) + + app = await init_app(args, llm_engine) + await serve_http( + app, + host=args.host, + port=args.port, + log_level=args.log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + if __name__ == "__main__": parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) @@ -105,25 +151,5 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--log-level", type=str, default="debug") parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args( - engine_args, usage_context=UsageContext.API_SERVER) - - app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) - - uvicorn.run(app, - host=args.host, - port=args.port, - log_level=args.log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs) + asyncio.run(run_server(args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0fe4dd245b5e6..c1640a10a407d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,14 +2,12 @@ import importlib import inspect import re -import signal +from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Optional, Set +from typing import Any, Optional, Set -import fastapi -import uvicorn -from fastapi import APIRouter, Request +from fastapi import APIRouter, FastAPI, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from 
fastapi.responses import JSONResponse, Response, StreamingResponse @@ -38,6 +36,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.logger import init_logger +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION @@ -57,7 +56,7 @@ @asynccontextmanager -async def lifespan(app: fastapi.FastAPI): +async def lifespan(app: FastAPI): async def _force_log(): while True: @@ -75,7 +74,7 @@ async def _force_log(): router = APIRouter() -def mount_metrics(app: fastapi.FastAPI): +def mount_metrics(app: FastAPI): # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app()) # Workaround for 307 Redirect for /metrics @@ -165,8 +164,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) -def build_app(args): - app = fastapi.FastAPI(lifespan=lifespan) +def build_app(args: Namespace) -> FastAPI: + app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path @@ -214,11 +213,8 @@ async def authentication(request: Request, call_next): return app -async def build_server( - args, - llm_engine: Optional[AsyncLLMEngine] = None, - **uvicorn_kwargs, -) -> uvicorn.Server: +async def init_app(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI: app = build_app(args) if args.served_model_name is not None: @@ -281,14 +277,17 @@ async def build_server( ) app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) - config = uvicorn.Config( + app = await init_app(args, llm_engine) + await serve_http( app, host=args.host, port=args.port, @@ -301,36 +300,6 @@ async def build_server( **uvicorn_kwargs, ) - return uvicorn.Server(config) - - -async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None: - logger.info("vLLM API server version %s", VLLM_VERSION) - logger.info("args: %s", args) - - server = await build_server( - args, - llm_engine, - **uvicorn_kwargs, - ) - - loop = asyncio.get_running_loop() - - server_task = loop.create_task(server.serve()) - - def signal_handler() -> None: - # prevents the uvicorn signal handler to exit early - server_task.cancel() - - loop.add_signal_handler(signal.SIGINT, signal_handler) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - try: - await server_task - except asyncio.CancelledError: - print("Gracefully stopping http server") - await server.shutdown() - if __name__ == "__main__": # NOTE(simon): @@ -339,4 +308,5 @@ def signal_handler() -> None: description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) args = parser.parse_args() + asyncio.run(run_server(args)) diff --git a/vllm/server/__init__.py b/vllm/server/__init__.py new file mode 100644 index 0000000000000..17c98b4dad6c9 --- /dev/null +++ b/vllm/server/__init__.py @@ -0,0 +1,3 @@ +from .launch import serve_http + +__all__ = ["serve_http"] diff --git a/vllm/server/launch.py b/vllm/server/launch.py new file mode 100644 index 
0000000000000..1a8aeb7f1022b --- /dev/null +++ b/vllm/server/launch.py @@ -0,0 +1,42 @@ +import asyncio +import signal +from typing import Any + +import uvicorn +from fastapi import FastAPI + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +async def serve_http(app: FastAPI, **uvicorn_kwargs: Any) -> None: + logger.info("Available routes are:") + for route in app.routes: + methods = getattr(route, "methods", None) + path = getattr(route, "path", None) + + if methods is None or path is None: + continue + + logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + + config = uvicorn.Config(app, **uvicorn_kwargs) + server = uvicorn.Server(config) + + loop = asyncio.get_running_loop() + + server_task = loop.create_task(server.serve()) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + except asyncio.CancelledError: + logger.info("Gracefully stopping http server") + await server.shutdown() From 55712941e57bcfd662db2905811d6e2807b9153f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 26 Jul 2024 22:27:44 -0400 Subject: [PATCH 27/45] [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) --- .../broadcast_load_epilogue_c3x.hpp | 46 +++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 877a9f5b9e5de..e4bc9752ed7db 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -328,20 +328,36 @@ struct Sm90ColOrScalarBroadcast { return EmptyProducerLoadCallbacks{}; } - template + template struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { CUTLASS_DEVICE - ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params) - : tCgCol(cute::forward(tCgCol)), - tCrCol(cute::forward(tCrCol)), - params(params) {} + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + params(params) {} GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) Params const& params; + int m; CUTLASS_DEVICE void begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + if (!params.col_broadcast) { fill(tCrCol, *(params.ptr_col)); return; @@ -349,7 +365,7 @@ struct Sm90ColOrScalarBroadcast { // Filter so we don't issue redundant copies over stride-0 modes // (only works if 0-strides are in same location, which is by construction) - copy_aligned(filter(tCgCol), filter(tCrCol)); + copy_if(pred, filter(tCgCol), filter(tCrCol)); } template @@ -381,8 +397,20 @@ struct Sm90ColOrScalarBroadcast { mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - return ConsumerStoreCallbacks( - cute::move(tCgCol), cute::move(tCrCol), params); + // Generate an identity tensor matching 
the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + params + ); } }; From 969d03226514d99f43cf7d17d1e336231d91751a Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 26 Jul 2024 23:02:25 -0400 Subject: [PATCH 28/45] [Bugfix]: Fix Tensorizer test failures (#6835) --- .buildkite/test-pipeline.yaml | 1 - tests/tensorizer_loader/conftest.py | 45 ++++++++++++++++++++++ tests/tensorizer_loader/test_tensorizer.py | 7 +--- 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 tests/tensorizer_loader/conftest.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 633bc5ca95bf9..5b4a786305e1f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -220,7 +220,6 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - soft_fail: true fast_check: true commands: - apt-get install -y curl libsodium23 diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py new file mode 100644 index 0000000000000..c5c6fc1057d31 --- /dev/null +++ b/tests/tensorizer_loader/conftest.py @@ -0,0 +1,45 @@ +# isort: skip_file + +import contextlib +import gc + +import pytest +import ray +import torch + +from vllm.distributed import (destroy_distributed_environment, + destroy_model_parallel) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + + +def cleanup(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.cuda.empty_cache() + ray.shutdown() + + +@pytest.fixture() +def should_do_global_cleanup_after_test(request) -> bool: + """Allow subdirectories to skip global cleanup by overriding this fixture. + This can provide a ~10x speedup for non-GPU unit tests since they don't need + to initialize torch. 
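The `should_do_global_cleanup_after_test` fixture above is meant to be overridden by nested conftest files. A minimal usage sketch of such an override, as a hypothetical subdirectory `conftest.py` that is not part of this patch:

```python
# Hypothetical conftest.py in a test subdirectory that opts out of the
# expensive cleanup between tests; the fixture name comes from the patch.
import pytest


@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
    return False
```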
+ """ + + return True + + +@pytest.fixture(autouse=True) +def cleanup_fixture(should_do_global_cleanup_after_test: bool): + yield + if should_do_global_cleanup_after_test: + cleanup() + + +@pytest.fixture(autouse=True) +def tensorizer_config(): + config = TensorizerConfig(tensorizer_uri="vllm") + return config \ No newline at end of file diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b7030e3cd6d42..2adeae8874bdb 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -40,7 +40,6 @@ tensorize_model_for_testing_script = os.path.join( os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py") - def is_curl_installed(): try: subprocess.check_call(['curl', '--version']) @@ -63,10 +62,6 @@ def write_keyfile(keyfile_path: str): with open(keyfile_path, 'wb') as f: f.write(encryption_params.key) -@pytest.fixture(autouse=True) -def tensorizer_config(): - config = TensorizerConfig(tensorizer_uri="vllm") - return config @patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') @@ -105,6 +100,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + cleanup() with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -316,6 +312,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): + cleanup() model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) From ced36cd89b9c012eb066ef863b2d1ecf052f3e00 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:16:13 -0700 Subject: [PATCH 29/45] [ROCm] Upgrade PyTorch nightly version (#6845) --- Dockerfile.rocm | 4 ++-- docs/source/getting_started/amd-installation.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 7b4c0166a04bd..64bc0f3c12c75 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ + torch==2.5.0.dev20240726 \ + torchvision==0.20.0.dev20240726 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 1c7d274b7c47e..9648d07d2790c 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. 
$ # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 $ # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi From 3c3012398e4aecde9e40981d79a0576203158d24 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:20:16 -0700 Subject: [PATCH 30/45] [Doc] add VLLM_TARGET_DEVICE=neuron to documentation for neuron (#6844) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com> --- docs/source/getting_started/neuron-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index 62bf779c339d5..0816524468cab 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able $ git clone https://github.com/vllm-project/vllm.git $ cd vllm $ pip install -U -r requirements-neuron.txt - $ pip install . + $ VLLM_TARGET_DEVICE="neuron" pip install . If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. From ed94e4f427bce8611e198d051dbd3b0097b448e8 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Sat, 27 Jul 2024 06:45:31 +0300 Subject: [PATCH 31/45] [Bugfix][Model] Jamba assertions and no chunked prefill by default for Jamba (#6784) --- vllm/engine/arg_utils.py | 6 +++++- vllm/model_executor/models/jamba.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd64d3345b830..bad5be4917216 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + has_seqlen_agnostic_layers = ( + model_config.contains_seqlen_agnostic_layers( + parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching): + and not self.enable_prefix_caching + and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index d4e4f0055aa2b..3444578227259 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -644,6 +644,11 @@ def __init__( lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, ) -> None: + assert not scheduler_config.chunked_prefill_enabled, \ + "Jamba currently does not support chunked prefill" + assert not cache_config.enable_prefix_caching, \ + "Jamba currently does not support prefix caching" + super().__init__() self.config = config self.scheduler_config = scheduler_config From 14dbd5a7674e5de2862c18adb711d9feecd35063 Mon Sep 17 00:00:00 2001 From: Joe Date: Fri, 26 Jul 2024 20:47:50 -0700 Subject: [PATCH 32/45] [Model] H2O Danube3-4b (#6451) --- .buildkite/run-cpu-test.sh | 2 +- 
.../kernels/benchmark_paged_attention.py | 2 +- benchmarks/kernels/benchmark_rope.py | 2 +- csrc/attention/attention_kernels.cu | 6 +++ tests/kernels/test_attention.py | 4 +- tests/kernels/test_cache.py | 8 ++- tests/kernels/test_pos_encoding.py | 2 +- tests/models/test_danube3_4b.py | 52 +++++++++++++++++++ vllm/attention/ops/paged_attn.py | 2 +- vllm/utils.py | 6 +++ 10 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 tests/models/test_danube3_4b.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 21deec2bba973..45bc8eb2f8477 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest Pillow protobuf - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 78cac8a555d1b..a04433142da42 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 78736c7a7ba6f..f542684a9a2a9 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 350dbce1d7ba9..875570a1e894f 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -751,6 +751,9 @@ void paged_attention_v1_launcher( case 112: LAUNCH_PAGED_ATTENTION_V1(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; @@ -912,6 +915,9 @@ void paged_attention_v2_launcher( case 112: LAUNCH_PAGED_ATTENTION_V2(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 2e6412c28958e..c7c6707461c3e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # 
https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] @@ -134,6 +134,8 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f9a609464abfc..3fb9b59be1701 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -52,6 +52,8 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -124,6 +126,8 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -325,6 +329,8 @@ def test_swap_blocks( ) -> None: if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 4c83659929d41..4a7ad6e0fa21d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/tests/models/test_danube3_4b.py b/tests/models/test_danube3_4b.py new file mode 100644 index 0000000000000..bfaa275f73c19 --- /dev/null +++ b/tests/models/test_danube3_4b.py @@ -0,0 +1,52 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +This tests danube3 separately because its head size isn't supported on CPU yet. + +Run `pytest tests/models/test_danube3_4b.py`. 
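The kernel tests in this patch skip fp8 KV-cache cases whenever the head size is not a multiple of 16, and the same condition is raised as a hard error in `vllm/utils.py` later in the series. A small sketch of that guard; the function name is invented, and 120 is the head size this series adds for Danube3.

```python
def check_fp8_kv_cache_head_size(cache_dtype: str, head_size: int) -> None:
    """Reject head sizes the fp8 KV-cache path cannot handle."""
    if cache_dtype == "fp8" and head_size % 16:
        raise ValueError(
            f"fp8 KV cache does not support head_size {head_size}")


check_fp8_kv_cache_head_size("auto", 120)  # fine: fp8 is not in use
check_fp8_kv_cache_head_size("fp8", 128)   # fine: multiple of 16
# check_fp8_kv_cache_head_size("fp8", 120) would raise ValueError
```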
+""" +import pytest + +from .utils import check_outputs_equal + +MODELS = ["h2oai/h2o-danube3-4b-base"] + +target_dtype = "half" + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index ce7b4d129779c..0f6d2f2d1ab3f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 192, 256] + return [64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/utils.py b/vllm/utils.py index 90be09fc7b967..1448316e66edb 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -508,6 +508,12 @@ def create_kv_caches_with_random( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) From 52f07e3dec2b76045208f5cfea5670b85a719cc6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:54:27 -0700 Subject: [PATCH 33/45] [Hardware][TPU] Implement tensor parallelism with Ray (#5871) --- requirements-tpu.txt | 1 + vllm/attention/backends/pallas.py | 4 +- vllm/engine/llm_engine.py | 10 +- vllm/executor/ray_tpu_executor.py | 313 ++++++++++++++++++++++++++++++ vllm/worker/tpu_model_runner.py | 42 ++-- vllm/worker/tpu_worker.py | 16 +- 6 files changed, 365 insertions(+), 21 deletions(-) create mode 100644 vllm/executor/ray_tpu_executor.py diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 22487f5524dd7..c2140fbffec9f 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -4,4 +4,5 @@ # Dependencies for TPU # Currently, the TPU backend uses a nightly version of PyTorch XLA. # You can install the dependencies in Dockerfile.tpu. +ray triton # To avoid import errors diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b83a83bb177d4..c53a2f91b89d7 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -55,8 +55,8 @@ class PallasMetadata(AttentionMetadata): # Currently, input sequences can only contain all prefills # or all decoding. 
- block_tables: Optional[torch.Tensor] - context_lens: Optional[torch.Tensor] + block_tables: Optional[torch.Tensor] = None + context_lens: Optional[torch.Tensor] = None @property def prefill_metadata(self) -> Optional["PallasMetadata"]: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 48d5305892219..004348d4c49a3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -394,8 +394,14 @@ def _get_executor_cls(cls, from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutor - executor_class = TPUExecutor + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutor + executor_class = RayTPUExecutor + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutor + executor_class = TPUExecutor elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py new file mode 100644 index 0000000000000..7048d47980723 --- /dev/null +++ b/vllm/executor/ray_tpu_executor.py @@ -0,0 +1,313 @@ +import asyncio +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple, + Union) + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.tpu_executor import TPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + get_vllm_instance_id, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayTPUExecutor(TPUExecutor): + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + super().__init__(*args, **kwargs) + + def _init_executor(self) -> None: + assert self.parallel_config.distributed_executor_backend == "ray" + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel TPU workers. + self._init_workers_ray(placement_group) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. 
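The `_get_executor_cls` hunk above keys the TPU path on the distributed backend: Ray gets the new multi-host executor, and anything else must be unset and falls back to the single-host one. A condensed, standalone sketch of that dispatch, returning class names as strings since the real engine imports the executor classes lazily:

def select_tpu_executor(distributed_executor_backend):
    # Mirrors the new TPU branch in LLMEngine._get_executor_cls: "ray" means
    # multi-host tensor parallelism, otherwise no backend may be set at all.
    if distributed_executor_backend == "ray":
        # The real code also calls initialize_ray_cluster(parallel_config)
        # before importing RayTPUExecutor.
        return "RayTPUExecutor"
    assert distributed_executor_backend is None
    return "TPUExecutor"


assert select_tpu_executor("ray") == "RayTPUExecutor"
assert select_tpu_executor(None) == "TPUExecutor"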
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("TPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + assert self.speculative_config is None + worker_module_name = "vllm.worker.tpu_worker" + worker_class_name = "TPUWorker" + + worker = ray.remote( + num_cpus=0, + resources={"TPU": 1}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any TPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "TPU node.") + + # Get the set of TPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for _ in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + + if len(node_workers) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. 
+ """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + num_blocks = self._run_workers("determine_num_available_blocks", ) + num_tpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + return num_tpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + **self.extra_execute_model_run_workers_kwargs) + + # Only the driver worker returns the sampling results. 
+ return self._driver_execute_model(execute_model_req) + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + +class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. + return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 8a8b412db6731..e5bb101fc7df4 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from unittest.mock import patch import numpy as np import torch @@ -45,6 +46,7 @@ class ModelInputForTPU(ModelRunnerInputBase): num_samples: int best_of: List[int] seq_groups: List[List[int]] + virtual_engine: int = 0 def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -55,6 +57,9 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, + "best_of": self.best_of, + "seq_groups": self.seq_groups, + "virtual_engine": self.virtual_engine, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -113,16 +118,30 @@ def __init__( def load_model(self) -> None: self.device = self.device_config.device - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - multimodal_config=self.multimodal_config, - lora_config=None, - ) + # NOTE(woosuk): While the executor assigns the TP ranks to the worker + # process, the ranks can be different from the ranks internally assigned + # by the xm runtime. Therefore, there is a mismatch in the rank + # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. 
+ # This is not a problem in linear layers because all-reduce is + # rank-agnostic. However, it matters for all-gather as the ranks + # determine the order of concatenating the output tensors. + # As a workaround, we use the xm's rank assignment only when loading + # the embedding weights. + xm_tp_rank = xm.get_ordinal() + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding." + "get_tensor_model_parallel_rank", + return_value=xm_tp_rank): + model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + parallel_config=self.parallel_config, + cache_config=self.cache_config, + scheduler_config=self.scheduler_config, + multimodal_config=self.multimodal_config, + lora_config=None, + ) model = model.eval() xm.wait_device_ops() @@ -463,10 +482,11 @@ def make_model_input_from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input + @torch.no_grad() def execute_model( self, model_input: ModelInputForTPU, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: Optional[List[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, ) -> List[SamplerOutput]: diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 03011e03058d8..c88aba7ae08cd 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -70,13 +70,13 @@ def __init__( def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" - self.device = xm.xla_device() - self.device_config.device = self.device torch.set_grad_enabled(False) torch.set_default_dtype(self.model_config.dtype) - # NOTE(woosuk): This is just a hack to initialize the TP group. - # This cannot perform the actual communication ops. + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. init_distributed_environment( world_size=self.parallel_config.world_size, rank=self.rank, @@ -88,6 +88,11 @@ def init_device(self) -> None: self.parallel_config.tensor_parallel_size, self.parallel_config.pipeline_parallel_size) + # Device initialization should happen after initializing the distributed + # runtime. + self.device = xm.xla_device() + self.device_config.device = self.device + # Set random seed. set_random_seed(self.model_config.seed) xm.set_rng_state(self.model_config.seed, self.device) @@ -200,8 +205,7 @@ def get_cache_block_size_bytes(self) -> int: @property def do_metadata_broadcast(self) -> bool: - # TODO(woosuk): Support TP. 
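The `with patch(...)` wrapper above does one narrow thing: only while the model object is being constructed does the vocab-parallel embedding see the rank reported by the XLA runtime instead of the one assigned by the gloo group. A self-contained illustration of that scoped-override pattern, with hypothetical helper names standing in for the vLLM internals:

from unittest.mock import patch


def get_rank():
    # Stand-in for the rank helper the loader normally consults
    # (assigned here by the CPU-side process group).
    return 0


def load_embedding():
    # Stand-in for weight loading that shards by rank.
    return f"shard-for-rank-{get_rank()}"


xm_rank = 3  # pretend the XLA runtime assigned a different ordinal

assert load_embedding() == "shard-for-rank-0"

# Inside the context manager, this call path (and only this call path)
# sees the XLA rank; the override is undone when the block exits.
with patch(f"{__name__}.get_rank", return_value=xm_rank):
    assert load_embedding() == "shard-for-rank-3"

assert load_embedding() == "shard-for-rank-0"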
- return False + return self.parallel_config.tensor_parallel_size > 1 @property def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: From c53041ae3b8ded4ac4c3fc745be6bc695b9f0c78 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 27 Jul 2024 05:47:33 +0100 Subject: [PATCH 34/45] [Doc] Add missing mock import to docs `conf.py` (#6834) --- .readthedocs.yaml | 1 + docs/source/conf.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 428e199088589..f1959ad2743f3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,7 @@ build: sphinx: configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: diff --git a/docs/source/conf.py b/docs/source/conf.py index f4cec05663fcd..b867bfd89dc17 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,6 +94,7 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ + "aiohttp", "cpuinfo", "torch", "transformers", @@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: } autodoc_preserve_defaults = True +autodoc_warningiserror = True navigation_with_keys = False From 593e79e7337f7fd9e92b7554dabdff96769dbf15 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 26 Jul 2024 23:15:20 -0600 Subject: [PATCH 35/45] [Bugfix] torch.set_num_threads() in multiproc_gpu_executor (#6802) [Bugfix] Use torch.set_num_threads() to configure parallelism in multiproc_gpu_executor (#6802) Signed-off-by: Travis Johnson --- vllm/executor/multiproc_gpu_executor.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 19f7a497cdd9f..e1e92958e667c 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -6,6 +6,8 @@ from functools import partial from typing import Any, List, Optional +import torch + from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker @@ -45,10 +47,23 @@ def _init_executor(self) -> None: # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU - # contention amongst the shards - if "OMP_NUM_THREADS" not in os.environ: - os.environ["OMP_NUM_THREADS"] = "1" + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. 
Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 if world_size > 1: From aa46953a20685377fc51dcde172114ddd7ffdc68 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:44:13 -0700 Subject: [PATCH 36/45] [Misc][VLM][Doc] Consolidate offline examples for vision language models (#6858) Co-authored-by: Cyrus Leung --- examples/fuyu_example.py | 31 ---- examples/llava_example.py | 25 --- examples/llava_next_example.py | 36 ---- examples/minicpmv_example.py | 55 ------ examples/offline_inference_vision_language.py | 174 ++++++++++++++++++ examples/paligemma_example.py | 25 --- examples/phi3v_example.py | 40 ---- 7 files changed, 174 insertions(+), 212 deletions(-) delete mode 100644 examples/fuyu_example.py delete mode 100644 examples/llava_example.py delete mode 100644 examples/llava_next_example.py delete mode 100644 examples/minicpmv_example.py create mode 100644 examples/offline_inference_vision_language.py delete mode 100644 examples/paligemma_example.py delete mode 100644 examples/phi3v_example.py diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py deleted file mode 100644 index c92b8fb4bc286..0000000000000 --- a/examples/fuyu_example.py +++ /dev/null @@ -1,31 +0,0 @@ -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_fuyu(): - llm = LLM(model="adept/fuyu-8b", max_model_len=4096) - - # single-image prompt - prompt = "What is the highest life expectancy at of male?\n" - url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png" - image = Image.open(requests.get(url, stream=True).raw) - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_fuyu() diff --git a/examples/llava_example.py b/examples/llava_example.py deleted file mode 100644 index 4c9eabd261e5c..0000000000000 --- a/examples/llava_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_llava(): - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_llava() diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py deleted file mode 100644 index fd53a6def1a13..0000000000000 --- a/examples/llava_next_example.py +++ /dev/null @@ -1,36 +0,0 @@ -from io import BytesIO - -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_llava_next(): - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096) - - prompt = "[INST] \nWhat is shown in this image? 
[/INST]" - url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" - image = Image.open(BytesIO(requests.get(url).content)) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=100) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - } - }, - sampling_params=sampling_params) - - generated_text = "" - for o in outputs: - generated_text += o.outputs[0].text - - print(f"LLM output:{generated_text}") - - -if __name__ == "__main__": - run_llava_next() diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py deleted file mode 100644 index bf20a7ea04ad4..0000000000000 --- a/examples/minicpmv_example.py +++ /dev/null @@ -1,55 +0,0 @@ -from transformers import AutoTokenizer - -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - -# 2.0 -# The official repo doesn't work yet, so we need to use a fork for now -# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -# MODEL_NAME = "HwwwH/MiniCPM-V-2" -# 2.5 -MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" - -image = ImageAsset("stop_sign").pil_image.convert("RGB") - -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) -llm = LLM(model=MODEL_NAME, - gpu_memory_utilization=1, - trust_remote_code=True, - max_model_len=4096) - -messages = [{ - 'role': - 'user', - 'content': - '(./)\n' + "What's the content of the image?" -}] -prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) -# 2.0 -# stop_token_ids = [tokenizer.eos_id] -# 2.5 -stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - -sampling_params = SamplingParams( - stop_token_ids=stop_token_ids, - # temperature=0.7, - # top_p=0.8, - # top_k=100, - # seed=3472, - max_tokens=1024, - # min_tokens=150, - temperature=0, - use_beam_search=True, - # length_penalty=1.2, - best_of=3) - -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - } -}, - sampling_params=sampling_params) -print(outputs[0].outputs[0].text) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py new file mode 100644 index 0000000000000..4a09f77ca59f8 --- /dev/null +++ b/examples/offline_inference_vision_language.py @@ -0,0 +1,174 @@ +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.utils import FlexibleArgumentParser + +# Input image and question +image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +question = "What is the content of this image?" 
+ + +# LLaVA-1.5 +def run_llava(question): + + prompt = f"USER: \n{question}\nASSISTANT:" + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + return llm, prompt + + +# LLaVA-1.6/LLaVA-NeXT +def run_llava_next(question): + + prompt = f"[INST] \n{question} [/INST]" + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") + + return llm, prompt + + +# Fuyu +def run_fuyu(question): + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b") + + return llm, prompt + + +# Phi-3-Vision +def run_phi3v(question): + + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 + # Note: The default setting of max_num_seqs (256) and + # max_model_len (128k) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # In this example, we override max_num_seqs to 5 while + # keeping the original context length of 128k. + llm = LLM( + model="microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + +# PaliGemma +def run_paligemma(question): + + prompt = question + llm = LLM(model="google/paligemma-3b-mix-224") + + return llm, prompt + + +# Chameleon +def run_chameleon(question): + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b") + return llm, prompt + + +# MiniCPM-V +def run_minicpmv(question): + + # 2.0 + # The official repo doesn't work yet, so we need to use a fork for now + # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa + # model_name = "HwwwH/MiniCPM-V-2" + + # 2.5 + model_name = "openbmb/MiniCPM-Llama3-V-2_5" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM( + model=model_name, + trust_remote_code=True, + ) + + messages = [{ + 'role': 'user', + 'content': f'(./)\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + return llm, prompt + + +model_example_map = { + "llava": run_llava, + "llava-next": run_llava_next, + "fuyu": run_fuyu, + "phi3_v": run_phi3v, + "paligemma": run_paligemma, + "chameleon": run_chameleon, + "minicpmv": run_minicpmv, +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + llm, prompt = model_example_map[model](question) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
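The consolidated example's organizing idea is a registry from the HuggingFace `model_type` string to a function returning the engine and a correctly formatted prompt for that model family. The same pattern in miniature, with model names returned as plain strings instead of constructing `LLM` objects:

def run_fuyu_prompt(question):
    return "adept/fuyu-8b", f"{question}\n"


def run_phi3v_prompt(question):
    return ("microsoft/Phi-3-vision-128k-instruct",
            f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n")


prompt_map = {"fuyu": run_fuyu_prompt, "phi3_v": run_phi3v_prompt}


def build(model_type, question):
    if model_type not in prompt_map:
        raise ValueError(f"Model type {model_type} is not supported.")
    return prompt_map[model_type](question)


print(build("fuyu", "What is the content of this image?"))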
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models') + args = parser.parse_args() + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + + args = parser.parse_args() + main(args) diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py deleted file mode 100644 index 92a3cb3ac4129..0000000000000 --- a/examples/paligemma_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_paligemma(): - llm = LLM(model="google/paligemma-3b-mix-224") - - prompt = "caption es" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_paligemma() diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py deleted file mode 100644 index ae8c38d84e8fd..0000000000000 --- a/examples/phi3v_example.py +++ /dev/null @@ -1,40 +0,0 @@ -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - - -def run_phi3v(): - model_path = "microsoft/Phi-3-vision-128k-instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (128k) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # In this example, we override max_num_seqs to 5 while - # keeping the original context length of 128k. 
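One easy-to-miss detail in `main` above: `llm.generate` is handed either a single inputs dict or a list of identical dicts depending on `--num-prompts`, and the image always travels under the "image" key of `multi_modal_data`. A schematic of the two shapes (no engine is constructed here, and a placeholder string stands in for the PIL image):

def build_inputs(prompt, image, num_prompts):
    assert num_prompts > 0
    single = {"prompt": prompt, "multi_modal_data": {"image": image}}
    if num_prompts == 1:
        return single                       # a single dict for one prompt
    return [dict(single) for _ in range(num_prompts)]  # or a list for a batch


batch = build_inputs("Question: What is this? Answer:", "<PIL image>", 3)
assert isinstance(batch, list) and len(batch) == 3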
- llm = LLM( - model=model_path, - trust_remote_code=True, - max_num_seqs=5, - ) - - image = ImageAsset("cherry_blossom").pil_image - - # single-image prompt - prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_phi3v() From 925de97e05dd4709fcd80691cb37da5e582c22e8 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 23:24:08 -0700 Subject: [PATCH 37/45] [Bugfix] Fix VLM example typo (#6859) --- examples/offline_inference_vision_language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4a09f77ca59f8..8a63653343db6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -158,7 +158,6 @@ def main(args): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models') - args = parser.parse_args() parser.add_argument('--model-type', '-m', type=str, From a57d75821c6177da75fdebf171d528eef5301961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Sat, 27 Jul 2024 17:07:02 +0800 Subject: [PATCH 38/45] [bugfix] make args.stream work (#6831) --- examples/api_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/api_client.py b/examples/api_client.py index 27a2a08b7b0c3..49a085febdc57 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -31,7 +31,10 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, headers=headers, json=pload, stream=True) + response = requests.post(api_url, + headers=headers, + json=pload, + stream=stream) return response From ecb33a28cb6c10ebf3b1aa139f72e759cacb8c15 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 27 Jul 2024 02:54:14 -0700 Subject: [PATCH 39/45] [CI/Build][Doc] Update CI and Doc for VLM example changes (#6860) --- .buildkite/test-pipeline.yaml | 3 +-- docs/source/models/vlm.rst | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5b4a786305e1f..be8807df0b098 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -140,14 +140,13 @@ steps: working_dir: "/vllm-workspace/examples" mirror_hardwares: [amd] commands: - # install aws cli for llava_example.py # install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - - python3 llava_example.py + - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - label: Inputs Test diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index ef4ce0d44a162..a385605c9f8f6 100644 --- a/docs/source/models/vlm.rst 
+++ b/docs/source/models/vlm.rst @@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI generated_text = o.outputs[0].text print(generated_text) -A code example can be found in `examples/llava_example.py `_. +A code example can be found in `examples/offline_inference_vision_language.py `_. Online OpenAI Vision API Compatible Inference From 1ad86acf1789650e2ff27586e36a8159d52755dd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 19:53:07 +0800 Subject: [PATCH 40/45] [Model] Initial support for BLIP-2 (#5920) Co-authored-by: ywang96 --- docs/source/models/supported_models.rst | 8 + examples/offline_inference_vision_language.py | 11 + examples/template_blip2.jinja | 11 + tests/models/test_blip2.py | 102 +++ tests/models/test_fuyu.py | 8 +- tests/models/test_minicpmv.py | 8 +- tests/models/test_phi3v.py | 8 +- vllm/model_executor/models/__init__.py | 6 +- vllm/model_executor/models/blip.py | 269 +++++++ vllm/model_executor/models/blip2.py | 669 ++++++++++++++++++ vllm/model_executor/models/opt.py | 17 +- vllm/multimodal/base.py | 11 +- 12 files changed, 1107 insertions(+), 21 deletions(-) create mode 100644 examples/template_blip2.jinja create mode 100644 tests/models/test_blip2.py create mode 100644 vllm/model_executor/models/blip.py create mode 100644 vllm/model_executor/models/blip2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 483f552bba238..83c1b9c8bce86 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo The following is the list of model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. +---- + Decoder-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: @@ -186,6 +188,10 @@ Vision Language Models - Models - Example HuggingFace Models - :ref:`LoRA ` + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - :code:`facebook/chameleon-7b` etc. @@ -215,6 +221,8 @@ Vision Language Models - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. - +---- + If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement support for your model. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 8a63653343db6..04ba1a96314c9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,16 @@ def run_minicpmv(question): return llm, prompt +# BLIP-2 +def run_blip2(question): + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
+ # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b") + return llm, prompt + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -114,6 +124,7 @@ def run_minicpmv(question): "paligemma": run_paligemma, "chameleon": run_chameleon, "minicpmv": run_minicpmv, + "blip-2": run_blip2, } diff --git a/examples/template_blip2.jinja b/examples/template_blip2.jinja new file mode 100644 index 0000000000000..fd41a7f7fa666 --- /dev/null +++ b/examples/template_blip2.jinja @@ -0,0 +1,11 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- 'Question: ' + message['content'] + ' ' -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Answer: ' + message['content'] + ' ' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- 'Answer:' -}} +{% endif %} diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py new file mode 100644 index 0000000000000..26afd57ae6106 --- /dev/null +++ b/tests/models/test_blip2.py @@ -0,0 +1,102 @@ +from typing import List, Optional, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "Question: What's the content of the image? Answer:", + "cherry_blossom": + "Question: What is the season? Answer:", +}) + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + _, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "\n" + + tokenizer = AutoTokenizer.from_pretrained(model) + hf_output_ids = tokenizer.encode(hf_output_str) + assert hf_output_ids[0] == tokenizer.bos_token_id + hf_output_ids = hf_output_ids[1:] + + return hf_output_ids, hf_output_str, out_logprobs + + +@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 25f63a3d64d0e..7d0f3be5ea008 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -77,8 +77,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model: @@ -89,9 +89,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 9124fa7a6238c..c57f0f8c08548 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -88,9 +88,9 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images, + images=images, stop_token_ids=stop_token_ids) - for prompts, vllm_images in inputs_per_image + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): @@ -114,9 +114,9 @@ def to(self, device: torch.types.Device): hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, tokenizer=tokenizer) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 9da25ab8d78fe..35ffe4ef50a85 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -101,8 +101,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] # use eager mode for hf runner, since phi3_v didn't work with flash_attn @@ -114,9 +114,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for 
hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index ead64c0e92553..fe04c6db5fbc2 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,6 +16,8 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "Blip2ForConditionalGeneration": + ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), @@ -56,8 +58,8 @@ "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), - "PaliGemmaForConditionalGeneration": - ("paligemma", "PaliGemmaForConditionalGeneration"), + "PaliGemmaForConditionalGeneration": ("paligemma", + "PaliGemmaForConditionalGeneration"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py new file mode 100644 index 0000000000000..0b124d5e8a85a --- /dev/null +++ b/vllm/model_executor/models/blip.py @@ -0,0 +1,269 @@ +"""Minimal implementation of BlipVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Union + +import torch +import torch.nn as nn +from PIL import Image +from transformers import Blip2VisionConfig, BlipVisionConfig +from transformers.models.blip.modeling_blip import BlipAttention + +from vllm.config import ModelConfig +from vllm.inputs import LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) +from vllm.sequence import SequenceData + + +def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_blip_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_blip_image_feature_size( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_blip_image_tokens( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_image_feature_size(hf_config) + + +def dummy_seq_data_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + *, + image_width_override: Optional[int] = None, + 
image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def input_processor_for_blip( + model_config: ModelConfig, + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + llm_inputs: LLMInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=image_token_id, + repeat_count=image_feature_size, + ) + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa +class BlipVisionEmbeddings(nn.Module): + + def __init__(self, config: BlipVisionConfig): + super().__init__() + + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = get_blip_num_patches(image_size=self.image_size, + patch_size=self.patch_size) + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + position_embeds = self.position_embedding.to(target_dtype) + embeddings = embeddings + position_embeds[:, :embeddings.size(1), :] + + return embeddings + + +class BlipMLP(nn.Module): + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.config = config + + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class BlipEncoderLayer(nn.Module): + + def __init__(self, + config: BlipVisionConfig, 
+ quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = BlipMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`BlipEncoderLayer`]. + + Args: + config: BlipConfig + """ + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + BlipEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class BlipVisionModel(nn.Module): + config_class = BlipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + self.post_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + hidden_states = self.embeddings(pixel_values) + hidden_states = self.encoder(inputs_embeds=hidden_states) + + return self.post_layernorm(hidden_states) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py new file mode 100644 index 0000000000000..e00e6c0806957 --- /dev/null +++ b/vllm/model_executor/models/blip2.py @@ -0,0 +1,669 @@ +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, + apply_chunking_to_forward) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.opt import OPTModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from 
vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +from .blip import (BlipVisionModel, dummy_image_for_blip, + get_max_blip_image_tokens) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +class Blip2QFormerMultiHeadAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.config = config + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of " + f"the number of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = (config.hidden_size // + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + kv_hidden_size = config.encoder_hidden_size + else: + kv_hidden_size = config.hidden_size + self.key = nn.Linear(kv_hidden_size, self.all_head_size) + self.value = nn.Linear(kv_hidden_size, self.all_head_size) + + self.position_embedding_type = getattr(config, + "position_embedding_type", + "absolute") + if self.position_embedding_type != "absolute": + raise NotImplementedError("Unsupported position_embedding_type: " + f"{self.position_embedding_type}") + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + x = x.view(*x.size()[:-1], self.num_attention_heads, + self.attention_head_size) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ): + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_probs = torch.softmax(attention_scores * self.scaling, + dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + context_layer = context_layer.view(*context_layer.size()[:-2], + self.all_head_size) + + return context_layer + + +class Blip2QFormerSelfOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.attention = Blip2QFormerMultiHeadAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=is_cross_attention, + ) + + self.output = Blip2QFormerSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.Tensor]: + self_output = self.attention( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + attention_output = self.output(self_output, hidden_states) + + return attention_output + + +class Blip2QFormerIntermediate(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_act_fn(config.hidden_act) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class Blip2QFormerOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + layer_idx: int, + ) -> None: + super().__init__() + + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config, + quant_config=quant_config, + cache_config=cache_config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + 
self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ): + attention_output = self.attention(hidden_states) + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + query_attention_output = self.crossattention( + query_attention_output, + encoder_hidden_states=encoder_hidden_states, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], + dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + + return layer_output + + def feed_forward_chunk(self, + attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query( + self, attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layer = nn.ModuleList([ + Blip2QFormerLayer(config, + quant_config=quant_config, + cache_config=cache_config, + layer_idx=layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ) -> torch.Tensor: + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + + hidden_states = layer_module( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return hidden_states + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025 +class Blip2QFormerModel(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config, + quant_config=quant_config, + cache_config=cache_config) + + def forward( + self, + query_embeds: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + ) -> torch.Tensor: + query_length = query_embeds.shape[1] + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + sequence_output = self.encoder( + embedding_output, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return sequence_output + + +class 
Blip2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +Blip2ImageInputs = Blip2ImagePixelInputs + +# We use this internally as placeholders since there is no image token +# defined on the HuggingFace repo +BLIP2_IMAGE_TOKEN = "" +BLIP2_IMAGE_TOKEN_ID = 50265 + + +def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: + return hf_config.num_query_tokens + + +def get_max_blip2_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + if isinstance(vision_config, Blip2VisionConfig): + return get_max_blip_image_tokens(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_blip2(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + image_feature_size = get_blip2_image_feature_size(hf_config) + token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + seq_data = SequenceData(token_ids) + + if isinstance(vision_config, Blip2VisionConfig): + mm_data = dummy_image_for_blip(vision_config) + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def input_processor_for_blip2(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + hf_config = ctx.get_hf_config(Blip2Config) + image_feature_size = get_blip2_image_feature_size(hf_config) + + # The original model places image tokens at the front + # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 + new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + new_token_ids += llm_inputs["prompt_token_ids"] + + new_prompt = llm_inputs.get("prompt") + if new_prompt is not None: + new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) +@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +class Blip2ForConditionalGeneration(nn.Module, SupportsVision): + + def __init__(self, + config: Blip2Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + # TODO: Optionally initializes this for supporting embeddings. 
+ self.vision_model = BlipVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter( + torch.zeros(1, config.num_query_tokens, + config.qformer_config.hidden_size)) + + self.qformer = Blip2QFormerModel(config.qformer_config, + cache_config=cache_config, + quant_config=quant_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + self.quant_config = quant_config + + self.language_model = OPTModel(config.text_config, cache_config, + quant_config) + + self.unpadded_vocab_size = config.text_config.vocab_size + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size) + self.sampler = Sampler() + + def get_lm_head(self): + return self.language_model.decoder.embed_tokens + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Blip2ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return Blip2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def _image_pixels_to_features(self, vision_model: BlipVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_model(pixel_values) + + return image_features + + def _process_image_pixels(self, + inputs: Blip2ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_model, pixel_values) + + def _process_image_input(self, + image_input: Blip2ImageInputs) -> torch.Tensor: + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, + -1) + query_output = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_features, + ) + + return self.language_projection(query_output) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + """Run forward pass for BLIP-2. + + One key thing to understand is the `input_ids` already accounts for the + positions of the to-be-inserted image embeddings. + + Concretely, consider a text prompt: + `"Question: What's the content of the image? Answer:"`. + + Tokenizer outputs: + `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`. + + To reserve space in KV cache, we have to insert placeholder tokens + before they are inputted to the model, so the input processor prepends + dummy tokens (denoted as `50265`), resulting in: + `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`. 
+ + We insert 32 tokens since it corresponds to the number of query + embeddings outputted by the Q-Former and inputted to the language model. + + This way, the `positions` and `attn_metadata` are consistent + with the `input_ids`. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: The pixels in each input image. + + See also: + :class:`Blip2ImageInputs` + """ + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vision_embeddings, + BLIP2_IMAGE_TOKEN_ID) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.get_lm_head(), hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # only doing this for language model part for now. + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + if "lm_head.weight" in name: + continue + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index edc16710c0229..a05090cd46648 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -237,14 +237,19 @@ def __init__( for _ in range(config.num_hidden_layers) ]) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) pos_embeds = self.embed_positions(positions) if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) @@ -272,14 +277,22 @@ def __init__( super().__init__() self.decoder = OPTDecoder(config, cache_config, quant_config) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.decoder.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.decoder(input_ids, positions, kv_caches, attn_metadata) + return self.decoder(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) class OPTForCausalLM(nn.Module): diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 0d435bd644e29..5abd0ad61cdf9 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,8 +1,9 @@ import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union, cast) +from typing import Any, Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import Type, TypedDict, TypeVar, Union, cast import torch import torch.types @@ -15,13 +16,13 @@ logger = init_logger(__name__) -NestedTensors = Union[List[torch.Tensor], torch.Tensor] +NestedTensors = Union[GenericSequence[torch.Tensor], torch.Tensor] """ Use a list instead of a tensor if the dimensions of each element do not match. Currently only supports up to singly nested list of tensors. 
""" -BatchedTensors = Union[List[NestedTensors], NestedTensors] +BatchedTensors = Union[GenericSequence[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched tensor; otherwise, this is a list of :class:`NestedTensors` with one element @@ -53,7 +54,7 @@ def try_concat( # may be list rather than tensors if isinstance(tensors[0], list): return [[t.to(device=device) for t in tensor[0]] - for tensor in tensors] + for tensor in cast(List[List[torch.Tensor]], tensors)] tensors_ = cast(List[torch.Tensor], tensors) From f954d0715c8b68e780aac4a4f3ffd1ab56bebfcd Mon Sep 17 00:00:00 2001 From: Chenggang Wu Date: Sat, 27 Jul 2024 09:24:46 -0700 Subject: [PATCH 41/45] [Docs] Add RunLLM chat widget (#6857) --- docs/source/_static/custom.js | 16 ++++++++++++++++ docs/source/conf.py | 2 ++ 2 files changed, 18 insertions(+) create mode 100644 docs/source/_static/custom.js diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js new file mode 100644 index 0000000000000..f475be71fc448 --- /dev/null +++ b/docs/source/_static/custom.js @@ -0,0 +1,16 @@ +document.addEventListener("DOMContentLoaded", function () { + var script = document.createElement("script"); + script.type = "module"; + script.id = "runllm-widget-script" + + script.src = "https://widget.runllm.com"; + + script.setAttribute("version", "stable"); + script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. + script.setAttribute("runllm-name", "vLLM"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-assistant-id", "207"); + + script.async = true; + document.head.appendChild(script); + }); \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b867bfd89dc17..1093b30bca11d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,6 +68,8 @@ 'use_repository_button': True, 'use_edit_page_button': True, } +html_static_path = ["_static"] +html_js_files = ["custom.js"] # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') From fad5576c58864a6c2cf528f67e60e03a949b3dac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 27 Jul 2024 10:28:33 -0700 Subject: [PATCH 42/45] [TPU] Reduce compilation time & Upgrade PyTorch XLA version (#6856) --- Dockerfile.tpu | 2 +- docs/source/getting_started/tpu-installation.rst | 9 ++++++++- vllm/attention/backends/pallas.py | 1 - .../device_communicators/tpu_communicator.py | 3 ++- vllm/worker/tpu_model_runner.py | 15 +++++++++++++-- vllm/worker/tpu_worker.py | 1 - 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index be7dbe63cb237..4fc14d6bd186c 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240713" +ARG NIGHTLY_DATE="20240726" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 5e2f514a4a509..2e6c522422c22 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -56,7 +56,7 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. 
- $ export DATE="+20240713" + $ export DATE="+20240726" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl @@ -75,6 +75,13 @@ Next, build vLLM from source. This will only take a few seconds: $ VLLM_TARGET_DEVICE="tpu" python setup.py develop +.. note:: + + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. + The compilation time may take 20~30 minutes in the first run. + However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + + .. tip:: If you encounter the following error: diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index c53a2f91b89d7..2269ac2606e89 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -3,7 +3,6 @@ import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. -import torch_xla.experimental.dynamo_set_buffer_donor from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 69a9a516f3ebe..16525887cf4eb 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -6,6 +6,7 @@ if current_platform.is_tpu(): import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr from torch_xla._internal import pjrt @@ -20,7 +21,7 @@ def __init__(self, group: ProcessGroup): local_rank = dist.get_rank(group) world_size = dist.get_world_size(group) pjrt.initialize_multiprocess(local_rank, world_size) - xm._init_world_size_ordinal() + xr._init_world_size_ordinal() def all_reduce(self, x: torch.Tensor) -> torch.Tensor: return xm.all_reduce(xm.REDUCE_SUM, x) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index e5bb101fc7df4..1692094af8c41 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn import torch_xla.core.xla_model as xm +import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, @@ -127,7 +128,7 @@ def load_model(self) -> None: # determine the order of concatenating the output tensors. # As a workaround, we use the xm's rank assignment only when loading # the embedding weights. - xm_tp_rank = xm.get_ordinal() + xm_tp_rank = xr.global_ordinal() with patch( "vllm.model_executor.layers.vocab_parallel_embedding." "get_tensor_model_parallel_rank", @@ -146,7 +147,17 @@ def load_model(self) -> None: xm.wait_device_ops() model = ModelWrapper(model) - self.model = torch.compile(model, backend="openxla", fullgraph=True) + # NOTE(woosuk): There are two stages of compilation: torch.compile and + # XLA compilation. Setting dynamic=True can reduce the torch.compile + # overhead by reusing the FX graph for different shapes. + # However, the XLA graph will still require static shapes and needs to + # be re-compiled for every different shapes. 
This overhead is inevitable + # in the first run, but can be skipped afterwards as we cache the XLA + # graphs in the disk (VLLM_XLA_CACHE_PATH). + self.model = torch.compile(model, + backend="openxla", + fullgraph=True, + dynamic=True) def _dummy_run( self, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index c88aba7ae08cd..17fa5c35457c2 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -3,7 +3,6 @@ import torch import torch_xla.core.xla_model as xm -import torch_xla.experimental.dynamo_set_buffer_donor # noqa: F401 import torch_xla.runtime as xr import vllm.envs as envs From 75acdaa4b616c2e95c55a47d3158ceec9c72c503 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sat, 27 Jul 2024 17:52:33 -0400 Subject: [PATCH 43/45] [Kernel] Increase precision of GPTQ/AWQ Marlin kernel (#6795) --- benchmarks/kernels/benchmark_marlin.py | 23 ++- csrc/ops.h | 3 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 150 ++++++++++++++---- tests/kernels/test_marlin_gemm.py | 13 +- vllm/_custom_ops.py | 6 +- .../layers/quantization/utils/marlin_utils.py | 17 +- 6 files changed, 168 insertions(+), 44 deletions(-) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 3da4cecd7eeff..684985b81f690 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -10,7 +10,7 @@ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) + MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace, marlin_quantize) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( @@ -56,6 +56,8 @@ def bench_run(results: List[benchmark.Measurement], model: str, (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = marlin_24_quantize(b, num_bits, group_size) + marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) + # GPTQ quant (w_ref, q_w, s, g_idx, rand_perm) = quantize_weights(b, num_bits, group_size, act_order) @@ -87,6 +89,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, + "marlin_zp": marlin_zp, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, "marlin_rand_perm": marlin_rand_perm, @@ -125,11 +128,21 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp16", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 globals=globals, label=label, 
sub_label=sub_label, - description="gptq_marlin_gemm", + description="gptq_marlin_gemm_fp32", ).blocked_autorange(min_run_time=min_run_time)) if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS @@ -183,12 +196,12 @@ def main(args): ) > 0 and is_k_full not in args.limit_k_full: continue - for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS: + for num_bits in MARLIN_SUPPORTED_NUM_BITS: if len(args.limit_num_bits ) > 0 and num_bits not in args.limit_num_bits: continue - for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES: + for group_size in MARLIN_SUPPORTED_GROUP_SIZES: if len( args.limit_group_size ) > 0 and group_size not in args.limit_group_size: diff --git a/csrc/ops.h b/csrc/ops.h index 9ef1fcb465bf3..f075850248d1c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,7 +93,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp); + bool is_k_full, bool has_zp, + bool use_fp32_reduce); torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 122c5c16b58ce..36ae2bfafa7c2 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -59,14 +59,16 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use fp32 global reduce ) {} } // namespace gptq_marlin @@ -532,16 +534,18 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use 
fp32 global reduce ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -595,6 +599,8 @@ __global__ void Marlin( int slice_idx; // index of threadblock in current slice; numbered bottom to // top + int par_id = 0; + // We can easily implement parallel problem execution by just remapping // indices and advancing global pointers if (slice_col_par >= n_tiles) { @@ -602,6 +608,7 @@ __global__ void Marlin( C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; locks += (slice_col_par / n_tiles) * n_tiles; slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; } // Compute all information about the current slice which is required for @@ -632,6 +639,7 @@ __global__ void Marlin( C += 16 * thread_m_blocks * prob_n / 8; locks += n_tiles; slice_col = 0; + par_id++; } }; init_slice(); @@ -1321,7 +1329,7 @@ __global__ void Marlin( // finally have to globally reduce over the results. As the striped // partitioning minimizes the number of such reductions and our outputs are // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { // We are very careful here to reduce directly in the output buffer to // maximize L2 cache utilization in this step. To do this, we write out // results in FP16 (but still reduce with FP32 compute). @@ -1382,6 +1390,53 @@ __global__ void Marlin( } }; + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. + auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + int par_offset = c_size * n_tiles * par_id; + int slice_offset = c_size * slice_col; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = par_offset + slice_offset; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + sh[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. 
@@ -1606,7 +1661,11 @@ __global__ void Marlin( if (slice_count > 1) { // only globally reduce if there is more than one // block in a slice barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } barrier_release(&locks[slice_col], last); } if (last) // only the last block in a slice actually writes the result @@ -1661,8 +1720,8 @@ __global__ void Marlin( THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \ HAS_ZP, GROUP_BLOCKS> \ <<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, zp_ptr, g_idx_ptr, num_groups, \ - prob_m, prob_n, prob_k, locks); \ + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ + num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ } typedef struct { @@ -1801,6 +1860,27 @@ bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, return true; } +int determine_reduce_max_m(int prob_m, int max_par) { + constexpr int tile_m_size = 16; + + if (prob_m <= tile_m_size) { + return tile_m_size; + + } else if (prob_m <= tile_m_size * 2) { + return tile_m_size * 2; + + } else if (prob_m <= tile_m_size * 3) { + return tile_m_size * 3; + + } else if (prob_m <= tile_m_size * 4) { + return tile_m_size * 4; + + } else { + int cur_par = min(div_ceil(prob_m, tile_m_size * 4), max_par); + return tile_m_size * 4 * cur_par; + } +} + exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, @@ -1880,13 +1960,13 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) template -void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, - void* g_idx, void* perm, void* a_tmp, int prob_m, - int prob_n, int prob_k, void* workspace, int num_bits, - bool has_act_order, bool is_k_full, bool has_zp, - int num_groups, int group_size, int dev, +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* C_tmp, + void* s, void* zp, void* g_idx, void* perm, void* a_tmp, + int prob_m, int prob_n, int prob_k, void* workspace, + int num_bits, bool has_act_order, bool is_k_full, + bool has_zp, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, int sms, - int max_par) { + int max_par, bool use_fp32_reduce) { TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits); TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, @@ -1970,6 +2050,7 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, const int4* A_ptr = (const int4*)A; const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; const int4* s_ptr = (const int4*)s; const int4* zp_ptr = (const int4*)zp; const int* g_idx_ptr = (const int*)g_idx; @@ -2049,7 +2130,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp) { + bool is_k_full, bool has_zp, + bool use_fp32_reduce) { // Verify num_bits TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. 
Got = ", num_bits); @@ -2099,6 +2181,17 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor c = torch::empty({size_m, size_n}, options); torch::Tensor a_tmp = torch::empty({size_m, size_k}, options); + // Alloc C tmp buffer that is going to be used for the global reduce + int reduce_max_m = marlin::determine_reduce_max_m(size_m, marlin::max_par); + int reduce_n = size_n; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (!use_fp32_reduce) { + reduce_max_m = 0; + reduce_n = 0; + } + torch::Tensor c_tmp = torch::empty({reduce_max_m, reduce_n}, options_fp32); + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as // auto -1) int thread_k = -1; @@ -2171,20 +2264,21 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, if (a.scalar_type() == at::ScalarType::Half) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), - perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, + c_tmp.data_ptr(), b_scales.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else if (a.scalar_type() == at::ScalarType::BFloat16) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), b_scales.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), size_m, size_n, size_k, + c.data_ptr(), c_tmp.data_ptr(), + b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 42087fdcce959..bd35ef2eb2552 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -27,6 +27,7 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] +USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] MARLIN_N_CHUNKS = [64, 128, 256] @@ -175,6 +176,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @pytest.mark.parametrize("is_k_full", K_FULL_OPTS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_gptq_marlin_gemm( k_chunk, n_chunk, @@ -183,6 +185,7 @@ def test_gptq_marlin_gemm( mnk_factors, act_order, is_k_full, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -222,8 +225,9 @@ def test_gptq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, + is_k_full=is_k_full, has_zp=False, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) @@ -365,12 +369,14 @@ def test_fp8_marlin_gemm( @pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS) @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", 
MNK_FACTORS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_awq_marlin_gemm( k_chunk, n_chunk, num_bits, group_size, mnk_factors, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -407,8 +413,9 @@ def test_awq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, - has_zp, + is_k_full=is_k_full, + has_zp=has_zp, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0186594656cc1..ad9f01be6ddd4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -286,12 +286,12 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, b_zeros: torch.Tensor, g_idx: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, - size_n: int, size_k: int, is_k_full: bool, - has_zp: bool) -> torch.Tensor: + size_n: int, size_k: int, is_k_full: bool, has_zp: bool, + use_fp32_reduce: bool) -> torch.Tensor: return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros, g_idx, perm, workspace, num_bits, size_m, size_n, size_k, is_k_full, - has_zp) + has_zp, use_fp32_reduce) # fp8 marlin diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 25a7cd7bde653..b789ca20cadb3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -16,6 +16,11 @@ MARLIN_SUPPORTED_NUM_BITS = [4, 8] MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +# In case there is a performance issue with Marlin, the variable below can be +# changed to False, which allows Marlin to perform global reductions in fp16 +# precision (instead of fp32), and therefore, save on some memory movements. 
+USE_FP32_REDUCE_DEFAULT = True + def _check_marlin_supported(num_bits: int, group_size: int, is_sym: bool, min_capability: Optional[int], @@ -244,7 +249,8 @@ def apply_gptq_marlin_linear( output_size_per_partition: int, input_size_per_partition: int, is_k_full: bool, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -260,7 +266,8 @@ def apply_gptq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=is_k_full, - has_zp=False) + has_zp=False, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add @@ -279,7 +286,8 @@ def apply_awq_marlin_linear( num_bits: int, output_size_per_partition: int, input_size_per_partition: int, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -295,7 +303,8 @@ def apply_awq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=True, - has_zp=True) + has_zp=True, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add From b1366a953498fde9c5e7ab91915367ebc69008b2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 27 Jul 2024 18:05:17 -0400 Subject: [PATCH 44/45] Add Nemotron to PP_SUPPORTED_MODELS (#6863) --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index 92fde449b43fd..e7b54e04b00d5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -39,6 +39,7 @@ "Phi3ForCausalLM", "GPT2LMHeadModel", "MixtralForCausalLM", + "NemotronForCausalLM", ] From 3eeb148f467e3619e8890b1a5ebe86a173f91bc9 Mon Sep 17 00:00:00 2001 From: Elsa Granger <6374697+zeyugao@users.noreply.github.com> Date: Sun, 28 Jul 2024 23:13:49 +0800 Subject: [PATCH 45/45] [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871) --- .../layers/quantization/fbgemm_fp8.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 5e8d1f1947421..e7c3859967c71 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -9,6 +9,7 @@ UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: FBGEMMFp8Config): self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() def create_weights( self, @@ -139,11 +141,12 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) - return apply_fp8_linear(input=x, - weight=layer.weight, - weight_scale=layer.weight_scale, - input_scale=None, - input_scale_ub=layer.input_scale_ub, - bias=bias, - cutlass_fp8_supported=True, - 
use_per_token_if_dynamic=True) + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + input_scale_ub=layer.input_scale_ub, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported, + use_per_token_if_dynamic=True)
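
A closing illustration of the pattern in the last commit above: instead of hard-coding `cutlass_fp8_supported=True` at the call site, the probe now runs once in `FBGEMMFp8LinearMethod.__init__` and the cached result is passed to `apply_fp8_linear` on every forward call. The sketch below is illustrative only and not part of the patches; `probe_cutlass_fp8` is a made-up stand-in for vLLM's `cutlass_fp8_supported()` helper, and the class is stripped down to the caching pattern itself.

def probe_cutlass_fp8() -> bool:
    # Hypothetical stand-in for cutlass_fp8_supported(); the real helper
    # inspects the GPU compute capability. Hard-coded here so the sketch
    # runs anywhere.
    return False


class Fp8LinearMethodSketch:
    """Minimal sketch of caching a hardware probe at construction time."""

    def __init__(self) -> None:
        # Probe once per linear-method instance rather than assuming
        # support unconditionally on every apply() call (the bug the
        # commit fixes).
        self.cutlass_fp8_supported = probe_cutlass_fp8()

    def apply(self, x):
        # Each forward call reuses the cached flag to pick the GEMM path.
        backend = "cutlass" if self.cutlass_fp8_supported else "fallback"
        return backend, x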